{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0036764705882355, "eval_steps": 500, "global_step": 2451, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 365.90625, "completions/mean_terminated_length": 365.90625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.41505885124206543, "epoch": 0.0012254901960784314, "frac_reward_zero_std": 0.25, "grad_norm": 1.1695018623555844, "kl": 0.0, "learning_rate": 0.0, "loss": 0.091, "num_tokens": 39210.0, "reward": 0.8125, "reward_std": 0.4973389506340027, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.7420127391815186, "sampling/importance_sampling_ratio/mean": 1.0003938674926758, "sampling/importance_sampling_ratio/min": 0.30550429224967957, "sampling/sampling_logp_difference/max": 1.1857914924621582, "sampling/sampling_logp_difference/mean": 0.014100993052124977, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 375.5625, "completions/mean_terminated_length": 375.5625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.4766429364681244, "epoch": 0.0024509803921568627, "frac_reward_zero_std": 0.25, "grad_norm": 0.9978308836306047, "kl": 0.0, "learning_rate": 4.065040650406504e-09, "loss": -0.0501, "num_tokens": 79438.0, "reward": 0.40625, "reward_std": 0.5959457159042358, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000145673751831, "sampling/importance_sampling_ratio/min": 0.6484100818634033, "sampling/sampling_logp_difference/max": 0.7369050979614258, "sampling/sampling_logp_difference/mean": 0.01617528311908245, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 340.875, "completions/mean_terminated_length": 340.875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.6024458408355713, "epoch": 0.003676470588235294, "frac_reward_zero_std": 0.25, "grad_norm": 1.0467773437449983, "kl": 0.000302223110338673, "learning_rate": 8.130081300813008e-09, "loss": 0.002, "num_tokens": 119446.0, "reward": 0.625, "reward_std": 0.6645200252532959, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002312660217285, "sampling/importance_sampling_ratio/min": 0.18051205575466156, "sampling/sampling_logp_difference/max": 1.7119576930999756, "sampling/sampling_logp_difference/mean": 0.017202120274305344, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1889.0, "completions/max_terminated_length": 1889.0, "completions/mean_length": 443.796875, "completions/mean_terminated_length": 443.796875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.40364548563957214, "epoch": 0.004901960784313725, "frac_reward_zero_std": 0.75, "grad_norm": 0.4955964014238433, "kl": 0.00025927514070644975, "learning_rate": 1.2195121951219512e-08, "loss": -0.043, "num_tokens": 166633.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5494352579116821, "sampling/importance_sampling_ratio/mean": 0.9999737739562988, "sampling/importance_sampling_ratio/min": 0.6569480895996094, "sampling/sampling_logp_difference/max": 0.43789052963256836, "sampling/sampling_logp_difference/mean": 0.012304041534662247, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 915.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 353.453125, "completions/mean_terminated_length": 353.453125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.44968217611312866, "epoch": 0.006127450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.5441219928142339, "kl": 0.0003811737697105855, "learning_rate": 1.6260162601626016e-08, "loss": 0.0123, "num_tokens": 214502.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6272985935211182, "sampling/importance_sampling_ratio/mean": 0.9998967051506042, "sampling/importance_sampling_ratio/min": 0.6297216415405273, "sampling/sampling_logp_difference/max": 0.4869213104248047, "sampling/sampling_logp_difference/mean": 0.015574390068650246, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 266.265625, "completions/mean_terminated_length": 266.265625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.5314531922340393, "epoch": 0.007352941176470588, "frac_reward_zero_std": 0.25, "grad_norm": 1.288896770569737, "kl": 0.00046820472925901413, "learning_rate": 2.032520325203252e-08, "loss": -0.0298, "num_tokens": 246967.0, "reward": 0.1875, "reward_std": 0.6285127401351929, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.4558207988739014, "sampling/importance_sampling_ratio/mean": 1.0000114440917969, "sampling/importance_sampling_ratio/min": 0.6369333267211914, "sampling/sampling_logp_difference/max": 0.45109033584594727, "sampling/sampling_logp_difference/mean": 0.01772291772067547, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 388.296875, "completions/mean_terminated_length": 388.296875, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "entropy": 0.3720030188560486, "epoch": 0.00857843137254902, "frac_reward_zero_std": 0.5, "grad_norm": 0.8583965733323762, "kl": 0.00039064735756255686, "learning_rate": 2.4390243902439023e-08, "loss": 0.0463, "num_tokens": 290554.0, "reward": 0.625, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.4644094705581665, "sampling/importance_sampling_ratio/mean": 0.9999487996101379, "sampling/importance_sampling_ratio/min": 0.6021889448165894, "sampling/sampling_logp_difference/max": 0.5071840286254883, "sampling/sampling_logp_difference/mean": 0.012569723650813103, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 321.265625, "completions/mean_terminated_length": 321.265625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.3708134889602661, "epoch": 0.00980392156862745, "frac_reward_zero_std": 0.75, "grad_norm": 0.5071176558735883, "kl": 0.00030992773827165365, "learning_rate": 2.8455284552845527e-08, "loss": 0.0025, "num_tokens": 328763.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3715956211090088, "sampling/importance_sampling_ratio/mean": 0.9994015693664551, "sampling/importance_sampling_ratio/min": 0.5260623097419739, "sampling/sampling_logp_difference/max": 0.6423356533050537, "sampling/sampling_logp_difference/mean": 0.012563306838274002, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 322.90625, "completions/mean_terminated_length": 322.90625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.4355572462081909, "epoch": 0.011029411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.9246240716311902, "kl": 0.0002924255095422268, "learning_rate": 3.252032520325203e-08, "loss": 0.0228, "num_tokens": 371557.0, "reward": 0.375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.5422954559326172, "sampling/importance_sampling_ratio/mean": 1.0002633333206177, "sampling/importance_sampling_ratio/min": 0.6401677131652832, "sampling/sampling_logp_difference/max": 0.44602513313293457, "sampling/sampling_logp_difference/mean": 0.01383017748594284, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1486.0, "completions/max_terminated_length": 1486.0, "completions/mean_length": 469.609375, "completions/mean_terminated_length": 469.609375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.4678601026535034, "epoch": 0.012254901960784314, "frac_reward_zero_std": 0.5, "grad_norm": 0.7757133257618328, "kl": 0.00034989556297659874, "learning_rate": 3.658536585365853e-08, "loss": 0.0024, "num_tokens": 422220.0, "reward": -0.34375, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": -0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999489784240723, "sampling/importance_sampling_ratio/min": 0.5654895901679993, "sampling/sampling_logp_difference/max": 0.727350115776062, "sampling/sampling_logp_difference/mean": 0.015067042782902718, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 338.609375, "completions/mean_terminated_length": 338.609375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.5678433775901794, "epoch": 0.013480392156862746, "frac_reward_zero_std": 0.25, "grad_norm": 1.0817897136801518, "kl": 0.0004554027400445193, "learning_rate": 4.065040650406504e-08, "loss": 0.0025, "num_tokens": 460643.0, "reward": 0.4375, "reward_std": 0.6494960784912109, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.580706238746643, "sampling/importance_sampling_ratio/mean": 0.9996622800827026, "sampling/importance_sampling_ratio/min": 0.5127862095832825, "sampling/sampling_logp_difference/max": 0.6678962707519531, "sampling/sampling_logp_difference/mean": 0.01801103539764881, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 349.078125, "completions/mean_terminated_length": 349.078125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.37111133337020874, "epoch": 0.014705882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.7987946382256754, "kl": 0.0003580403863452375, "learning_rate": 4.4715447154471546e-08, "loss": -0.0024, "num_tokens": 499880.0, "reward": 0.1875, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.499313473701477, "sampling/importance_sampling_ratio/mean": 0.9994770884513855, "sampling/importance_sampling_ratio/min": 0.6194482445716858, "sampling/sampling_logp_difference/max": 0.4789261817932129, "sampling/sampling_logp_difference/mean": 0.013263946399092674, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 279.421875, "completions/mean_terminated_length": 279.421875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.4052412509918213, "epoch": 0.015931372549019607, "frac_reward_zero_std": 0.5, "grad_norm": 0.8491332923548865, "kl": 0.0003585988888517022, "learning_rate": 4.878048780487805e-08, "loss": 0.0118, "num_tokens": 532131.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4391707181930542, "sampling/importance_sampling_ratio/mean": 1.0003132820129395, "sampling/importance_sampling_ratio/min": 0.6120945811271667, "sampling/sampling_logp_difference/max": 0.49086856842041016, "sampling/sampling_logp_difference/mean": 0.014977063983678818, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 282.921875, "completions/mean_terminated_length": 282.921875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.40877875685691833, "epoch": 0.01715686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.001580207973435736, "kl": 0.00041365064680576324, "learning_rate": 5.2845528455284554e-08, "loss": 0.0, "num_tokens": 565246.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4872089624404907, "sampling/importance_sampling_ratio/mean": 0.9998214244842529, "sampling/importance_sampling_ratio/min": 0.5823137760162354, "sampling/sampling_logp_difference/max": 0.5407458543777466, "sampling/sampling_logp_difference/mean": 0.01487233117222786, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 301.1875, "completions/mean_terminated_length": 301.1875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.30358201265335083, "epoch": 0.01838235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012999925330394548, "kl": 0.00037317664828151464, "learning_rate": 5.6910569105691055e-08, "loss": 0.0, "num_tokens": 602074.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6076985597610474, "sampling/importance_sampling_ratio/mean": 0.9999232292175293, "sampling/importance_sampling_ratio/min": 0.6222097277641296, "sampling/sampling_logp_difference/max": 0.4748036861419678, "sampling/sampling_logp_difference/mean": 0.012510273605585098, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 805.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 379.34375, "completions/mean_terminated_length": 379.34375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.4808722734451294, "epoch": 0.0196078431372549, "frac_reward_zero_std": 0.5, "grad_norm": 0.9026597121369546, "kl": 0.0003862033481709659, "learning_rate": 6.097560975609756e-08, "loss": -0.0404, "num_tokens": 652000.0, "reward": 0.6875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.9290928840637207, "sampling/importance_sampling_ratio/mean": 1.0004785060882568, "sampling/importance_sampling_ratio/min": 0.23741376399993896, "sampling/sampling_logp_difference/max": 1.437950849533081, "sampling/sampling_logp_difference/mean": 0.015814729034900665, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 957.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 364.25, "completions/mean_terminated_length": 364.25, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.4145900011062622, "epoch": 0.020833333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016944089255073668, "kl": 0.0002639095182530582, "learning_rate": 6.504065040650406e-08, "loss": 0.0, "num_tokens": 691712.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4370158910751343, "sampling/importance_sampling_ratio/mean": 0.9997653961181641, "sampling/importance_sampling_ratio/min": 0.7128937244415283, "sampling/sampling_logp_difference/max": 0.36256861686706543, "sampling/sampling_logp_difference/mean": 0.01335146650671959, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 290.78125, "completions/mean_terminated_length": 290.78125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.4278489351272583, "epoch": 0.022058823529411766, "frac_reward_zero_std": 0.75, "grad_norm": 0.6440057748451199, "kl": 0.00040159851778298616, "learning_rate": 6.910569105691057e-08, "loss": 0.0111, "num_tokens": 725026.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4332549571990967, "sampling/importance_sampling_ratio/mean": 0.999975860118866, "sampling/importance_sampling_ratio/min": 0.6078106164932251, "sampling/sampling_logp_difference/max": 0.497891902923584, "sampling/sampling_logp_difference/mean": 0.015049072913825512, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 930.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 459.90625, "completions/mean_terminated_length": 459.90625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 0.4138451814651489, "epoch": 0.023284313725490197, "frac_reward_zero_std": 0.75, "grad_norm": 0.43899002171518675, "kl": 0.00026487570721656084, "learning_rate": 7.317073170731706e-08, "loss": -0.0054, "num_tokens": 775004.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6485095024108887, "sampling/importance_sampling_ratio/mean": 1.0000149011611938, "sampling/importance_sampling_ratio/min": 0.5139825344085693, "sampling/sampling_logp_difference/max": 0.6655660271644592, "sampling/sampling_logp_difference/mean": 0.012440871447324753, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 294.25, "completions/mean_terminated_length": 294.25, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.421992689371109, "epoch": 0.024509803921568627, "frac_reward_zero_std": 0.5, "grad_norm": 1.0548079014576117, "kl": 0.00034335185773670673, "learning_rate": 7.723577235772358e-08, "loss": 0.0592, "num_tokens": 813484.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.4150763750076294, "sampling/importance_sampling_ratio/mean": 0.999954104423523, "sampling/importance_sampling_ratio/min": 0.571739673614502, "sampling/sampling_logp_difference/max": 0.5590715408325195, "sampling/sampling_logp_difference/mean": 0.014073312282562256, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 297.421875, "completions/mean_terminated_length": 297.421875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.5068727135658264, "epoch": 0.025735294117647058, "frac_reward_zero_std": 0.75, "grad_norm": 0.6815498063584722, "kl": 0.00035881518851965666, "learning_rate": 8.130081300813008e-08, "loss": -0.0085, "num_tokens": 850135.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.5744246244430542, "sampling/importance_sampling_ratio/mean": 0.9998977184295654, "sampling/importance_sampling_ratio/min": 0.6262728571891785, "sampling/sampling_logp_difference/max": 0.4679691791534424, "sampling/sampling_logp_difference/mean": 0.015307108871638775, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 348.015625, "completions/mean_terminated_length": 348.015625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.3931849002838135, "epoch": 0.02696078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.5926775579053882, "kl": 0.0002758222399279475, "learning_rate": 8.536585365853659e-08, "loss": 0.0071, "num_tokens": 892312.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.454338788986206, "sampling/importance_sampling_ratio/mean": 0.9999901652336121, "sampling/importance_sampling_ratio/min": 0.6895697712898254, "sampling/sampling_logp_difference/max": 0.37455129623413086, "sampling/sampling_logp_difference/mean": 0.013142036274075508, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 320.59375, "completions/mean_terminated_length": 320.59375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.580235481262207, "epoch": 0.028186274509803922, "frac_reward_zero_std": 0.25, "grad_norm": 1.0513289536552304, "kl": 0.00033618308953009546, "learning_rate": 8.943089430894309e-08, "loss": -0.0251, "num_tokens": 932830.0, "reward": 0.59375, "reward_std": 0.6601393222808838, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6238267421722412, "sampling/importance_sampling_ratio/mean": 1.0001075267791748, "sampling/importance_sampling_ratio/min": 0.6930760741233826, "sampling/sampling_logp_difference/max": 0.4847855567932129, "sampling/sampling_logp_difference/mean": 0.01707499474287033, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 839.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 293.453125, "completions/mean_terminated_length": 293.453125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.42370331287384033, "epoch": 0.029411764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.01089566682333, "kl": 0.00041805661749094725, "learning_rate": 9.349593495934959e-08, "loss": 0.0792, "num_tokens": 967627.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.5878833532333374, "sampling/importance_sampling_ratio/mean": 0.9999502897262573, "sampling/importance_sampling_ratio/min": 0.6485497951507568, "sampling/sampling_logp_difference/max": 0.4624018669128418, "sampling/sampling_logp_difference/mean": 0.015088239684700966, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 402.171875, "completions/mean_terminated_length": 402.171875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.6353285908699036, "epoch": 0.030637254901960783, "frac_reward_zero_std": 0.25, "grad_norm": 1.3246222040118851, "kl": 0.00041435169987380505, "learning_rate": 9.75609756097561e-08, "loss": -0.0299, "num_tokens": 1014006.0, "reward": -0.28125, "reward_std": 0.6037135720252991, "rewards/decision_reward_func/mean": -0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.9633315801620483, "sampling/importance_sampling_ratio/mean": 0.9997776746749878, "sampling/importance_sampling_ratio/min": 0.4757631719112396, "sampling/sampling_logp_difference/max": 0.7428350448608398, "sampling/sampling_logp_difference/mean": 0.020722277462482452, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 786.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 332.640625, "completions/mean_terminated_length": 332.640625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.33449697494506836, "epoch": 0.031862745098039214, "frac_reward_zero_std": 0.75, "grad_norm": 0.7137086337661988, "kl": 0.00028956064488738775, "learning_rate": 1.016260162601626e-07, "loss": -0.0327, "num_tokens": 1051775.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4378141164779663, "sampling/importance_sampling_ratio/mean": 0.9997680187225342, "sampling/importance_sampling_ratio/min": 0.7045885324478149, "sampling/sampling_logp_difference/max": 0.36312389373779297, "sampling/sampling_logp_difference/mean": 0.0120763648301363, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 372.734375, "completions/mean_terminated_length": 372.734375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.29292771220207214, "epoch": 0.03308823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016105157908655658, "kl": 0.00028421563911251724, "learning_rate": 1.0569105691056911e-07, "loss": 0.0, "num_tokens": 1097518.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3740522861480713, "sampling/importance_sampling_ratio/mean": 0.9999227523803711, "sampling/importance_sampling_ratio/min": 0.5598083138465881, "sampling/sampling_logp_difference/max": 0.5801608562469482, "sampling/sampling_logp_difference/mean": 0.010855845175683498, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 323.03125, "completions/mean_terminated_length": 323.03125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.5288899540901184, "epoch": 0.03431372549019608, "frac_reward_zero_std": 0.75, "grad_norm": 0.6162753251855522, "kl": 0.0004327974165789783, "learning_rate": 1.097560975609756e-07, "loss": -0.0011, "num_tokens": 1134784.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.594279408454895, "sampling/importance_sampling_ratio/mean": 1.0000537633895874, "sampling/importance_sampling_ratio/min": 0.6143806576728821, "sampling/sampling_logp_difference/max": 0.4871406555175781, "sampling/sampling_logp_difference/mean": 0.01772783137857914, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 292.984375, "completions/mean_terminated_length": 292.984375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.4683876633644104, "epoch": 0.03553921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 1.1122165969668638, "kl": 0.00037626695120707154, "learning_rate": 1.1382113821138211e-07, "loss": 0.006, "num_tokens": 1171743.0, "reward": 0.40625, "reward_std": 0.497555673122406, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.40093994140625, "sampling/importance_sampling_ratio/mean": 1.0003654956817627, "sampling/importance_sampling_ratio/min": 0.6677870154380798, "sampling/sampling_logp_difference/max": 0.4037860631942749, "sampling/sampling_logp_difference/mean": 0.015685275197029114, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 306.5625, "completions/mean_terminated_length": 306.5625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.5330952405929565, "epoch": 0.03676470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 1.781766720821268, "kl": 0.00043866573832929134, "learning_rate": 1.1788617886178862e-07, "loss": 0.0705, "num_tokens": 1204803.0, "reward": 0.28125, "reward_std": 0.6601393222808838, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.4311414957046509, "sampling/importance_sampling_ratio/mean": 1.0001766681671143, "sampling/importance_sampling_ratio/min": 0.6262676119804382, "sampling/sampling_logp_difference/max": 0.46797752380371094, "sampling/sampling_logp_difference/mean": 0.017488066107034683, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1612.0, "completions/max_terminated_length": 1612.0, "completions/mean_length": 445.1875, "completions/mean_terminated_length": 445.1875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.38493630290031433, "epoch": 0.03799019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 0.7785536068689526, "kl": 0.0003935876884497702, "learning_rate": 1.219512195121951e-07, "loss": 0.0192, "num_tokens": 1248287.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.551242470741272, "sampling/importance_sampling_ratio/mean": 1.0001380443572998, "sampling/importance_sampling_ratio/min": 0.6033728122711182, "sampling/sampling_logp_difference/max": 0.5052199363708496, "sampling/sampling_logp_difference/mean": 0.013811971992254257, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 321.46875, "completions/mean_terminated_length": 321.46875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.3749151825904846, "epoch": 0.0392156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.7172100914349975, "kl": 0.00035367754753679037, "learning_rate": 1.260162601626016e-07, "loss": 0.0083, "num_tokens": 1293741.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.531674861907959, "sampling/importance_sampling_ratio/mean": 0.999983549118042, "sampling/importance_sampling_ratio/min": 0.6938153505325317, "sampling/sampling_logp_difference/max": 0.4263617992401123, "sampling/sampling_logp_difference/mean": 0.013548271730542183, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 390.375, "completions/mean_terminated_length": 390.375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.43568336963653564, "epoch": 0.04044117647058824, "frac_reward_zero_std": 0.75, "grad_norm": 0.5595435208763134, "kl": 0.0003851033980026841, "learning_rate": 1.3008130081300813e-07, "loss": 0.0237, "num_tokens": 1337541.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.5293048620224, "sampling/importance_sampling_ratio/mean": 1.0002975463867188, "sampling/importance_sampling_ratio/min": 0.5502884387969971, "sampling/sampling_logp_difference/max": 0.5973126888275146, "sampling/sampling_logp_difference/mean": 0.014686541631817818, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 337.8125, "completions/mean_terminated_length": 337.8125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.45803898572921753, "epoch": 0.041666666666666664, "frac_reward_zero_std": 0.25, "grad_norm": 1.1654212619976199, "kl": 0.00036487175384536386, "learning_rate": 1.3414634146341465e-07, "loss": 0.0555, "num_tokens": 1379129.0, "reward": 0.625, "reward_std": 0.5847553014755249, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.5710982084274292, "sampling/importance_sampling_ratio/mean": 1.0002315044403076, "sampling/importance_sampling_ratio/min": 0.4654851257801056, "sampling/sampling_logp_difference/max": 0.7646751403808594, "sampling/sampling_logp_difference/mean": 0.0145266093313694, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 307.21875, "completions/mean_terminated_length": 307.21875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.45345938205718994, "epoch": 0.0428921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 1.003373856040202, "kl": 0.0004271030775271356, "learning_rate": 1.3821138211382114e-07, "loss": -0.0261, "num_tokens": 1418567.0, "reward": 0.375, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.5341546535491943, "sampling/importance_sampling_ratio/mean": 1.0000030994415283, "sampling/importance_sampling_ratio/min": 0.6817525029182434, "sampling/sampling_logp_difference/max": 0.4279794692993164, "sampling/sampling_logp_difference/mean": 0.016810128465294838, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 377.140625, "completions/mean_terminated_length": 377.140625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.4033944606781006, "epoch": 0.04411764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.5525752985034718, "kl": 0.0002919721882790327, "learning_rate": 1.4227642276422763e-07, "loss": -0.0494, "num_tokens": 1463376.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4058122634887695, "sampling/importance_sampling_ratio/mean": 0.9999144077301025, "sampling/importance_sampling_ratio/min": 0.4787677526473999, "sampling/sampling_logp_difference/max": 0.7365397214889526, "sampling/sampling_logp_difference/mean": 0.013767299242317677, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 807.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 402.765625, "completions/mean_terminated_length": 402.765625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.5617048144340515, "epoch": 0.04534313725490196, "frac_reward_zero_std": 0.5, "grad_norm": 0.8330850570786604, "kl": 0.0003554814029484987, "learning_rate": 1.4634146341463413e-07, "loss": 0.0024, "num_tokens": 1508049.0, "reward": 0.125, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.4349106550216675, "sampling/importance_sampling_ratio/mean": 1.000146508216858, "sampling/importance_sampling_ratio/min": 0.6210473775863647, "sampling/sampling_logp_difference/max": 0.4763479232788086, "sampling/sampling_logp_difference/mean": 0.015833299607038498, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 408.53125, "completions/mean_terminated_length": 408.53125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.40969347953796387, "epoch": 0.04656862745098039, "frac_reward_zero_std": 0.25, "grad_norm": 0.9847192126760824, "kl": 0.0003739820676855743, "learning_rate": 1.5040650406504065e-07, "loss": -0.0149, "num_tokens": 1554979.0, "reward": -0.15625, "reward_std": 0.625, "rewards/decision_reward_func/mean": -0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.6210757493972778, "sampling/importance_sampling_ratio/mean": 0.9999745488166809, "sampling/importance_sampling_ratio/min": 0.44078096747398376, "sampling/sampling_logp_difference/max": 0.8192071914672852, "sampling/sampling_logp_difference/mean": 0.015138452872633934, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 850.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 420.1875, "completions/mean_terminated_length": 420.1875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.6661462783813477, "epoch": 0.04779411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 1.1421437039632967, "kl": 0.0004234796797391027, "learning_rate": 1.5447154471544717e-07, "loss": 0.0316, "num_tokens": 1596223.0, "reward": 0.25, "reward_std": 0.8785127401351929, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.782810091972351, "sampling/importance_sampling_ratio/mean": 0.9998263120651245, "sampling/importance_sampling_ratio/min": 0.5805458426475525, "sampling/sampling_logp_difference/max": 0.578190803527832, "sampling/sampling_logp_difference/mean": 0.019840117543935776, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 364.4375, "completions/mean_terminated_length": 364.4375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.5669698715209961, "epoch": 0.049019607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013664377387275013, "kl": 0.00036802375689148903, "learning_rate": 1.5853658536585366e-07, "loss": 0.0, "num_tokens": 1637611.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.402492880821228, "sampling/importance_sampling_ratio/mean": 0.9998829364776611, "sampling/importance_sampling_ratio/min": 0.613205075263977, "sampling/sampling_logp_difference/max": 0.489055871963501, "sampling/sampling_logp_difference/mean": 0.017242010682821274, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 424.59375, "completions/mean_terminated_length": 424.59375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "entropy": 0.6023030281066895, "epoch": 0.05024509803921569, "frac_reward_zero_std": 0.0, "grad_norm": 1.0523601053350122, "kl": 0.0003144932852592319, "learning_rate": 1.6260162601626016e-07, "loss": 0.028, "num_tokens": 1681025.0, "reward": 0.125, "reward_std": 0.8415650129318237, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.575642704963684, "sampling/importance_sampling_ratio/mean": 0.9997628927230835, "sampling/importance_sampling_ratio/min": 0.5217536687850952, "sampling/sampling_logp_difference/max": 0.650559663772583, "sampling/sampling_logp_difference/mean": 0.017206106334924698, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 334.5, "completions/mean_terminated_length": 334.5, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.4098169803619385, "epoch": 0.051470588235294115, "frac_reward_zero_std": 0.75, "grad_norm": 0.6653260116103709, "kl": 0.0003027720667887479, "learning_rate": 1.6666666666666665e-07, "loss": -0.0185, "num_tokens": 1720097.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5748292207717896, "sampling/importance_sampling_ratio/mean": 1.000213623046875, "sampling/importance_sampling_ratio/min": 0.6254992485046387, "sampling/sampling_logp_difference/max": 0.4692051410675049, "sampling/sampling_logp_difference/mean": 0.013513647019863129, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 288.75, "completions/mean_terminated_length": 288.75, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.42680129408836365, "epoch": 0.05269607843137255, "frac_reward_zero_std": 0.5, "grad_norm": 1.0140492288667766, "kl": 0.0003811779897660017, "learning_rate": 1.7073170731707317e-07, "loss": 0.0042, "num_tokens": 1753409.0, "reward": 0.3125, "reward_std": 0.42898139357566833, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.5556113719940186, "sampling/importance_sampling_ratio/mean": 0.9999961853027344, "sampling/importance_sampling_ratio/min": 0.6445651054382324, "sampling/sampling_logp_difference/max": 0.44186854362487793, "sampling/sampling_logp_difference/mean": 0.015065609477460384, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 332.859375, "completions/mean_terminated_length": 332.859375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.48518580198287964, "epoch": 0.05392156862745098, "frac_reward_zero_std": 0.5, "grad_norm": 1.0337397642319142, "kl": 0.00035097202635370195, "learning_rate": 1.7479674796747966e-07, "loss": -0.0035, "num_tokens": 1793144.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.44613516330719, "sampling/importance_sampling_ratio/mean": 1.0003323554992676, "sampling/importance_sampling_ratio/min": 0.6788093447685242, "sampling/sampling_logp_difference/max": 0.38741493225097656, "sampling/sampling_logp_difference/mean": 0.015649795532226562, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 358.109375, "completions/mean_terminated_length": 358.109375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.5233907699584961, "epoch": 0.05514705882352941, "frac_reward_zero_std": 0.25, "grad_norm": 1.184099770106814, "kl": 0.0003698566579259932, "learning_rate": 1.7886178861788619e-07, "loss": -0.0585, "num_tokens": 1845951.0, "reward": 0.65625, "reward_std": 0.5827301740646362, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.6493314504623413, "sampling/importance_sampling_ratio/mean": 1.0001838207244873, "sampling/importance_sampling_ratio/min": 0.5599596500396729, "sampling/sampling_logp_difference/max": 0.5798904895782471, "sampling/sampling_logp_difference/mean": 0.017032615840435028, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 283.828125, "completions/mean_terminated_length": 283.828125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.43020832538604736, "epoch": 0.056372549019607844, "frac_reward_zero_std": 0.5, "grad_norm": 1.1491311358225167, "kl": 0.0005243588821031153, "learning_rate": 1.8292682926829268e-07, "loss": 0.0003, "num_tokens": 1876036.0, "reward": 0.40625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.9443782567977905, "sampling/importance_sampling_ratio/mean": 0.9994151592254639, "sampling/importance_sampling_ratio/min": 0.6690720319747925, "sampling/sampling_logp_difference/max": 0.6649422645568848, "sampling/sampling_logp_difference/mean": 0.016582967713475227, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1105.0, "completions/max_terminated_length": 1105.0, "completions/mean_length": 406.96875, "completions/mean_terminated_length": 406.96875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.44694045186042786, "epoch": 0.05759803921568627, "frac_reward_zero_std": 0.5, "grad_norm": 0.7169723830807738, "kl": 0.0003126176306977868, "learning_rate": 1.8699186991869917e-07, "loss": -0.0403, "num_tokens": 1918034.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5656219720840454, "sampling/importance_sampling_ratio/mean": 1.0000123977661133, "sampling/importance_sampling_ratio/min": 0.7265259027481079, "sampling/sampling_logp_difference/max": 0.44828319549560547, "sampling/sampling_logp_difference/mean": 0.014549804851412773, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 333.171875, "completions/mean_terminated_length": 333.171875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.3466792404651642, "epoch": 0.058823529411764705, "frac_reward_zero_std": 0.5, "grad_norm": 0.9365656190483648, "kl": 0.00033920869464054704, "learning_rate": 1.910569105691057e-07, "loss": -0.0023, "num_tokens": 1954109.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.7523581981658936, "sampling/importance_sampling_ratio/mean": 1.000047206878662, "sampling/importance_sampling_ratio/min": 0.6802659034729004, "sampling/sampling_logp_difference/max": 0.560962438583374, "sampling/sampling_logp_difference/mean": 0.012411970645189285, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 404.34375, "completions/mean_terminated_length": 404.34375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.46099793910980225, "epoch": 0.06004901960784314, "frac_reward_zero_std": 0.75, "grad_norm": 0.5381620781046295, "kl": 0.00031924547511152923, "learning_rate": 1.951219512195122e-07, "loss": -0.0386, "num_tokens": 1998371.0, "reward": -0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": -0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.4658123254776, "sampling/importance_sampling_ratio/mean": 0.9998260140419006, "sampling/importance_sampling_ratio/min": 0.6261754631996155, "sampling/sampling_logp_difference/max": 0.4681246280670166, "sampling/sampling_logp_difference/mean": 0.01451820507645607, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 289.890625, "completions/mean_terminated_length": 289.890625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.427793025970459, "epoch": 0.061274509803921566, "frac_reward_zero_std": 0.5, "grad_norm": 1.508885053398108, "kl": 0.000365684914868325, "learning_rate": 1.9918699186991868e-07, "loss": -0.0188, "num_tokens": 2033596.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4335907697677612, "sampling/importance_sampling_ratio/mean": 0.9992477297782898, "sampling/importance_sampling_ratio/min": 0.6182100772857666, "sampling/sampling_logp_difference/max": 0.4809269905090332, "sampling/sampling_logp_difference/mean": 0.01484981831163168, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 347.984375, "completions/mean_terminated_length": 347.984375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.5368505716323853, "epoch": 0.0625, "frac_reward_zero_std": 0.25, "grad_norm": 0.9863767628204594, "kl": 0.00038946475251577795, "learning_rate": 2.032520325203252e-07, "loss": -0.002, "num_tokens": 2073403.0, "reward": 0.71875, "reward_std": 0.5959457159042358, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.5744339227676392, "sampling/importance_sampling_ratio/mean": 1.0001511573791504, "sampling/importance_sampling_ratio/min": 0.5771892666816711, "sampling/sampling_logp_difference/max": 0.5495851039886475, "sampling/sampling_logp_difference/mean": 0.01756846159696579, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 416.015625, "completions/mean_terminated_length": 416.015625, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.5500391125679016, "epoch": 0.06372549019607843, "frac_reward_zero_std": 0.25, "grad_norm": 0.9918905206181959, "kl": 0.0003417959378566593, "learning_rate": 2.073170731707317e-07, "loss": -0.0286, "num_tokens": 2125852.0, "reward": 0.625, "reward_std": 0.5847553014755249, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.4133186340332031, "sampling/importance_sampling_ratio/mean": 0.9998364448547363, "sampling/importance_sampling_ratio/min": 0.6397423148155212, "sampling/sampling_logp_difference/max": 0.4466899037361145, "sampling/sampling_logp_difference/mean": 0.01627325639128685, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 284.703125, "completions/mean_terminated_length": 284.703125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.5966601967811584, "epoch": 0.06495098039215687, "frac_reward_zero_std": 0.75, "grad_norm": 0.6765611974359862, "kl": 0.00041398147004656494, "learning_rate": 2.1138211382113822e-07, "loss": 0.007, "num_tokens": 2165689.0, "reward": -0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.443811058998108, "sampling/importance_sampling_ratio/mean": 0.9996282458305359, "sampling/importance_sampling_ratio/min": 0.7229898571968079, "sampling/sampling_logp_difference/max": 0.36728620529174805, "sampling/sampling_logp_difference/mean": 0.018271643668413162, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 319.15625, "completions/mean_terminated_length": 319.15625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.6160104274749756, "epoch": 0.0661764705882353, "frac_reward_zero_std": 0.25, "grad_norm": 1.0146191390487427, "kl": 0.0003690510638989508, "learning_rate": 2.154471544715447e-07, "loss": -0.0133, "num_tokens": 2201315.0, "reward": 0.125, "reward_std": 0.6047805547714233, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.5717647075653076, "sampling/importance_sampling_ratio/mean": 1.0002620220184326, "sampling/importance_sampling_ratio/min": 0.522520899772644, "sampling/sampling_logp_difference/max": 0.6490902900695801, "sampling/sampling_logp_difference/mean": 0.018170783296227455, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1063.0, "completions/max_terminated_length": 1063.0, "completions/mean_length": 416.484375, "completions/mean_terminated_length": 416.484375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.5158703327178955, "epoch": 0.06740196078431372, "frac_reward_zero_std": 0.0, "grad_norm": 1.0558154757792615, "kl": 0.0002765360986813903, "learning_rate": 2.195121951219512e-07, "loss": 0.0454, "num_tokens": 2252818.0, "reward": 0.6875, "reward_std": 0.6311737298965454, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.8064885139465332, "sampling/importance_sampling_ratio/mean": 1.00006103515625, "sampling/importance_sampling_ratio/min": 0.6526806950569153, "sampling/sampling_logp_difference/max": 0.5913848876953125, "sampling/sampling_logp_difference/mean": 0.015147916041314602, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 368.296875, "completions/mean_terminated_length": 368.296875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.33448028564453125, "epoch": 0.06862745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025046965766391018, "kl": 0.00033337785862386227, "learning_rate": 2.235772357723577e-07, "loss": 0.0, "num_tokens": 2295301.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4773266315460205, "sampling/importance_sampling_ratio/mean": 1.0005863904953003, "sampling/importance_sampling_ratio/min": 0.6299718022346497, "sampling/sampling_logp_difference/max": 0.4620802402496338, "sampling/sampling_logp_difference/mean": 0.012091251090168953, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1202.0, "completions/max_terminated_length": 1202.0, "completions/mean_length": 399.171875, "completions/mean_terminated_length": 399.171875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.4278004765510559, "epoch": 0.06985294117647059, "frac_reward_zero_std": 0.25, "grad_norm": 0.9399848301097212, "kl": 0.0003223472158424556, "learning_rate": 2.2764227642276422e-07, "loss": 0.0451, "num_tokens": 2337776.0, "reward": 0.59375, "reward_std": 0.5977387428283691, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.508169174194336, "sampling/importance_sampling_ratio/mean": 1.0000829696655273, "sampling/importance_sampling_ratio/min": 0.6771644949913025, "sampling/sampling_logp_difference/max": 0.4108964204788208, "sampling/sampling_logp_difference/mean": 0.013593319803476334, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1163.0, "completions/max_terminated_length": 1163.0, "completions/mean_length": 371.15625, "completions/mean_terminated_length": 371.15625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.39865872263908386, "epoch": 0.07107843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.45981008351450436, "kl": 0.00032601162092760205, "learning_rate": 2.3170731707317074e-07, "loss": -0.0031, "num_tokens": 2375402.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.3864145278930664, "sampling/importance_sampling_ratio/mean": 1.0000454187393188, "sampling/importance_sampling_ratio/min": 0.6655469536781311, "sampling/sampling_logp_difference/max": 0.4071460962295532, "sampling/sampling_logp_difference/mean": 0.013254405930638313, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 300.25, "completions/mean_terminated_length": 300.25, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.3187522292137146, "epoch": 0.07230392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015168639888374267, "kl": 0.0003256848722230643, "learning_rate": 2.3577235772357723e-07, "loss": 0.0, "num_tokens": 2408954.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4949426651000977, "sampling/importance_sampling_ratio/mean": 0.9997645616531372, "sampling/importance_sampling_ratio/min": 0.6134526133537292, "sampling/sampling_logp_difference/max": 0.48865222930908203, "sampling/sampling_logp_difference/mean": 0.012079503387212753, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 289.296875, "completions/mean_terminated_length": 289.296875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.554637610912323, "epoch": 0.07352941176470588, "frac_reward_zero_std": 0.25, "grad_norm": 1.2207932483871313, "kl": 0.0004153632908128202, "learning_rate": 2.3983739837398373e-07, "loss": -0.073, "num_tokens": 2441933.0, "reward": 0.0625, "reward_std": 0.690913200378418, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4753808975219727, "sampling/importance_sampling_ratio/mean": 1.0002830028533936, "sampling/importance_sampling_ratio/min": 0.6483654379844666, "sampling/sampling_logp_difference/max": 0.4333007335662842, "sampling/sampling_logp_difference/mean": 0.017418142408132553, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 894.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 376.671875, "completions/mean_terminated_length": 376.671875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.3899710476398468, "epoch": 0.07475490196078431, "frac_reward_zero_std": 0.75, "grad_norm": 0.6864020413841946, "kl": 0.000373355345800519, "learning_rate": 2.439024390243902e-07, "loss": 0.0085, "num_tokens": 2485864.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4134167432785034, "sampling/importance_sampling_ratio/mean": 0.9999734163284302, "sampling/importance_sampling_ratio/min": 0.3368506133556366, "sampling/sampling_logp_difference/max": 1.0881158113479614, "sampling/sampling_logp_difference/mean": 0.013359070755541325, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 954.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 368.890625, "completions/mean_terminated_length": 368.890625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.502585768699646, "epoch": 0.07598039215686274, "frac_reward_zero_std": 0.5, "grad_norm": 0.8645485047034707, "kl": 0.00038482306990772486, "learning_rate": 2.479674796747967e-07, "loss": 0.0136, "num_tokens": 2527153.0, "reward": 0.5625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.522111177444458, "sampling/importance_sampling_ratio/mean": 0.9997476935386658, "sampling/importance_sampling_ratio/min": 0.4823719561100006, "sampling/sampling_logp_difference/max": 0.7290398478507996, "sampling/sampling_logp_difference/mean": 0.017361216247081757, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 272.109375, "completions/mean_terminated_length": 272.109375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.46052178740501404, "epoch": 0.07720588235294118, "frac_reward_zero_std": 0.75, "grad_norm": 0.6845973311088729, "kl": 0.0005387598648667336, "learning_rate": 2.520325203252032e-07, "loss": 0.0169, "num_tokens": 2558824.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4145008325576782, "sampling/importance_sampling_ratio/mean": 1.0002951622009277, "sampling/importance_sampling_ratio/min": 0.16017460823059082, "sampling/sampling_logp_difference/max": 1.8314907550811768, "sampling/sampling_logp_difference/mean": 0.015504274517297745, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 384.859375, "completions/mean_terminated_length": 384.859375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.5026362538337708, "epoch": 0.0784313725490196, "frac_reward_zero_std": 0.5, "grad_norm": 1.063493773635962, "kl": 0.00030419701943174005, "learning_rate": 2.5609756097560976e-07, "loss": -0.0332, "num_tokens": 2604383.0, "reward": 0.4375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6563924551010132, "sampling/importance_sampling_ratio/mean": 1.0000817775726318, "sampling/importance_sampling_ratio/min": 0.538443386554718, "sampling/sampling_logp_difference/max": 0.6190729737281799, "sampling/sampling_logp_difference/mean": 0.015331205911934376, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1316.0, "completions/max_terminated_length": 1316.0, "completions/mean_length": 349.796875, "completions/mean_terminated_length": 349.796875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.38261836767196655, "epoch": 0.07965686274509803, "frac_reward_zero_std": 0.75, "grad_norm": 0.6365407690602217, "kl": 0.00035536885843612254, "learning_rate": 2.6016260162601625e-07, "loss": -0.0344, "num_tokens": 2646082.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.9044489860534668, "sampling/importance_sampling_ratio/mean": 1.0001323223114014, "sampling/importance_sampling_ratio/min": 0.642989993095398, "sampling/sampling_logp_difference/max": 0.6441926956176758, "sampling/sampling_logp_difference/mean": 0.012886488810181618, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 298.171875, "completions/mean_terminated_length": 298.171875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.5439075231552124, "epoch": 0.08088235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.8796650184603221, "kl": 0.00044345733476802707, "learning_rate": 2.6422764227642274e-07, "loss": 0.047, "num_tokens": 2691997.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.418542504310608, "sampling/importance_sampling_ratio/mean": 0.9999518990516663, "sampling/importance_sampling_ratio/min": 0.6812332272529602, "sampling/sampling_logp_difference/max": 0.3838505744934082, "sampling/sampling_logp_difference/mean": 0.017620086669921875, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2014.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 407.609375, "completions/mean_terminated_length": 407.609375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.4749307930469513, "epoch": 0.0821078431372549, "frac_reward_zero_std": 0.5, "grad_norm": 1.0533004841009321, "kl": 0.0003975223808083683, "learning_rate": 2.682926829268293e-07, "loss": 0.2147, "num_tokens": 2734068.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.4569950103759766, "sampling/importance_sampling_ratio/mean": 1.0000975131988525, "sampling/importance_sampling_ratio/min": 0.5918757319450378, "sampling/sampling_logp_difference/max": 0.5244585871696472, "sampling/sampling_logp_difference/mean": 0.015808656811714172, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1419.0, "completions/max_terminated_length": 1419.0, "completions/mean_length": 424.34375, "completions/mean_terminated_length": 424.34375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.4416956901550293, "epoch": 0.08333333333333333, "frac_reward_zero_std": 0.25, "grad_norm": 0.8696816969461196, "kl": 0.00032891443697735667, "learning_rate": 2.7235772357723573e-07, "loss": -0.0011, "num_tokens": 2786618.0, "reward": 0.5, "reward_std": 0.7191373109817505, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9283627271652222, "sampling/importance_sampling_ratio/mean": 1.000214695930481, "sampling/importance_sampling_ratio/min": 0.6187191009521484, "sampling/sampling_logp_difference/max": 0.6566712856292725, "sampling/sampling_logp_difference/mean": 0.013868343085050583, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 356.171875, "completions/mean_terminated_length": 356.171875, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.592164158821106, "epoch": 0.08455882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 0.9010277625117132, "kl": 0.0004434102156665176, "learning_rate": 2.764227642276423e-07, "loss": 0.0521, "num_tokens": 2825893.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.4771404266357422, "sampling/importance_sampling_ratio/mean": 1.0000978708267212, "sampling/importance_sampling_ratio/min": 0.3751973807811737, "sampling/sampling_logp_difference/max": 0.9803031086921692, "sampling/sampling_logp_difference/mean": 0.01848888397216797, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1051.0, "completions/max_terminated_length": 1051.0, "completions/mean_length": 397.296875, "completions/mean_terminated_length": 397.296875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.5564912557601929, "epoch": 0.0857843137254902, "frac_reward_zero_std": 0.25, "grad_norm": 1.0258177891373412, "kl": 0.0003644720418378711, "learning_rate": 2.8048780487804877e-07, "loss": 0.031, "num_tokens": 2873960.0, "reward": 0.4375, "reward_std": 0.690913200378418, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6529303789138794, "sampling/importance_sampling_ratio/mean": 0.999885082244873, "sampling/importance_sampling_ratio/min": 0.5989751815795898, "sampling/sampling_logp_difference/max": 0.5125350952148438, "sampling/sampling_logp_difference/mean": 0.016418159008026123, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 890.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 357.65625, "completions/mean_terminated_length": 357.65625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.37217697501182556, "epoch": 0.08700980392156862, "frac_reward_zero_std": 0.75, "grad_norm": 0.5501979955160462, "kl": 0.00031230325112119317, "learning_rate": 2.8455284552845527e-07, "loss": 0.0121, "num_tokens": 2912258.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.7435591220855713, "sampling/importance_sampling_ratio/mean": 0.9997398853302002, "sampling/importance_sampling_ratio/min": 0.606547474861145, "sampling/sampling_logp_difference/max": 0.5559284687042236, "sampling/sampling_logp_difference/mean": 0.013133936561644077, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 839.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 458.328125, "completions/mean_terminated_length": 458.328125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.54712975025177, "epoch": 0.08823529411764706, "frac_reward_zero_std": 0.25, "grad_norm": 0.8203648571036939, "kl": 0.00026464861002750695, "learning_rate": 2.886178861788618e-07, "loss": -0.0117, "num_tokens": 2958007.0, "reward": 0.53125, "reward_std": 0.6833621263504028, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4207580089569092, "sampling/importance_sampling_ratio/mean": 0.9999531507492065, "sampling/importance_sampling_ratio/min": 0.6992238759994507, "sampling/sampling_logp_difference/max": 0.3577842712402344, "sampling/sampling_logp_difference/mean": 0.014530394226312637, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 415.8125, "completions/mean_terminated_length": 415.8125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.512789249420166, "epoch": 0.08946078431372549, "frac_reward_zero_std": 0.5, "grad_norm": 0.8694949951240943, "kl": 0.00038676554686389863, "learning_rate": 2.9268292682926825e-07, "loss": -0.0146, "num_tokens": 3003435.0, "reward": 0.03125, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002280473709106, "sampling/importance_sampling_ratio/min": 0.6483097672462463, "sampling/sampling_logp_difference/max": 1.2181992530822754, "sampling/sampling_logp_difference/mean": 0.015799831598997116, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 400.9375, "completions/mean_terminated_length": 400.9375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3829348683357239, "epoch": 0.09068627450980392, "frac_reward_zero_std": 0.75, "grad_norm": 0.5752153661409692, "kl": 0.00036305381217971444, "learning_rate": 2.967479674796748e-07, "loss": -0.0331, "num_tokens": 3042343.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.530640959739685, "sampling/importance_sampling_ratio/mean": 1.0005073547363281, "sampling/importance_sampling_ratio/min": 0.18678990006446838, "sampling/sampling_logp_difference/max": 1.6777708530426025, "sampling/sampling_logp_difference/mean": 0.013566285371780396, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 355.25, "completions/mean_terminated_length": 355.25, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.5245380401611328, "epoch": 0.09191176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.7605525421522583, "kl": 0.00025241338880732656, "learning_rate": 3.008130081300813e-07, "loss": 0.0339, "num_tokens": 3082503.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6428433656692505, "sampling/importance_sampling_ratio/mean": 1.0002483129501343, "sampling/importance_sampling_ratio/min": 0.6353777647018433, "sampling/sampling_logp_difference/max": 0.4964284896850586, "sampling/sampling_logp_difference/mean": 0.015040399506688118, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 308.671875, "completions/mean_terminated_length": 308.671875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3515089452266693, "epoch": 0.09313725490196079, "frac_reward_zero_std": 0.5, "grad_norm": 1.1141612926503381, "kl": 0.00038636988028883934, "learning_rate": 3.048780487804878e-07, "loss": 0.1172, "num_tokens": 3116610.0, "reward": 0.4375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.3789145946502686, "sampling/importance_sampling_ratio/mean": 0.9997060298919678, "sampling/importance_sampling_ratio/min": 0.6745181679725647, "sampling/sampling_logp_difference/max": 0.3937567472457886, "sampling/sampling_logp_difference/mean": 0.012528663501143456, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 907.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 385.421875, "completions/mean_terminated_length": 385.421875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.43520116806030273, "epoch": 0.09436274509803921, "frac_reward_zero_std": 0.5, "grad_norm": 0.8338349938177765, "kl": 0.0003671252343337983, "learning_rate": 3.0894308943089434e-07, "loss": -0.0057, "num_tokens": 3158637.0, "reward": 0.625, "reward_std": 0.481805682182312, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.7343062162399292, "sampling/importance_sampling_ratio/mean": 1.0001869201660156, "sampling/importance_sampling_ratio/min": 0.48710179328918457, "sampling/sampling_logp_difference/max": 0.7192821502685547, "sampling/sampling_logp_difference/mean": 0.014364716596901417, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 465.765625, "completions/mean_terminated_length": 465.765625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.47529512643814087, "epoch": 0.09558823529411764, "frac_reward_zero_std": 0.25, "grad_norm": 0.8957899146651656, "kl": 0.0003479773295111954, "learning_rate": 3.130081300813008e-07, "loss": -0.0312, "num_tokens": 3202990.0, "reward": 0.125, "reward_std": 0.6311737298965454, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.4322385787963867, "sampling/importance_sampling_ratio/mean": 0.9997146129608154, "sampling/importance_sampling_ratio/min": 0.6279973983764648, "sampling/sampling_logp_difference/max": 0.46521925926208496, "sampling/sampling_logp_difference/mean": 0.014884884469211102, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 388.609375, "completions/mean_terminated_length": 388.609375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.5224838852882385, "epoch": 0.09681372549019608, "frac_reward_zero_std": 0.5, "grad_norm": 0.8214640544428786, "kl": 0.0002836015191860497, "learning_rate": 3.170731707317073e-07, "loss": -0.029, "num_tokens": 3249829.0, "reward": 0.03125, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.7255699634552002, "sampling/importance_sampling_ratio/mean": 0.9999824166297913, "sampling/importance_sampling_ratio/min": 0.6698716878890991, "sampling/sampling_logp_difference/max": 0.5455573797225952, "sampling/sampling_logp_difference/mean": 0.01447225734591484, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 359.5, "completions/mean_terminated_length": 359.5, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.58916836977005, "epoch": 0.09803921568627451, "frac_reward_zero_std": 0.25, "grad_norm": 0.982593935552021, "kl": 0.00036830941098742187, "learning_rate": 3.211382113821138e-07, "loss": -0.0079, "num_tokens": 3292373.0, "reward": -0.125, "reward_std": 0.6143567562103271, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003821849822998, "sampling/importance_sampling_ratio/min": 0.6957470774650574, "sampling/sampling_logp_difference/max": 0.7556667327880859, "sampling/sampling_logp_difference/mean": 0.017575817182660103, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 277.46875, "completions/mean_terminated_length": 277.46875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.4605231285095215, "epoch": 0.09926470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9250344890648465, "kl": 0.0005266641965135932, "learning_rate": 3.252032520325203e-07, "loss": -0.013, "num_tokens": 3328259.0, "reward": -0.3125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": -0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003132820129395, "sampling/importance_sampling_ratio/min": 0.4708278775215149, "sampling/sampling_logp_difference/max": 0.7532627582550049, "sampling/sampling_logp_difference/mean": 0.016812246292829514, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 291.296875, "completions/mean_terminated_length": 291.296875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.44603225588798523, "epoch": 0.10049019607843138, "frac_reward_zero_std": 0.75, "grad_norm": 0.602864540154558, "kl": 0.00035364821087569, "learning_rate": 3.292682926829268e-07, "loss": -0.0178, "num_tokens": 3361686.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999102354049683, "sampling/importance_sampling_ratio/min": 0.4882053732872009, "sampling/sampling_logp_difference/max": 0.9384479522705078, "sampling/sampling_logp_difference/mean": 0.014547828584909439, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 295.140625, "completions/mean_terminated_length": 295.140625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.4208294749259949, "epoch": 0.1017156862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.001779657233973569, "kl": 0.0004164364654570818, "learning_rate": 3.333333333333333e-07, "loss": 0.0, "num_tokens": 3394719.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5679559707641602, "sampling/importance_sampling_ratio/mean": 1.000484585762024, "sampling/importance_sampling_ratio/min": 0.623586893081665, "sampling/sampling_logp_difference/max": 0.47226715087890625, "sampling/sampling_logp_difference/mean": 0.015483451075851917, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1030.0, "completions/max_terminated_length": 1030.0, "completions/mean_length": 476.9375, "completions/mean_terminated_length": 476.9375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.4867349863052368, "epoch": 0.10294117647058823, "frac_reward_zero_std": 0.25, "grad_norm": 0.9420804322336473, "kl": 0.00032511126482859254, "learning_rate": 3.3739837398373985e-07, "loss": -0.0544, "num_tokens": 3446731.0, "reward": 0.75, "reward_std": 0.5351393222808838, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.5752092599868774, "sampling/importance_sampling_ratio/mean": 1.000123381614685, "sampling/importance_sampling_ratio/min": 0.5812228918075562, "sampling/sampling_logp_difference/max": 0.5426208972930908, "sampling/sampling_logp_difference/mean": 0.014571974985301495, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1774.0, "completions/max_terminated_length": 1774.0, "completions/mean_length": 450.96875, "completions/mean_terminated_length": 450.96875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.41511982679367065, "epoch": 0.10416666666666667, "frac_reward_zero_std": 0.75, "grad_norm": 0.530525518295596, "kl": 0.0003260596131440252, "learning_rate": 3.4146341463414634e-07, "loss": -0.0557, "num_tokens": 3497145.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000139832496643, "sampling/importance_sampling_ratio/min": 0.5673341751098633, "sampling/sampling_logp_difference/max": 0.8062944412231445, "sampling/sampling_logp_difference/mean": 0.01370926946401596, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 261.140625, "completions/mean_terminated_length": 261.140625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.4012688994407654, "epoch": 0.1053921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 1.2067698586036253, "kl": 0.00040133169386535883, "learning_rate": 3.4552845528455284e-07, "loss": -0.0205, "num_tokens": 3529410.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.575060248374939, "sampling/importance_sampling_ratio/mean": 1.000122308731079, "sampling/importance_sampling_ratio/min": 0.6379515528678894, "sampling/sampling_logp_difference/max": 0.45429348945617676, "sampling/sampling_logp_difference/mean": 0.01421426422894001, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 353.890625, "completions/mean_terminated_length": 353.890625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.5234670042991638, "epoch": 0.10661764705882353, "frac_reward_zero_std": 0.25, "grad_norm": 1.1560640441383245, "kl": 0.0004347137291915715, "learning_rate": 3.4959349593495933e-07, "loss": 0.0258, "num_tokens": 3570043.0, "reward": -0.21875, "reward_std": 0.6601393222808838, "rewards/decision_reward_func/mean": -0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005996227264404, "sampling/importance_sampling_ratio/min": 0.35779228806495667, "sampling/sampling_logp_difference/max": 1.0278027057647705, "sampling/sampling_logp_difference/mean": 0.01560912374407053, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 341.203125, "completions/mean_terminated_length": 341.203125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.36407971382141113, "epoch": 0.10784313725490197, "frac_reward_zero_std": 0.75, "grad_norm": 0.6119029430945078, "kl": 0.0005802462110295892, "learning_rate": 3.536585365853658e-07, "loss": -0.004, "num_tokens": 3606664.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.7188796997070312, "sampling/importance_sampling_ratio/mean": 1.000808596611023, "sampling/importance_sampling_ratio/min": 0.5300337672233582, "sampling/sampling_logp_difference/max": 0.6348145008087158, "sampling/sampling_logp_difference/mean": 0.014325735159218311, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 394.0, "completions/mean_terminated_length": 394.0, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.34408533573150635, "epoch": 0.1090686274509804, "frac_reward_zero_std": 0.5, "grad_norm": 0.8399432875525776, "kl": 0.00039662697236053646, "learning_rate": 3.5772357723577237e-07, "loss": 0.0341, "num_tokens": 3650584.0, "reward": 0.6875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.6265212297439575, "sampling/importance_sampling_ratio/mean": 1.0002756118774414, "sampling/importance_sampling_ratio/min": 0.6422431468963623, "sampling/sampling_logp_difference/max": 0.48644351959228516, "sampling/sampling_logp_difference/mean": 0.01306096650660038, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1060.0, "completions/max_terminated_length": 1060.0, "completions/mean_length": 350.015625, "completions/mean_terminated_length": 350.015625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.5070651769638062, "epoch": 0.11029411764705882, "frac_reward_zero_std": 0.25, "grad_norm": 1.1389525807105187, "kl": 0.00040948143578134477, "learning_rate": 3.6178861788617886e-07, "loss": 0.0611, "num_tokens": 3691785.0, "reward": 0.3125, "reward_std": 0.5915650129318237, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.441906452178955, "sampling/importance_sampling_ratio/mean": 1.0000383853912354, "sampling/importance_sampling_ratio/min": 0.536314845085144, "sampling/sampling_logp_difference/max": 0.623033881187439, "sampling/sampling_logp_difference/mean": 0.016554951667785645, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 315.859375, "completions/mean_terminated_length": 315.859375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.4514394998550415, "epoch": 0.11151960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 0.6338392444695511, "kl": 0.0003176363534294069, "learning_rate": 3.6585365853658536e-07, "loss": 0.0021, "num_tokens": 3731424.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4724797010421753, "sampling/importance_sampling_ratio/mean": 1.0000628232955933, "sampling/importance_sampling_ratio/min": 0.6368950605392456, "sampling/sampling_logp_difference/max": 0.45115041732788086, "sampling/sampling_logp_difference/mean": 0.014392010867595673, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 288.34375, "completions/mean_terminated_length": 288.34375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.35438281297683716, "epoch": 0.11274509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019683524070327175, "kl": 0.00037295278161764145, "learning_rate": 3.6991869918699185e-07, "loss": 0.0, "num_tokens": 3763222.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.72694730758667, "sampling/importance_sampling_ratio/mean": 0.9995256662368774, "sampling/importance_sampling_ratio/min": 0.7039896249771118, "sampling/sampling_logp_difference/max": 0.5463552474975586, "sampling/sampling_logp_difference/mean": 0.012642355635762215, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1455.0, "completions/max_terminated_length": 1455.0, "completions/mean_length": 567.328125, "completions/mean_terminated_length": 567.328125, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "entropy": 0.39885467290878296, "epoch": 0.11397058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 0.6757025474620525, "kl": 0.00025323120644316077, "learning_rate": 3.7398373983739835e-07, "loss": 0.0231, "num_tokens": 3818491.0, "reward": 0.625, "reward_std": 0.5, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.7607970237731934, "sampling/importance_sampling_ratio/mean": 1.0002464056015015, "sampling/importance_sampling_ratio/min": 0.2991294264793396, "sampling/sampling_logp_difference/max": 1.206878900527954, "sampling/sampling_logp_difference/mean": 0.011584034189581871, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 249.390625, "completions/mean_terminated_length": 249.390625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.4121914505958557, "epoch": 0.11519607843137254, "frac_reward_zero_std": 0.5, "grad_norm": 1.044821951680086, "kl": 0.0005148970521986485, "learning_rate": 3.7804878048780484e-07, "loss": 0.0048, "num_tokens": 3848036.0, "reward": 0.75, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.459807276725769, "sampling/importance_sampling_ratio/mean": 0.9995647668838501, "sampling/importance_sampling_ratio/min": 0.6950550675392151, "sampling/sampling_logp_difference/max": 0.37830448150634766, "sampling/sampling_logp_difference/mean": 0.015423442237079144, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 433.375, "completions/mean_terminated_length": 433.375, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "entropy": 0.4727870225906372, "epoch": 0.11642156862745098, "frac_reward_zero_std": 0.5, "grad_norm": 0.5942498697844701, "kl": 0.0003120132605545223, "learning_rate": 3.821138211382114e-07, "loss": 0.0398, "num_tokens": 3894764.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.3714686632156372, "sampling/importance_sampling_ratio/mean": 0.9999316930770874, "sampling/importance_sampling_ratio/min": 0.604153037071228, "sampling/sampling_logp_difference/max": 0.5039277076721191, "sampling/sampling_logp_difference/mean": 0.013419291004538536, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1289.0, "completions/max_terminated_length": 1289.0, "completions/mean_length": 333.34375, "completions/mean_terminated_length": 333.34375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.4969383776187897, "epoch": 0.11764705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.9854722725405612, "kl": 0.0005100821726955473, "learning_rate": 3.861788617886179e-07, "loss": -0.0843, "num_tokens": 3937026.0, "reward": 0.59375, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6096097230911255, "sampling/importance_sampling_ratio/mean": 0.9995669722557068, "sampling/importance_sampling_ratio/min": 0.1647721529006958, "sampling/sampling_logp_difference/max": 1.8031916618347168, "sampling/sampling_logp_difference/mean": 0.01715863309800625, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 328.09375, "completions/mean_terminated_length": 328.09375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.4238840341567993, "epoch": 0.11887254901960784, "frac_reward_zero_std": 0.5, "grad_norm": 0.9105096859914918, "kl": 0.000476241169963032, "learning_rate": 3.902439024390244e-07, "loss": 0.0119, "num_tokens": 3974920.0, "reward": 0.71875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.5277611017227173, "sampling/importance_sampling_ratio/mean": 1.0000873804092407, "sampling/importance_sampling_ratio/min": 0.6375994086265564, "sampling/sampling_logp_difference/max": 0.450045108795166, "sampling/sampling_logp_difference/mean": 0.015075193718075752, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 985.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 379.59375, "completions/mean_terminated_length": 379.59375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.5408264398574829, "epoch": 0.12009803921568628, "frac_reward_zero_std": 0.25, "grad_norm": 1.0576467869702788, "kl": 0.0005210047820582986, "learning_rate": 3.9430894308943087e-07, "loss": -0.0045, "num_tokens": 4017710.0, "reward": 0.875, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.691725730895996, "sampling/importance_sampling_ratio/mean": 1.0005857944488525, "sampling/importance_sampling_ratio/min": 0.5060588121414185, "sampling/sampling_logp_difference/max": 0.6811022758483887, "sampling/sampling_logp_difference/mean": 0.0175236314535141, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 291.640625, "completions/mean_terminated_length": 291.640625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.4811084270477295, "epoch": 0.1213235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.7168268080272767, "kl": 0.0004572126199491322, "learning_rate": 3.9837398373983736e-07, "loss": 0.0111, "num_tokens": 4053127.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5281450748443604, "sampling/importance_sampling_ratio/mean": 1.0007331371307373, "sampling/importance_sampling_ratio/min": 0.6074681878089905, "sampling/sampling_logp_difference/max": 0.4984554946422577, "sampling/sampling_logp_difference/mean": 0.01668773777782917, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 712.0, "completions/max_terminated_length": 712.0, "completions/mean_length": 383.25, "completions/mean_terminated_length": 383.25, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.4761793613433838, "epoch": 0.12254901960784313, "frac_reward_zero_std": 0.25, "grad_norm": 0.9346101851377043, "kl": 0.0004383840714581311, "learning_rate": 4.024390243902439e-07, "loss": 0.0304, "num_tokens": 4096759.0, "reward": 0.78125, "reward_std": 0.519389271736145, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.573765516281128, "sampling/importance_sampling_ratio/mean": 0.9998458623886108, "sampling/importance_sampling_ratio/min": 0.6471254229545593, "sampling/sampling_logp_difference/max": 0.45347118377685547, "sampling/sampling_logp_difference/mean": 0.015609953552484512, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 351.015625, "completions/mean_terminated_length": 351.015625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.5687766075134277, "epoch": 0.12377450980392157, "frac_reward_zero_std": 0.25, "grad_norm": 1.1327231453324602, "kl": 0.0004234556690789759, "learning_rate": 4.065040650406504e-07, "loss": 0.0362, "num_tokens": 4138104.0, "reward": 0.0, "reward_std": 0.644389271736145, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5435638427734375, "sampling/importance_sampling_ratio/mean": 1.0004775524139404, "sampling/importance_sampling_ratio/min": 0.6491868495941162, "sampling/sampling_logp_difference/max": 0.4340939521789551, "sampling/sampling_logp_difference/mean": 0.01723666675388813, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 773.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 407.25, "completions/mean_terminated_length": 407.25, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.3959760367870331, "epoch": 0.125, "frac_reward_zero_std": 0.5, "grad_norm": 0.79653086684939, "kl": 0.00042028934694826603, "learning_rate": 4.105691056910569e-07, "loss": 0.0154, "num_tokens": 4183256.0, "reward": 0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4827423095703125, "sampling/importance_sampling_ratio/mean": 1.0002237558364868, "sampling/importance_sampling_ratio/min": 0.6392970681190491, "sampling/sampling_logp_difference/max": 0.4473860263824463, "sampling/sampling_logp_difference/mean": 0.01372037548571825, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 957.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 325.734375, "completions/mean_terminated_length": 325.734375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.40874558687210083, "epoch": 0.12622549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.002941355090388281, "kl": 0.0006044954643584788, "learning_rate": 4.146341463414634e-07, "loss": 0.0, "num_tokens": 4224503.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8682637214660645, "sampling/importance_sampling_ratio/mean": 1.000178337097168, "sampling/importance_sampling_ratio/min": 0.6096051335334778, "sampling/sampling_logp_difference/max": 0.6250095367431641, "sampling/sampling_logp_difference/mean": 0.01493051741272211, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 317.53125, "completions/mean_terminated_length": 317.53125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.5234681367874146, "epoch": 0.12745098039215685, "frac_reward_zero_std": 0.5, "grad_norm": 0.837283040976194, "kl": 0.0004973389441147447, "learning_rate": 4.186991869918699e-07, "loss": 0.0268, "num_tokens": 4263497.0, "reward": 0.5625, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6088883876800537, "sampling/importance_sampling_ratio/mean": 1.0002853870391846, "sampling/importance_sampling_ratio/min": 0.6319010257720947, "sampling/sampling_logp_difference/max": 0.4755434989929199, "sampling/sampling_logp_difference/mean": 0.01697307638823986, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 846.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 409.203125, "completions/mean_terminated_length": 409.203125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.37502801418304443, "epoch": 0.12867647058823528, "frac_reward_zero_std": 1.0, "grad_norm": 0.002686765768428253, "kl": 0.00037332598003558815, "learning_rate": 4.2276422764227643e-07, "loss": 0.0, "num_tokens": 4309062.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999983549118042, "sampling/importance_sampling_ratio/min": 0.47367581725120544, "sampling/sampling_logp_difference/max": 0.7703673839569092, "sampling/sampling_logp_difference/mean": 0.012892134487628937, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 348.75, "completions/mean_terminated_length": 348.75, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.41564440727233887, "epoch": 0.12990196078431374, "frac_reward_zero_std": 0.75, "grad_norm": 0.6799753331293725, "kl": 0.0005036736256442964, "learning_rate": 4.268292682926829e-07, "loss": 0.0344, "num_tokens": 4346838.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5580205917358398, "sampling/importance_sampling_ratio/mean": 0.9999070763587952, "sampling/importance_sampling_ratio/min": 0.5566685199737549, "sampling/sampling_logp_difference/max": 0.5857852697372437, "sampling/sampling_logp_difference/mean": 0.014538628980517387, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 396.734375, "completions/mean_terminated_length": 396.734375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.363589882850647, "epoch": 0.13112745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 0.5407980673506114, "kl": 0.0003916417481377721, "learning_rate": 4.308943089430894e-07, "loss": 0.0249, "num_tokens": 4397525.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.60197114944458, "sampling/importance_sampling_ratio/mean": 1.0000802278518677, "sampling/importance_sampling_ratio/min": 0.5501912236213684, "sampling/sampling_logp_difference/max": 0.5974893569946289, "sampling/sampling_logp_difference/mean": 0.012200066819787025, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 346.46875, "completions/mean_terminated_length": 346.46875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.44900068640708923, "epoch": 0.1323529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.7874849968288857, "kl": 0.0004554901097435504, "learning_rate": 4.349593495934959e-07, "loss": -0.0203, "num_tokens": 4445811.0, "reward": 0.46875, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4003722667694092, "sampling/importance_sampling_ratio/mean": 0.9998854398727417, "sampling/importance_sampling_ratio/min": 0.6057974100112915, "sampling/sampling_logp_difference/max": 0.5012097358703613, "sampling/sampling_logp_difference/mean": 0.01517036184668541, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1487.0, "completions/max_terminated_length": 1487.0, "completions/mean_length": 580.25, "completions/mean_terminated_length": 580.25, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "entropy": 0.555091917514801, "epoch": 0.13357843137254902, "frac_reward_zero_std": 0.0, "grad_norm": 0.7831411470622467, "kl": 0.0003628029953688383, "learning_rate": 4.390243902439024e-07, "loss": -0.008, "num_tokens": 4507219.0, "reward": 0.6875, "reward_std": 0.690913200378418, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.6208237409591675, "sampling/importance_sampling_ratio/mean": 0.9999600052833557, "sampling/importance_sampling_ratio/min": 0.2968596816062927, "sampling/sampling_logp_difference/max": 1.2144956588745117, "sampling/sampling_logp_difference/mean": 0.015868540853261948, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 336.015625, "completions/mean_terminated_length": 336.015625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.37616395950317383, "epoch": 0.13480392156862744, "frac_reward_zero_std": 0.75, "grad_norm": 0.5467209225933788, "kl": 0.0004513614985626191, "learning_rate": 4.4308943089430896e-07, "loss": -0.0059, "num_tokens": 4555876.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.7915633916854858, "sampling/importance_sampling_ratio/mean": 0.99953293800354, "sampling/importance_sampling_ratio/min": 0.3487522006034851, "sampling/sampling_logp_difference/max": 1.0533936023712158, "sampling/sampling_logp_difference/mean": 0.013475427404046059, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 326.234375, "completions/mean_terminated_length": 326.234375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.31936609745025635, "epoch": 0.13602941176470587, "frac_reward_zero_std": 0.75, "grad_norm": 0.5554328968660462, "kl": 0.00042767199920490384, "learning_rate": 4.471544715447154e-07, "loss": -0.0669, "num_tokens": 4591347.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.534935712814331, "sampling/importance_sampling_ratio/mean": 1.0001295804977417, "sampling/importance_sampling_ratio/min": 0.612177848815918, "sampling/sampling_logp_difference/max": 0.49073243141174316, "sampling/sampling_logp_difference/mean": 0.011050334200263023, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1154.0, "completions/max_terminated_length": 1154.0, "completions/mean_length": 483.484375, "completions/mean_terminated_length": 483.484375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "entropy": 0.41658058762550354, "epoch": 0.13725490196078433, "frac_reward_zero_std": 0.25, "grad_norm": 0.9777666719215118, "kl": 0.000341822043992579, "learning_rate": 4.5121951219512194e-07, "loss": 0.1058, "num_tokens": 4638354.0, "reward": 0.4375, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6007341146469116, "sampling/importance_sampling_ratio/mean": 1.000509262084961, "sampling/importance_sampling_ratio/min": 0.6229265332221985, "sampling/sampling_logp_difference/max": 0.4733266830444336, "sampling/sampling_logp_difference/mean": 0.012643326073884964, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 894.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 407.65625, "completions/mean_terminated_length": 407.65625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.37405529618263245, "epoch": 0.13848039215686275, "frac_reward_zero_std": 0.5, "grad_norm": 0.7207184522763672, "kl": 0.0003680842637550086, "learning_rate": 4.5528455284552844e-07, "loss": 0.0193, "num_tokens": 4686380.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.7516556978225708, "sampling/importance_sampling_ratio/mean": 0.99992436170578, "sampling/importance_sampling_ratio/min": 0.5651581883430481, "sampling/sampling_logp_difference/max": 0.5706496238708496, "sampling/sampling_logp_difference/mean": 0.01251281425356865, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 306.875, "completions/mean_terminated_length": 306.875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.4404970407485962, "epoch": 0.13970588235294118, "frac_reward_zero_std": 0.75, "grad_norm": 0.8257958485365574, "kl": 0.0004957873024977744, "learning_rate": 4.5934959349593493e-07, "loss": -0.0002, "num_tokens": 4723316.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001955032348633, "sampling/importance_sampling_ratio/min": 0.7149559855461121, "sampling/sampling_logp_difference/max": 1.0273219347000122, "sampling/sampling_logp_difference/mean": 0.014298751950263977, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 275.78125, "completions/mean_terminated_length": 275.78125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.34706220030784607, "epoch": 0.1409313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.004997686214165879, "kl": 0.0007157026557251811, "learning_rate": 4.634146341463415e-07, "loss": 0.0, "num_tokens": 4756438.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4785610437393188, "sampling/importance_sampling_ratio/mean": 1.000439167022705, "sampling/importance_sampling_ratio/min": 0.41614991426467896, "sampling/sampling_logp_difference/max": 0.8767096996307373, "sampling/sampling_logp_difference/mean": 0.014026264660060406, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 299.578125, "completions/mean_terminated_length": 299.578125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.4293009638786316, "epoch": 0.14215686274509803, "frac_reward_zero_std": 0.75, "grad_norm": 0.7438147224906381, "kl": 0.000517491134814918, "learning_rate": 4.674796747967479e-07, "loss": 0.0271, "num_tokens": 4798827.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5071347951889038, "sampling/importance_sampling_ratio/mean": 1.0001477003097534, "sampling/importance_sampling_ratio/min": 0.6571649312973022, "sampling/sampling_logp_difference/max": 0.4198201894760132, "sampling/sampling_logp_difference/mean": 0.015242652036249638, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 479.9375, "completions/mean_terminated_length": 479.9375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.33730798959732056, "epoch": 0.14338235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 0.643856763955946, "kl": 0.0003845912287943065, "learning_rate": 4.7154471544715447e-07, "loss": -0.0161, "num_tokens": 4848295.0, "reward": 0.09375, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.6249107122421265, "sampling/importance_sampling_ratio/mean": 1.0000396966934204, "sampling/importance_sampling_ratio/min": 0.6364777684211731, "sampling/sampling_logp_difference/max": 0.48545289039611816, "sampling/sampling_logp_difference/mean": 0.010646654292941093, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 375.484375, "completions/mean_terminated_length": 375.484375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.34789761900901794, "epoch": 0.14460784313725492, "frac_reward_zero_std": 0.75, "grad_norm": 0.7276828911290584, "kl": 0.000434291287092492, "learning_rate": 4.756097560975609e-07, "loss": 0.0595, "num_tokens": 4891206.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.2788143157958984, "sampling/importance_sampling_ratio/mean": 1.000098705291748, "sampling/importance_sampling_ratio/min": 0.4455534517765045, "sampling/sampling_logp_difference/max": 0.8084380626678467, "sampling/sampling_logp_difference/mean": 0.011729598045349121, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 790.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 335.21875, "completions/mean_terminated_length": 335.21875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.6202929019927979, "epoch": 0.14583333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 1.3274008414113523, "kl": 0.0006457231938838959, "learning_rate": 4.796747967479675e-07, "loss": -0.0663, "num_tokens": 4926852.0, "reward": -0.1875, "reward_std": 0.974128007888794, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.414965271949768, "sampling/importance_sampling_ratio/mean": 1.0001801252365112, "sampling/importance_sampling_ratio/min": 0.6064948439598083, "sampling/sampling_logp_difference/max": 0.5000591278076172, "sampling/sampling_logp_difference/mean": 0.018467864021658897, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1132.0, "completions/max_terminated_length": 1132.0, "completions/mean_length": 346.046875, "completions/mean_terminated_length": 346.046875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.4757922291755676, "epoch": 0.14705882352941177, "frac_reward_zero_std": 0.25, "grad_norm": 1.0853747440957902, "kl": 0.0008165095932781696, "learning_rate": 4.83739837398374e-07, "loss": 0.0155, "num_tokens": 4964487.0, "reward": 0.3125, "reward_std": 0.6813369989395142, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.5744314193725586, "sampling/importance_sampling_ratio/mean": 1.0003631114959717, "sampling/importance_sampling_ratio/min": 0.5865985751152039, "sampling/sampling_logp_difference/max": 0.5334146022796631, "sampling/sampling_logp_difference/mean": 0.016278965398669243, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 797.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 486.5, "completions/mean_terminated_length": 486.5, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.5085945129394531, "epoch": 0.1482843137254902, "frac_reward_zero_std": 0.5, "grad_norm": 0.6485572472659795, "kl": 0.000499403802677989, "learning_rate": 4.878048780487804e-07, "loss": 0.0277, "num_tokens": 5018807.0, "reward": 0.53125, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6675312519073486, "sampling/importance_sampling_ratio/mean": 1.0000520944595337, "sampling/importance_sampling_ratio/min": 0.5893089175224304, "sampling/sampling_logp_difference/max": 0.5288047790527344, "sampling/sampling_logp_difference/mean": 0.015509932301938534, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 364.5625, "completions/mean_terminated_length": 364.5625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.5436242818832397, "epoch": 0.14950980392156862, "frac_reward_zero_std": 0.5, "grad_norm": 1.0089439052742057, "kl": 0.0005406868876889348, "learning_rate": 4.91869918699187e-07, "loss": -0.0575, "num_tokens": 5062299.0, "reward": 0.3125, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.665187954902649, "sampling/importance_sampling_ratio/mean": 0.9999135136604309, "sampling/importance_sampling_ratio/min": 0.6882880330085754, "sampling/sampling_logp_difference/max": 0.5099380016326904, "sampling/sampling_logp_difference/mean": 0.016519315540790558, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 282.46875, "completions/mean_terminated_length": 282.46875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.4887445569038391, "epoch": 0.15073529411764705, "frac_reward_zero_std": 0.25, "grad_norm": 1.227020278714591, "kl": 0.0009422621806152165, "learning_rate": 4.959349593495934e-07, "loss": -0.087, "num_tokens": 5094217.0, "reward": 0.5, "reward_std": 0.4973389506340027, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4228546619415283, "sampling/importance_sampling_ratio/mean": 0.9999914765357971, "sampling/importance_sampling_ratio/min": 0.6293879151344299, "sampling/sampling_logp_difference/max": 0.46300745010375977, "sampling/sampling_logp_difference/mean": 0.016937926411628723, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2474.0, "completions/max_terminated_length": 2474.0, "completions/mean_length": 693.71875, "completions/mean_terminated_length": 693.71875, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "entropy": 0.3880191743373871, "epoch": 0.15196078431372548, "frac_reward_zero_std": 0.5, "grad_norm": 0.5644859989198875, "kl": 0.00046928043593652546, "learning_rate": 5e-07, "loss": -0.1017, "num_tokens": 5166775.0, "reward": 0.34375, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.818074345588684, "sampling/importance_sampling_ratio/mean": 0.9999571442604065, "sampling/importance_sampling_ratio/min": 0.2569502294063568, "sampling/sampling_logp_difference/max": 1.358872890472412, "sampling/sampling_logp_difference/mean": 0.011630144901573658, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 908.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 298.296875, "completions/mean_terminated_length": 298.296875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.38100963830947876, "epoch": 0.15318627450980393, "frac_reward_zero_std": 0.5, "grad_norm": 0.8584275864965829, "kl": 0.0009190543787553906, "learning_rate": 5.040650406504064e-07, "loss": 0.0753, "num_tokens": 5205050.0, "reward": 0.6875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.6088923215866089, "sampling/importance_sampling_ratio/mean": 0.9999606609344482, "sampling/importance_sampling_ratio/min": 0.6371169090270996, "sampling/sampling_logp_difference/max": 0.47554588317871094, "sampling/sampling_logp_difference/mean": 0.013589650392532349, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 437.828125, "completions/mean_terminated_length": 437.828125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.4290671944618225, "epoch": 0.15441176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.7356359187180855, "kl": 0.0006445399485528469, "learning_rate": 5.081300813008131e-07, "loss": -0.0169, "num_tokens": 5250335.0, "reward": 0.6875, "reward_std": 0.42898139357566833, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.447912573814392, "sampling/importance_sampling_ratio/mean": 0.9997588396072388, "sampling/importance_sampling_ratio/min": 0.698763906955719, "sampling/sampling_logp_difference/max": 0.37012290954589844, "sampling/sampling_logp_difference/mean": 0.011977942660450935, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 328.015625, "completions/mean_terminated_length": 328.015625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.46991682052612305, "epoch": 0.1556372549019608, "frac_reward_zero_std": 0.5, "grad_norm": 0.9026036696179089, "kl": 0.0009409356280229986, "learning_rate": 5.121951219512195e-07, "loss": -0.0063, "num_tokens": 5286912.0, "reward": 0.15625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.5467315912246704, "sampling/importance_sampling_ratio/mean": 1.000398874282837, "sampling/importance_sampling_ratio/min": 0.6602442860603333, "sampling/sampling_logp_difference/max": 0.4361441135406494, "sampling/sampling_logp_difference/mean": 0.01578994281589985, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1098.0, "completions/max_terminated_length": 1098.0, "completions/mean_length": 389.0, "completions/mean_terminated_length": 389.0, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.46830958127975464, "epoch": 0.1568627450980392, "frac_reward_zero_std": 0.5, "grad_norm": 0.7035588728377132, "kl": 0.001022371114231646, "learning_rate": 5.16260162601626e-07, "loss": 0.0079, "num_tokens": 5328304.0, "reward": 0.71875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.3418762683868408, "sampling/importance_sampling_ratio/mean": 1.0002293586730957, "sampling/importance_sampling_ratio/min": 0.5929670929908752, "sampling/sampling_logp_difference/max": 0.5226163864135742, "sampling/sampling_logp_difference/mean": 0.015201126225292683, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1123.0, "completions/max_terminated_length": 1123.0, "completions/mean_length": 501.453125, "completions/mean_terminated_length": 501.453125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.41464269161224365, "epoch": 0.15808823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.6347591314993487, "kl": 0.0008197766728699207, "learning_rate": 5.203252032520325e-07, "loss": 0.0307, "num_tokens": 5378941.0, "reward": 0.46875, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.620728611946106, "sampling/importance_sampling_ratio/mean": 0.9999899864196777, "sampling/importance_sampling_ratio/min": 0.5211734771728516, "sampling/sampling_logp_difference/max": 0.65167236328125, "sampling/sampling_logp_difference/mean": 0.013104362413287163, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 821.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 375.484375, "completions/mean_terminated_length": 375.484375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.497396320104599, "epoch": 0.15931372549019607, "frac_reward_zero_std": 0.75, "grad_norm": 0.5940186318705434, "kl": 0.0009189542615786195, "learning_rate": 5.24390243902439e-07, "loss": -0.0213, "num_tokens": 5419628.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002679824829102, "sampling/importance_sampling_ratio/min": 0.6202808618545532, "sampling/sampling_logp_difference/max": 0.7553448677062988, "sampling/sampling_logp_difference/mean": 0.014913068152964115, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 251.265625, "completions/mean_terminated_length": 251.265625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.38112348318099976, "epoch": 0.16053921568627452, "frac_reward_zero_std": 1.0, "grad_norm": 0.009335823564268089, "kl": 0.001319713657721877, "learning_rate": 5.284552845528455e-07, "loss": 0.0, "num_tokens": 5454141.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4742951393127441, "sampling/importance_sampling_ratio/mean": 0.9996437430381775, "sampling/importance_sampling_ratio/min": 0.609596312046051, "sampling/sampling_logp_difference/max": 0.49495840072631836, "sampling/sampling_logp_difference/mean": 0.014325141906738281, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 309.59375, "completions/mean_terminated_length": 309.59375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.3670653998851776, "epoch": 0.16176470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.010008721427208173, "kl": 0.0013580196537077427, "learning_rate": 5.325203252032519e-07, "loss": 0.0, "num_tokens": 5490931.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4298441410064697, "sampling/importance_sampling_ratio/mean": 1.000504493713379, "sampling/importance_sampling_ratio/min": 0.6171438097953796, "sampling/sampling_logp_difference/max": 0.4826531410217285, "sampling/sampling_logp_difference/mean": 0.013864768669009209, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1091.0, "completions/max_terminated_length": 1091.0, "completions/mean_length": 402.484375, "completions/mean_terminated_length": 402.484375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.46136894822120667, "epoch": 0.16299019607843138, "frac_reward_zero_std": 0.75, "grad_norm": 0.681492221381314, "kl": 0.0015797644155099988, "learning_rate": 5.365853658536586e-07, "loss": 0.0296, "num_tokens": 5536306.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997262954711914, "sampling/importance_sampling_ratio/min": 0.6161852478981018, "sampling/sampling_logp_difference/max": 0.7263710498809814, "sampling/sampling_logp_difference/mean": 0.01595831662416458, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 378.71875, "completions/mean_terminated_length": 378.71875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.4749453067779541, "epoch": 0.1642156862745098, "frac_reward_zero_std": 0.25, "grad_norm": 0.97378606071848, "kl": 0.0012213042937219143, "learning_rate": 5.40650406504065e-07, "loss": 0.0339, "num_tokens": 5576624.0, "reward": 0.875, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5467623472213745, "sampling/importance_sampling_ratio/mean": 1.0003010034561157, "sampling/importance_sampling_ratio/min": 0.5364373922348022, "sampling/sampling_logp_difference/max": 0.6228054165840149, "sampling/sampling_logp_difference/mean": 0.014230499044060707, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 353.25, "completions/mean_terminated_length": 353.25, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.5062936544418335, "epoch": 0.16544117647058823, "frac_reward_zero_std": 0.75, "grad_norm": 0.6135099516916491, "kl": 0.0017107664607465267, "learning_rate": 5.447154471544715e-07, "loss": -0.0208, "num_tokens": 5621456.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.520559310913086, "sampling/importance_sampling_ratio/mean": 0.9999039173126221, "sampling/importance_sampling_ratio/min": 0.5776877403259277, "sampling/sampling_logp_difference/max": 0.5487217903137207, "sampling/sampling_logp_difference/mean": 0.01628214120864868, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 437.96875, "completions/mean_terminated_length": 437.96875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.5432509183883667, "epoch": 0.16666666666666666, "frac_reward_zero_std": 0.75, "grad_norm": 0.558328288124103, "kl": 0.0013620015233755112, "learning_rate": 5.487804878048781e-07, "loss": 0.0104, "num_tokens": 5671742.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.527771234512329, "sampling/importance_sampling_ratio/mean": 0.9997594952583313, "sampling/importance_sampling_ratio/min": 0.46458950638771057, "sampling/sampling_logp_difference/max": 0.7666010856628418, "sampling/sampling_logp_difference/mean": 0.015825096517801285, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 432.859375, "completions/mean_terminated_length": 432.859375, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.3548095226287842, "epoch": 0.16789215686274508, "frac_reward_zero_std": 0.75, "grad_norm": 0.521045255789522, "kl": 0.001133988844230771, "learning_rate": 5.528455284552846e-07, "loss": -0.0611, "num_tokens": 5715989.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.31369948387146, "sampling/importance_sampling_ratio/mean": 0.999811589717865, "sampling/importance_sampling_ratio/min": 0.7104911804199219, "sampling/sampling_logp_difference/max": 0.3417987823486328, "sampling/sampling_logp_difference/mean": 0.011268092319369316, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 902.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 409.96875, "completions/mean_terminated_length": 409.96875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.4311014413833618, "epoch": 0.16911764705882354, "frac_reward_zero_std": 0.25, "grad_norm": 0.88717604105265, "kl": 0.0016709924675524235, "learning_rate": 5.56910569105691e-07, "loss": 0.0538, "num_tokens": 5759491.0, "reward": 0.25, "reward_std": 0.5765564441680908, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.6012189388275146, "sampling/importance_sampling_ratio/mean": 0.9999065399169922, "sampling/importance_sampling_ratio/min": 0.32545751333236694, "sampling/sampling_logp_difference/max": 1.122523307800293, "sampling/sampling_logp_difference/mean": 0.013206014409661293, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1095.0, "completions/max_terminated_length": 1095.0, "completions/mean_length": 460.578125, "completions/mean_terminated_length": 460.578125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.554047167301178, "epoch": 0.17034313725490197, "frac_reward_zero_std": 0.5, "grad_norm": 0.6884640860920749, "kl": 0.0018319606315344572, "learning_rate": 5.609756097560975e-07, "loss": -0.0338, "num_tokens": 5806216.0, "reward": -0.46875, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4692245721817017, "sampling/importance_sampling_ratio/mean": 0.999860942363739, "sampling/importance_sampling_ratio/min": 0.609712541103363, "sampling/sampling_logp_difference/max": 0.4947676658630371, "sampling/sampling_logp_difference/mean": 0.01586918905377388, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 345.8125, "completions/mean_terminated_length": 345.8125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.4406856298446655, "epoch": 0.1715686274509804, "frac_reward_zero_std": 0.5, "grad_norm": 0.8886855101899511, "kl": 0.0023436183109879494, "learning_rate": 5.650406504065041e-07, "loss": 0.0001, "num_tokens": 5848156.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5316803455352783, "sampling/importance_sampling_ratio/mean": 0.9997419118881226, "sampling/importance_sampling_ratio/min": 0.5516175031661987, "sampling/sampling_logp_difference/max": 0.594900369644165, "sampling/sampling_logp_difference/mean": 0.015718860551714897, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 327.03125, "completions/mean_terminated_length": 327.03125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.32813170552253723, "epoch": 0.17279411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.012487448666231448, "kl": 0.002653113566339016, "learning_rate": 5.691056910569105e-07, "loss": 0.0, "num_tokens": 5884062.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5744550228118896, "sampling/importance_sampling_ratio/mean": 1.000216007232666, "sampling/importance_sampling_ratio/min": 0.4735444486141205, "sampling/sampling_logp_difference/max": 0.7475094795227051, "sampling/sampling_logp_difference/mean": 0.011935854330658913, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 985.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 354.375, "completions/mean_terminated_length": 354.375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.4214673340320587, "epoch": 0.17401960784313725, "frac_reward_zero_std": 0.75, "grad_norm": 0.6969989956407171, "kl": 0.002823120914399624, "learning_rate": 5.73170731707317e-07, "loss": 0.0239, "num_tokens": 5924454.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.602253794670105, "sampling/importance_sampling_ratio/mean": 0.9999052286148071, "sampling/importance_sampling_ratio/min": 0.6005186438560486, "sampling/sampling_logp_difference/max": 0.5099616050720215, "sampling/sampling_logp_difference/mean": 0.015614859759807587, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 306.765625, "completions/mean_terminated_length": 306.765625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.4776228666305542, "epoch": 0.17524509803921567, "frac_reward_zero_std": 0.75, "grad_norm": 0.7895264518497722, "kl": 0.003243096871301532, "learning_rate": 5.772357723577236e-07, "loss": -0.0042, "num_tokens": 5958759.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.37553071975708, "sampling/importance_sampling_ratio/mean": 0.999626636505127, "sampling/importance_sampling_ratio/min": 0.22571147978305817, "sampling/sampling_logp_difference/max": 1.4884977340698242, "sampling/sampling_logp_difference/mean": 0.016000976786017418, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 476.28125, "completions/mean_terminated_length": 476.28125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.44130784273147583, "epoch": 0.17647058823529413, "frac_reward_zero_std": 0.5, "grad_norm": 0.921630284226583, "kl": 0.0020088457968086004, "learning_rate": 5.813008130081301e-07, "loss": 0.0001, "num_tokens": 6016249.0, "reward": 0.3125, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.599674940109253, "sampling/importance_sampling_ratio/mean": 0.9997872114181519, "sampling/importance_sampling_ratio/min": 0.6262628436088562, "sampling/sampling_logp_difference/max": 0.4698004722595215, "sampling/sampling_logp_difference/mean": 0.013191440142691135, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 907.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 435.453125, "completions/mean_terminated_length": 435.453125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.43111541867256165, "epoch": 0.17769607843137256, "frac_reward_zero_std": 0.5, "grad_norm": 0.7645964763850379, "kl": 0.0026750946417450905, "learning_rate": 5.853658536585365e-07, "loss": 0.0194, "num_tokens": 6061558.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.3216941356658936, "sampling/importance_sampling_ratio/mean": 0.9996474981307983, "sampling/importance_sampling_ratio/min": 0.6563990712165833, "sampling/sampling_logp_difference/max": 0.4209862947463989, "sampling/sampling_logp_difference/mean": 0.013443708419799805, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1336.0, "completions/max_terminated_length": 1336.0, "completions/mean_length": 609.53125, "completions/mean_terminated_length": 609.53125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.3142632246017456, "epoch": 0.17892156862745098, "frac_reward_zero_std": 0.5, "grad_norm": 0.49562702250138874, "kl": 0.0018147325608879328, "learning_rate": 5.894308943089431e-07, "loss": 0.0802, "num_tokens": 6123528.0, "reward": 0.34375, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.44566011428833, "sampling/importance_sampling_ratio/mean": 1.0002951622009277, "sampling/importance_sampling_ratio/min": 0.620877742767334, "sampling/sampling_logp_difference/max": 0.476621150970459, "sampling/sampling_logp_difference/mean": 0.010062876157462597, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 326.859375, "completions/mean_terminated_length": 326.859375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.4390563368797302, "epoch": 0.1801470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.6808299486524304, "kl": 0.003742450848221779, "learning_rate": 5.934959349593496e-07, "loss": 0.0077, "num_tokens": 6160015.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5281697511672974, "sampling/importance_sampling_ratio/mean": 0.9997563362121582, "sampling/importance_sampling_ratio/min": 0.6894895434379578, "sampling/sampling_logp_difference/max": 0.42407071590423584, "sampling/sampling_logp_difference/mean": 0.014493118971586227, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1156.0, "completions/max_terminated_length": 1156.0, "completions/mean_length": 385.40625, "completions/mean_terminated_length": 385.40625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.5161522626876831, "epoch": 0.18137254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 0.5106203720184216, "kl": 0.004347861744463444, "learning_rate": 5.97560975609756e-07, "loss": -0.0329, "num_tokens": 6203337.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4270079135894775, "sampling/importance_sampling_ratio/mean": 1.0003323554992676, "sampling/importance_sampling_ratio/min": 0.5371721982955933, "sampling/sampling_logp_difference/max": 0.621436595916748, "sampling/sampling_logp_difference/mean": 0.015818962827324867, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1231.0, "completions/max_terminated_length": 1231.0, "completions/mean_length": 358.5, "completions/mean_terminated_length": 358.5, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.394218772649765, "epoch": 0.18259803921568626, "frac_reward_zero_std": 0.75, "grad_norm": 0.621924338002284, "kl": 0.004120059311389923, "learning_rate": 6.016260162601626e-07, "loss": 0.0095, "num_tokens": 6246009.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3785911798477173, "sampling/importance_sampling_ratio/mean": 0.9999122619628906, "sampling/importance_sampling_ratio/min": 0.663678765296936, "sampling/sampling_logp_difference/max": 0.40995705127716064, "sampling/sampling_logp_difference/mean": 0.013108538463711739, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1173.0, "completions/max_terminated_length": 1173.0, "completions/mean_length": 406.453125, "completions/mean_terminated_length": 406.453125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.44276145100593567, "epoch": 0.18382352941176472, "frac_reward_zero_std": 0.5, "grad_norm": 0.9166536523033724, "kl": 0.0038746348582208157, "learning_rate": 6.056910569105691e-07, "loss": -0.0968, "num_tokens": 6286822.0, "reward": -0.375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": -0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6281954050064087, "sampling/importance_sampling_ratio/mean": 0.9995982050895691, "sampling/importance_sampling_ratio/min": 0.614771842956543, "sampling/sampling_logp_difference/max": 0.4874722957611084, "sampling/sampling_logp_difference/mean": 0.014399973675608635, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 476.625, "completions/mean_terminated_length": 476.625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.5333432555198669, "epoch": 0.18504901960784315, "frac_reward_zero_std": 0.25, "grad_norm": 0.8092827243276652, "kl": 0.00326154800131917, "learning_rate": 6.097560975609756e-07, "loss": 0.019, "num_tokens": 6339726.0, "reward": 0.5, "reward_std": 0.6663130521774292, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4753891229629517, "sampling/importance_sampling_ratio/mean": 0.999840497970581, "sampling/importance_sampling_ratio/min": 0.6151829361915588, "sampling/sampling_logp_difference/max": 0.48583555221557617, "sampling/sampling_logp_difference/mean": 0.015032002702355385, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1275.0, "completions/max_terminated_length": 1275.0, "completions/mean_length": 392.90625, "completions/mean_terminated_length": 392.90625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.38361018896102905, "epoch": 0.18627450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.016004127180490924, "kl": 0.004751845262944698, "learning_rate": 6.13821138211382e-07, "loss": 0.0, "num_tokens": 6385368.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6353745460510254, "sampling/importance_sampling_ratio/mean": 1.0002700090408325, "sampling/importance_sampling_ratio/min": 0.5933288335800171, "sampling/sampling_logp_difference/max": 0.5220065116882324, "sampling/sampling_logp_difference/mean": 0.01408070046454668, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 378.59375, "completions/mean_terminated_length": 378.59375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.3599497377872467, "epoch": 0.1875, "frac_reward_zero_std": 0.75, "grad_norm": 0.5571390177177418, "kl": 0.004533540457487106, "learning_rate": 6.178861788617887e-07, "loss": -0.0176, "num_tokens": 6426238.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.3947250843048096, "sampling/importance_sampling_ratio/mean": 1.000086784362793, "sampling/importance_sampling_ratio/min": 0.6938565969467163, "sampling/sampling_logp_difference/max": 0.3654899597167969, "sampling/sampling_logp_difference/mean": 0.011803986504673958, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1592.0, "completions/max_terminated_length": 1592.0, "completions/mean_length": 507.765625, "completions/mean_terminated_length": 507.765625, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "entropy": 0.4648657739162445, "epoch": 0.18872549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 0.6644897599417385, "kl": 0.0035571714397519827, "learning_rate": 6.219512195121951e-07, "loss": -0.0796, "num_tokens": 6478367.0, "reward": 0.09375, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.4491777420043945, "sampling/importance_sampling_ratio/mean": 1.0002162456512451, "sampling/importance_sampling_ratio/min": 0.4817696213722229, "sampling/sampling_logp_difference/max": 0.7302892208099365, "sampling/sampling_logp_difference/mean": 0.014617575332522392, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1230.0, "completions/max_terminated_length": 1230.0, "completions/mean_length": 481.328125, "completions/mean_terminated_length": 481.328125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.38059449195861816, "epoch": 0.18995098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 0.5123622744755361, "kl": 0.004011716693639755, "learning_rate": 6.260162601626016e-07, "loss": -0.0038, "num_tokens": 6525268.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.5744246244430542, "sampling/importance_sampling_ratio/mean": 1.0001606941223145, "sampling/importance_sampling_ratio/min": 0.5797883868217468, "sampling/sampling_logp_difference/max": 0.5450921058654785, "sampling/sampling_logp_difference/mean": 0.012282188050448895, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1081.0, "completions/max_terminated_length": 1081.0, "completions/mean_length": 426.796875, "completions/mean_terminated_length": 426.796875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.3004455864429474, "epoch": 0.19117647058823528, "frac_reward_zero_std": 1.0, "grad_norm": 0.014081826418978417, "kl": 0.0039043508004397154, "learning_rate": 6.300813008130081e-07, "loss": 0.0, "num_tokens": 6569415.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6181378364562988, "sampling/importance_sampling_ratio/mean": 0.999901533126831, "sampling/importance_sampling_ratio/min": 0.5946979522705078, "sampling/sampling_logp_difference/max": 0.5197016000747681, "sampling/sampling_logp_difference/mean": 0.010305074043571949, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 907.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 398.359375, "completions/mean_terminated_length": 398.359375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.3335036635398865, "epoch": 0.19240196078431374, "frac_reward_zero_std": 1.0, "grad_norm": 0.01695887859366382, "kl": 0.004330042749643326, "learning_rate": 6.341463414634146e-07, "loss": 0.0, "num_tokens": 6615246.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5872254371643066, "sampling/importance_sampling_ratio/mean": 1.000267744064331, "sampling/importance_sampling_ratio/min": 0.6254510879516602, "sampling/sampling_logp_difference/max": 0.4692821502685547, "sampling/sampling_logp_difference/mean": 0.011573560535907745, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 301.296875, "completions/mean_terminated_length": 301.296875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.44272011518478394, "epoch": 0.19362745098039216, "frac_reward_zero_std": 0.5, "grad_norm": 1.0213451821380761, "kl": 0.005657391157001257, "learning_rate": 6.382113821138211e-07, "loss": -0.0455, "num_tokens": 6649233.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6093683242797852, "sampling/importance_sampling_ratio/mean": 1.0000816583633423, "sampling/importance_sampling_ratio/min": 0.4897538423538208, "sampling/sampling_logp_difference/max": 0.7138524055480957, "sampling/sampling_logp_difference/mean": 0.015922019258141518, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 365.71875, "completions/mean_terminated_length": 365.71875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.4774950444698334, "epoch": 0.1948529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.7180089717230046, "kl": 0.0047995829954743385, "learning_rate": 6.422764227642276e-07, "loss": -0.0012, "num_tokens": 6688335.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5949105024337769, "sampling/importance_sampling_ratio/mean": 0.9995107650756836, "sampling/importance_sampling_ratio/min": 0.3610268235206604, "sampling/sampling_logp_difference/max": 1.0188030004501343, "sampling/sampling_logp_difference/mean": 0.016172613948583603, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 712.0, "completions/max_terminated_length": 712.0, "completions/mean_length": 405.390625, "completions/mean_terminated_length": 405.390625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.43090176582336426, "epoch": 0.19607843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.456602445008736, "kl": 0.0035547083243727684, "learning_rate": 6.463414634146342e-07, "loss": -0.0038, "num_tokens": 6733912.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.6478630304336548, "sampling/importance_sampling_ratio/mean": 1.00009286403656, "sampling/importance_sampling_ratio/min": 0.5305798053741455, "sampling/sampling_logp_difference/max": 0.6337850093841553, "sampling/sampling_logp_difference/mean": 0.012937285006046295, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 752.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 392.359375, "completions/mean_terminated_length": 392.359375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.44942134618759155, "epoch": 0.19730392156862744, "frac_reward_zero_std": 1.0, "grad_norm": 0.017260266529493525, "kl": 0.004692297428846359, "learning_rate": 6.504065040650406e-07, "loss": 0.0, "num_tokens": 6776127.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8066729307174683, "sampling/importance_sampling_ratio/mean": 0.999629557132721, "sampling/importance_sampling_ratio/min": 0.5684332847595215, "sampling/sampling_logp_difference/max": 0.591486930847168, "sampling/sampling_logp_difference/mean": 0.014672377146780491, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1189.0, "completions/max_terminated_length": 1189.0, "completions/mean_length": 471.484375, "completions/mean_terminated_length": 471.484375, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.47953277826309204, "epoch": 0.19852941176470587, "frac_reward_zero_std": 0.75, "grad_norm": 0.5091053895897155, "kl": 0.003861036617308855, "learning_rate": 6.544715447154471e-07, "loss": -0.066, "num_tokens": 6825022.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.4388420581817627, "sampling/importance_sampling_ratio/mean": 0.9998289346694946, "sampling/importance_sampling_ratio/min": 0.6015257239341736, "sampling/sampling_logp_difference/max": 0.5082859992980957, "sampling/sampling_logp_difference/mean": 0.014183681458234787, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1086.0, "completions/max_terminated_length": 1086.0, "completions/mean_length": 502.96875, "completions/mean_terminated_length": 502.96875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.3863132894039154, "epoch": 0.19975490196078433, "frac_reward_zero_std": 0.5, "grad_norm": 0.6929067144139683, "kl": 0.003628198988735676, "learning_rate": 6.585365853658536e-07, "loss": 0.0233, "num_tokens": 6874220.0, "reward": 0.6875, "reward_std": 0.42898139357566833, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.3802987337112427, "sampling/importance_sampling_ratio/mean": 0.9999516606330872, "sampling/importance_sampling_ratio/min": 0.6645122766494751, "sampling/sampling_logp_difference/max": 0.40870189666748047, "sampling/sampling_logp_difference/mean": 0.011898540891706944, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 487.609375, "completions/mean_terminated_length": 487.609375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.35286206007003784, "epoch": 0.20098039215686275, "frac_reward_zero_std": 0.5, "grad_norm": 0.6684175744686148, "kl": 0.0036187574733048677, "learning_rate": 6.626016260162602e-07, "loss": 0.019, "num_tokens": 6938147.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4058619737625122, "sampling/importance_sampling_ratio/mean": 0.9999487400054932, "sampling/importance_sampling_ratio/min": 0.6332491636276245, "sampling/sampling_logp_difference/max": 0.4568912982940674, "sampling/sampling_logp_difference/mean": 0.012093906290829182, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1677.0, "completions/max_terminated_length": 1677.0, "completions/mean_length": 540.984375, "completions/mean_terminated_length": 540.984375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.4097919762134552, "epoch": 0.20220588235294118, "frac_reward_zero_std": 0.75, "grad_norm": 0.3717741620357305, "kl": 0.0032151651103049517, "learning_rate": 6.666666666666666e-07, "loss": -0.0317, "num_tokens": 6990482.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.4748649597167969, "sampling/importance_sampling_ratio/mean": 0.9996954202651978, "sampling/importance_sampling_ratio/min": 0.6730663180351257, "sampling/sampling_logp_difference/max": 0.39591145515441895, "sampling/sampling_logp_difference/mean": 0.01284037809818983, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 277.609375, "completions/mean_terminated_length": 277.609375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.49535906314849854, "epoch": 0.2034313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.6392681691999575, "kl": 0.008185303770005703, "learning_rate": 6.707317073170731e-07, "loss": 0.0113, "num_tokens": 7023721.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3997917175292969, "sampling/importance_sampling_ratio/mean": 1.0001068115234375, "sampling/importance_sampling_ratio/min": 0.6889551877975464, "sampling/sampling_logp_difference/max": 0.37257909774780273, "sampling/sampling_logp_difference/mean": 0.016455374658107758, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 371.03125, "completions/mean_terminated_length": 371.03125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.46950244903564453, "epoch": 0.20465686274509803, "frac_reward_zero_std": 0.25, "grad_norm": 1.1341795638548644, "kl": 0.004859311506152153, "learning_rate": 6.747967479674797e-07, "loss": 0.0279, "num_tokens": 7064795.0, "reward": 0.5, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6250637769699097, "sampling/importance_sampling_ratio/mean": 1.000107765197754, "sampling/importance_sampling_ratio/min": 0.6469900608062744, "sampling/sampling_logp_difference/max": 0.4855470657348633, "sampling/sampling_logp_difference/mean": 0.014739388599991798, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1284.0, "completions/max_terminated_length": 1284.0, "completions/mean_length": 491.234375, "completions/mean_terminated_length": 491.234375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.5913112163543701, "epoch": 0.20588235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 0.8887257427156512, "kl": 0.004408184438943863, "learning_rate": 6.788617886178861e-07, "loss": -0.0431, "num_tokens": 7113850.0, "reward": -0.125, "reward_std": 0.481805682182312, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.6560295820236206, "sampling/importance_sampling_ratio/mean": 0.9998589754104614, "sampling/importance_sampling_ratio/min": 0.5362949371337891, "sampling/sampling_logp_difference/max": 0.6230709552764893, "sampling/sampling_logp_difference/mean": 0.015996946021914482, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 481.296875, "completions/mean_terminated_length": 481.296875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.4298356771469116, "epoch": 0.20710784313725492, "frac_reward_zero_std": 0.5, "grad_norm": 0.6640793437592589, "kl": 0.003607515711337328, "learning_rate": 6.829268292682927e-07, "loss": -0.0053, "num_tokens": 7162541.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.4521687030792236, "sampling/importance_sampling_ratio/mean": 1.0004308223724365, "sampling/importance_sampling_ratio/min": 0.2017352133989334, "sampling/sampling_logp_difference/max": 1.6007992029190063, "sampling/sampling_logp_difference/mean": 0.01292538270354271, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1911.0, "completions/max_terminated_length": 1911.0, "completions/mean_length": 538.953125, "completions/mean_terminated_length": 538.953125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.44844484329223633, "epoch": 0.20833333333333334, "frac_reward_zero_std": 0.5, "grad_norm": 1.0089777129172623, "kl": 0.0047582355327904224, "learning_rate": 6.869918699186991e-07, "loss": -0.0501, "num_tokens": 7217594.0, "reward": 0.59375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.600766897201538, "sampling/importance_sampling_ratio/mean": 1.0002501010894775, "sampling/importance_sampling_ratio/min": 0.6921090483665466, "sampling/sampling_logp_difference/max": 0.47048282623291016, "sampling/sampling_logp_difference/mean": 0.014945071190595627, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 797.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 364.984375, "completions/mean_terminated_length": 364.984375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.4574723541736603, "epoch": 0.20955882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.01873286692399453, "kl": 0.004790525417774916, "learning_rate": 6.910569105691057e-07, "loss": 0.0, "num_tokens": 7258873.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5891157388687134, "sampling/importance_sampling_ratio/mean": 1.0004844665527344, "sampling/importance_sampling_ratio/min": 0.6558976173400879, "sampling/sampling_logp_difference/max": 0.4631776809692383, "sampling/sampling_logp_difference/mean": 0.014632642269134521, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 399.984375, "completions/mean_terminated_length": 399.984375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.4561616778373718, "epoch": 0.2107843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.6205438021462557, "kl": 0.004261118359863758, "learning_rate": 6.951219512195121e-07, "loss": -0.044, "num_tokens": 7305848.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.7109476327896118, "sampling/importance_sampling_ratio/mean": 0.9997113943099976, "sampling/importance_sampling_ratio/min": 0.5396422147750854, "sampling/sampling_logp_difference/max": 0.6168489456176758, "sampling/sampling_logp_difference/mean": 0.014773163944482803, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1216.0, "completions/max_terminated_length": 1216.0, "completions/mean_length": 397.625, "completions/mean_terminated_length": 397.625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.4278619885444641, "epoch": 0.21200980392156862, "frac_reward_zero_std": 0.75, "grad_norm": 0.6604330388178005, "kl": 0.003941599279642105, "learning_rate": 6.991869918699187e-07, "loss": 0.1122, "num_tokens": 7349472.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5583029985427856, "sampling/importance_sampling_ratio/mean": 1.0007801055908203, "sampling/importance_sampling_ratio/min": 0.6283499598503113, "sampling/sampling_logp_difference/max": 0.4646580219268799, "sampling/sampling_logp_difference/mean": 0.014442057348787785, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 277.609375, "completions/mean_terminated_length": 277.609375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.4941035211086273, "epoch": 0.21323529411764705, "frac_reward_zero_std": 0.75, "grad_norm": 0.6246855728037028, "kl": 0.006091909483075142, "learning_rate": 7.032520325203252e-07, "loss": 0.0026, "num_tokens": 7386007.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5808994770050049, "sampling/importance_sampling_ratio/mean": 0.9998506903648376, "sampling/importance_sampling_ratio/min": 0.6171379685401917, "sampling/sampling_logp_difference/max": 0.4826626777648926, "sampling/sampling_logp_difference/mean": 0.01603008434176445, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1029.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 476.140625, "completions/mean_terminated_length": 476.140625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.45601868629455566, "epoch": 0.21446078431372548, "frac_reward_zero_std": 0.5, "grad_norm": 0.732853294823002, "kl": 0.004244891926646233, "learning_rate": 7.073170731707316e-07, "loss": 0.0492, "num_tokens": 7437728.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5612297058105469, "sampling/importance_sampling_ratio/mean": 0.9998975992202759, "sampling/importance_sampling_ratio/min": 0.504122257232666, "sampling/sampling_logp_difference/max": 0.6849365234375, "sampling/sampling_logp_difference/mean": 0.013925015926361084, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1502.0, "completions/max_terminated_length": 1502.0, "completions/mean_length": 489.515625, "completions/mean_terminated_length": 489.515625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.5971165895462036, "epoch": 0.21568627450980393, "frac_reward_zero_std": 0.25, "grad_norm": 1.0072046769437988, "kl": 0.004870361648499966, "learning_rate": 7.113821138211382e-07, "loss": -0.1616, "num_tokens": 7491633.0, "reward": 0.21875, "reward_std": 0.5722135901451111, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.8303399085998535, "sampling/importance_sampling_ratio/mean": 1.000565528869629, "sampling/importance_sampling_ratio/min": 0.6207992434501648, "sampling/sampling_logp_difference/max": 0.6045017242431641, "sampling/sampling_logp_difference/mean": 0.01714678853750229, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 433.109375, "completions/mean_terminated_length": 433.109375, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "entropy": 0.35336244106292725, "epoch": 0.21691176470588236, "frac_reward_zero_std": 0.75, "grad_norm": 0.5007256774092718, "kl": 0.003227109555155039, "learning_rate": 7.154471544715447e-07, "loss": -0.0222, "num_tokens": 7542264.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6965581178665161, "sampling/importance_sampling_ratio/mean": 1.0002156496047974, "sampling/importance_sampling_ratio/min": 0.521416425704956, "sampling/sampling_logp_difference/max": 0.6512062549591064, "sampling/sampling_logp_difference/mean": 0.011670866049826145, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 401.984375, "completions/mean_terminated_length": 401.984375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.3588196039199829, "epoch": 0.2181372549019608, "frac_reward_zero_std": 0.75, "grad_norm": 0.6682730264356083, "kl": 0.0035114730708301067, "learning_rate": 7.195121951219512e-07, "loss": 0.0446, "num_tokens": 7588311.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.529266119003296, "sampling/importance_sampling_ratio/mean": 1.000096082687378, "sampling/importance_sampling_ratio/min": 0.5586716532707214, "sampling/sampling_logp_difference/max": 0.5821933746337891, "sampling/sampling_logp_difference/mean": 0.011490784585475922, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1284.0, "completions/max_terminated_length": 1284.0, "completions/mean_length": 530.015625, "completions/mean_terminated_length": 530.015625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.3720214068889618, "epoch": 0.2193627450980392, "frac_reward_zero_std": 0.75, "grad_norm": 0.386488106591345, "kl": 0.0037605869583785534, "learning_rate": 7.235772357723577e-07, "loss": -0.0066, "num_tokens": 7647592.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.957401156425476, "sampling/importance_sampling_ratio/mean": 0.9999653100967407, "sampling/importance_sampling_ratio/min": 0.6151204109191895, "sampling/sampling_logp_difference/max": 0.6716176271438599, "sampling/sampling_logp_difference/mean": 0.011253642849624157, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2063.0, "completions/max_terminated_length": 2063.0, "completions/mean_length": 374.03125, "completions/mean_terminated_length": 374.03125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.5727463960647583, "epoch": 0.22058823529411764, "frac_reward_zero_std": 0.75, "grad_norm": 0.5618724912354318, "kl": 0.005983203649520874, "learning_rate": 7.276422764227642e-07, "loss": 0.0279, "num_tokens": 7691274.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6513185501098633, "sampling/importance_sampling_ratio/mean": 0.9995871782302856, "sampling/importance_sampling_ratio/min": 0.6419724822044373, "sampling/sampling_logp_difference/max": 0.5015740394592285, "sampling/sampling_logp_difference/mean": 0.017797719687223434, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 363.40625, "completions/mean_terminated_length": 363.40625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.49614229798316956, "epoch": 0.22181372549019607, "frac_reward_zero_std": 0.5, "grad_norm": 0.951165914394822, "kl": 0.006604750175029039, "learning_rate": 7.317073170731707e-07, "loss": -0.0306, "num_tokens": 7730484.0, "reward": 0.5, "reward_std": 0.4472135901451111, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6309478282928467, "sampling/importance_sampling_ratio/mean": 0.9999588131904602, "sampling/importance_sampling_ratio/min": 0.48406141996383667, "sampling/sampling_logp_difference/max": 0.7255434989929199, "sampling/sampling_logp_difference/mean": 0.01605304330587387, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1425.0, "completions/max_terminated_length": 1425.0, "completions/mean_length": 448.375, "completions/mean_terminated_length": 448.375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.37348049879074097, "epoch": 0.22303921568627452, "frac_reward_zero_std": 0.75, "grad_norm": 0.5166880816956583, "kl": 0.004921473562717438, "learning_rate": 7.357723577235772e-07, "loss": 0.0887, "num_tokens": 7781740.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.8179765939712524, "sampling/importance_sampling_ratio/mean": 1.0001211166381836, "sampling/importance_sampling_ratio/min": 0.6387758851051331, "sampling/sampling_logp_difference/max": 0.5977240800857544, "sampling/sampling_logp_difference/mean": 0.012444385327398777, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 453.015625, "completions/mean_terminated_length": 453.015625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.3997567296028137, "epoch": 0.22426470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.011835528155763065, "kl": 0.003470691852271557, "learning_rate": 7.398373983739837e-07, "loss": 0.0, "num_tokens": 7831181.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6587755680084229, "sampling/importance_sampling_ratio/mean": 0.9998177886009216, "sampling/importance_sampling_ratio/min": 0.6498575210571289, "sampling/sampling_logp_difference/max": 0.5060796737670898, "sampling/sampling_logp_difference/mean": 0.012998364865779877, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 903.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 413.546875, "completions/mean_terminated_length": 413.546875, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.48159945011138916, "epoch": 0.22549019607843138, "frac_reward_zero_std": 0.75, "grad_norm": 0.5127783792197383, "kl": 0.005283496342599392, "learning_rate": 7.439024390243903e-07, "loss": 0.0109, "num_tokens": 7872528.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.380964994430542, "sampling/importance_sampling_ratio/mean": 1.0000718832015991, "sampling/importance_sampling_ratio/min": 0.7114086747169495, "sampling/sampling_logp_difference/max": 0.34050822257995605, "sampling/sampling_logp_difference/mean": 0.014744466170668602, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 322.703125, "completions/mean_terminated_length": 322.703125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.5981003046035767, "epoch": 0.2267156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.477945408078274, "kl": 0.006985259708017111, "learning_rate": 7.479674796747967e-07, "loss": 0.0056, "num_tokens": 7910637.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.00022554397583, "sampling/importance_sampling_ratio/min": 0.6778608560562134, "sampling/sampling_logp_difference/max": 0.7313823699951172, "sampling/sampling_logp_difference/mean": 0.01764710247516632, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1270.0, "completions/max_terminated_length": 1270.0, "completions/mean_length": 401.03125, "completions/mean_terminated_length": 401.03125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.3814786374568939, "epoch": 0.22794117647058823, "frac_reward_zero_std": 0.75, "grad_norm": 0.5347925309128488, "kl": 0.0038328412920236588, "learning_rate": 7.520325203252032e-07, "loss": 0.0102, "num_tokens": 7956911.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.4916139841079712, "sampling/importance_sampling_ratio/mean": 1.0002614259719849, "sampling/importance_sampling_ratio/min": 0.7016423344612122, "sampling/sampling_logp_difference/max": 0.3998587131500244, "sampling/sampling_logp_difference/mean": 0.012018885463476181, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1537.0, "completions/max_terminated_length": 1537.0, "completions/mean_length": 362.21875, "completions/mean_terminated_length": 362.21875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.5369474291801453, "epoch": 0.22916666666666666, "frac_reward_zero_std": 0.75, "grad_norm": 0.6502270307249437, "kl": 0.006617746781557798, "learning_rate": 7.560975609756097e-07, "loss": 0.0053, "num_tokens": 7998669.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5862452983856201, "sampling/importance_sampling_ratio/mean": 0.9998201131820679, "sampling/importance_sampling_ratio/min": 0.633513867855072, "sampling/sampling_logp_difference/max": 0.46136975288391113, "sampling/sampling_logp_difference/mean": 0.01709068939089775, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1169.0, "completions/max_terminated_length": 1169.0, "completions/mean_length": 485.296875, "completions/mean_terminated_length": 485.296875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.5380478501319885, "epoch": 0.23039215686274508, "frac_reward_zero_std": 0.5, "grad_norm": 1.02432362559322, "kl": 0.004427447449415922, "learning_rate": 7.601626016260162e-07, "loss": 0.0361, "num_tokens": 8056800.0, "reward": 0.4375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.608885407447815, "sampling/importance_sampling_ratio/mean": 0.9999710321426392, "sampling/importance_sampling_ratio/min": 0.4801919162273407, "sampling/sampling_logp_difference/max": 0.7335694432258606, "sampling/sampling_logp_difference/mean": 0.01595824956893921, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 783.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 395.375, "completions/mean_terminated_length": 395.375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.4918695390224457, "epoch": 0.23161764705882354, "frac_reward_zero_std": 0.75, "grad_norm": 0.4675705420098068, "kl": 0.005992270074784756, "learning_rate": 7.642276422764228e-07, "loss": -0.0126, "num_tokens": 8097960.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.4455801248550415, "sampling/importance_sampling_ratio/mean": 1.0000041723251343, "sampling/importance_sampling_ratio/min": 0.6454936861991882, "sampling/sampling_logp_difference/max": 0.43773984909057617, "sampling/sampling_logp_difference/mean": 0.014961302280426025, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 300.5625, "completions/mean_terminated_length": 300.5625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.44659334421157837, "epoch": 0.23284313725490197, "frac_reward_zero_std": 0.5, "grad_norm": 0.8184448352074536, "kl": 0.006650568451732397, "learning_rate": 7.682926829268292e-07, "loss": 0.0196, "num_tokens": 8132492.0, "reward": 0.375, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6274304389953613, "sampling/importance_sampling_ratio/mean": 1.0004918575286865, "sampling/importance_sampling_ratio/min": 0.6324418187141418, "sampling/sampling_logp_difference/max": 0.4870023727416992, "sampling/sampling_logp_difference/mean": 0.015091955661773682, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1788.0, "completions/max_terminated_length": 1788.0, "completions/mean_length": 545.859375, "completions/mean_terminated_length": 545.859375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "entropy": 0.5478230714797974, "epoch": 0.2340686274509804, "frac_reward_zero_std": 0.5, "grad_norm": 0.5984246163392734, "kl": 0.0048377844505012035, "learning_rate": 7.723577235772358e-07, "loss": 0.0098, "num_tokens": 8189491.0, "reward": 0.09375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.4394152164459229, "sampling/importance_sampling_ratio/mean": 1.0000107288360596, "sampling/importance_sampling_ratio/min": 0.528676450252533, "sampling/sampling_logp_difference/max": 0.6373786926269531, "sampling/sampling_logp_difference/mean": 0.016178160905838013, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 972.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 380.21875, "completions/mean_terminated_length": 380.21875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.5085766315460205, "epoch": 0.23529411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.8653279331582857, "kl": 0.005279788747429848, "learning_rate": 7.764227642276422e-07, "loss": -0.0991, "num_tokens": 8229009.0, "reward": -0.21875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": -0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.6145867109298706, "sampling/importance_sampling_ratio/mean": 1.0008035898208618, "sampling/importance_sampling_ratio/min": 0.6494563221931458, "sampling/sampling_logp_difference/max": 0.479079008102417, "sampling/sampling_logp_difference/mean": 0.01593240536749363, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1041.0, "completions/max_terminated_length": 1041.0, "completions/mean_length": 572.359375, "completions/mean_terminated_length": 572.359375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "entropy": 0.3963686525821686, "epoch": 0.23651960784313725, "frac_reward_zero_std": 0.75, "grad_norm": 0.5288271913623861, "kl": 0.0025573442690074444, "learning_rate": 7.804878048780488e-07, "loss": -0.0062, "num_tokens": 8290520.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002659559249878, "sampling/importance_sampling_ratio/min": 0.5596486926078796, "sampling/sampling_logp_difference/max": 0.7287571430206299, "sampling/sampling_logp_difference/mean": 0.012413708493113518, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1599.0, "completions/max_terminated_length": 1599.0, "completions/mean_length": 521.0, "completions/mean_terminated_length": 521.0, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.5120000839233398, "epoch": 0.23774509803921567, "frac_reward_zero_std": 0.5, "grad_norm": 1.4120232861505997, "kl": 0.004460856318473816, "learning_rate": 7.845528455284552e-07, "loss": 0.0074, "num_tokens": 8339880.0, "reward": 0.59375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6253219842910767, "sampling/importance_sampling_ratio/mean": 0.9999644756317139, "sampling/importance_sampling_ratio/min": 0.6631456613540649, "sampling/sampling_logp_difference/max": 0.4857058525085449, "sampling/sampling_logp_difference/mean": 0.015755321830511093, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1960.0, "completions/max_terminated_length": 1960.0, "completions/mean_length": 509.5, "completions/mean_terminated_length": 509.5, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.41128700971603394, "epoch": 0.23897058823529413, "frac_reward_zero_std": 0.75, "grad_norm": 0.5849686705191636, "kl": 0.003687812015414238, "learning_rate": 7.886178861788617e-07, "loss": 0.1057, "num_tokens": 8391432.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.7592071294784546, "sampling/importance_sampling_ratio/mean": 1.0002968311309814, "sampling/importance_sampling_ratio/min": 0.6585896015167236, "sampling/sampling_logp_difference/max": 0.5648632049560547, "sampling/sampling_logp_difference/mean": 0.013009379617869854, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 296.34375, "completions/mean_terminated_length": 296.34375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.4526978135108948, "epoch": 0.24019607843137256, "frac_reward_zero_std": 0.75, "grad_norm": 0.7126594989667164, "kl": 0.005487365648150444, "learning_rate": 7.926829268292683e-07, "loss": -0.0115, "num_tokens": 8426782.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6015453338623047, "sampling/importance_sampling_ratio/mean": 0.9999386072158813, "sampling/importance_sampling_ratio/min": 0.6121783256530762, "sampling/sampling_logp_difference/max": 0.49073171615600586, "sampling/sampling_logp_difference/mean": 0.01590438187122345, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1167.0, "completions/max_terminated_length": 1167.0, "completions/mean_length": 433.34375, "completions/mean_terminated_length": 433.34375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.4306119680404663, "epoch": 0.24142156862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.009693392435579533, "kl": 0.003387909382581711, "learning_rate": 7.967479674796747e-07, "loss": 0.0, "num_tokens": 8474212.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998720288276672, "sampling/importance_sampling_ratio/min": 0.6539034247398376, "sampling/sampling_logp_difference/max": 0.7102479934692383, "sampling/sampling_logp_difference/mean": 0.013633619993925095, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 786.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 372.40625, "completions/mean_terminated_length": 372.40625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.46559786796569824, "epoch": 0.2426470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.5478102079744444, "kl": 0.004131893161684275, "learning_rate": 8.008130081300813e-07, "loss": 0.0071, "num_tokens": 8511518.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.3672350645065308, "sampling/importance_sampling_ratio/mean": 0.9997446537017822, "sampling/importance_sampling_ratio/min": 0.4394739270210266, "sampling/sampling_logp_difference/max": 0.8221769332885742, "sampling/sampling_logp_difference/mean": 0.014384216628968716, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 373.0, "completions/mean_terminated_length": 373.0, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.4356841742992401, "epoch": 0.24387254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.01180386104996107, "kl": 0.003270552959293127, "learning_rate": 8.048780487804878e-07, "loss": 0.0, "num_tokens": 8554254.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6787763833999634, "sampling/importance_sampling_ratio/mean": 1.000264286994934, "sampling/importance_sampling_ratio/min": 0.3785226345062256, "sampling/sampling_logp_difference/max": 0.9714794158935547, "sampling/sampling_logp_difference/mean": 0.014370928518474102, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 904.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 440.46875, "completions/mean_terminated_length": 440.46875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.4416758418083191, "epoch": 0.24509803921568626, "frac_reward_zero_std": 0.75, "grad_norm": 0.42207764989927266, "kl": 0.0027535143308341503, "learning_rate": 8.089430894308943e-07, "loss": 0.0073, "num_tokens": 8607932.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6027153730392456, "sampling/importance_sampling_ratio/mean": 0.9998844861984253, "sampling/importance_sampling_ratio/min": 0.6176396012306213, "sampling/sampling_logp_difference/max": 0.48185014724731445, "sampling/sampling_logp_difference/mean": 0.012988569214940071, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 427.28125, "completions/mean_terminated_length": 427.28125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.5359739065170288, "epoch": 0.24632352941176472, "frac_reward_zero_std": 1.0, "grad_norm": 0.01424017231908946, "kl": 0.004285302013158798, "learning_rate": 8.130081300813008e-07, "loss": 0.0, "num_tokens": 8652430.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4350324869155884, "sampling/importance_sampling_ratio/mean": 1.000110149383545, "sampling/importance_sampling_ratio/min": 0.521568775177002, "sampling/sampling_logp_difference/max": 0.650914192199707, "sampling/sampling_logp_difference/mean": 0.01647791638970375, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 337.203125, "completions/mean_terminated_length": 337.203125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.41922080516815186, "epoch": 0.24754901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.5785927519682516, "kl": 0.004934743978083134, "learning_rate": 8.170731707317072e-07, "loss": -0.0056, "num_tokens": 8693323.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5709730386734009, "sampling/importance_sampling_ratio/mean": 1.000248908996582, "sampling/importance_sampling_ratio/min": 0.6612770557403564, "sampling/sampling_logp_difference/max": 0.45169520378112793, "sampling/sampling_logp_difference/mean": 0.013619820587337017, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 303.984375, "completions/mean_terminated_length": 303.984375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.4433736205101013, "epoch": 0.24877450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.015897202598931, "kl": 0.004801301285624504, "learning_rate": 8.211382113821138e-07, "loss": 0.0, "num_tokens": 8727770.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5415936708450317, "sampling/importance_sampling_ratio/mean": 0.9997665882110596, "sampling/importance_sampling_ratio/min": 0.6658324003219604, "sampling/sampling_logp_difference/max": 0.432816743850708, "sampling/sampling_logp_difference/mean": 0.014023381285369396, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 355.546875, "completions/mean_terminated_length": 355.546875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.3853534758090973, "epoch": 0.25, "frac_reward_zero_std": 1.0, "grad_norm": 0.009979773967544474, "kl": 0.0028065149672329426, "learning_rate": 8.252032520325202e-07, "loss": 0.0, "num_tokens": 8771981.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6130318641662598, "sampling/importance_sampling_ratio/mean": 0.9996683597564697, "sampling/importance_sampling_ratio/min": 0.6458671689033508, "sampling/sampling_logp_difference/max": 0.4781155586242676, "sampling/sampling_logp_difference/mean": 0.012810765765607357, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 381.796875, "completions/mean_terminated_length": 381.796875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.41156747937202454, "epoch": 0.2512254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 0.5264573359980514, "kl": 0.0036690214183181524, "learning_rate": 8.292682926829268e-07, "loss": -0.0353, "num_tokens": 8811808.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999576807022095, "sampling/importance_sampling_ratio/min": 0.3858484625816345, "sampling/sampling_logp_difference/max": 0.9523105621337891, "sampling/sampling_logp_difference/mean": 0.012871080078184605, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 281.265625, "completions/mean_terminated_length": 281.265625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.36229994893074036, "epoch": 0.25245098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.014886403567225088, "kl": 0.004854248836636543, "learning_rate": 8.333333333333333e-07, "loss": 0.0, "num_tokens": 8849937.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4352002143859863, "sampling/importance_sampling_ratio/mean": 1.000007152557373, "sampling/importance_sampling_ratio/min": 0.4507101774215698, "sampling/sampling_logp_difference/max": 0.7969307899475098, "sampling/sampling_logp_difference/mean": 0.014139528386294842, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 405.765625, "completions/mean_terminated_length": 405.765625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.5414474010467529, "epoch": 0.2536764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.5185427864124751, "kl": 0.0030638203024864197, "learning_rate": 8.373983739837398e-07, "loss": -0.0274, "num_tokens": 8894082.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.5892702341079712, "sampling/importance_sampling_ratio/mean": 1.0000797510147095, "sampling/importance_sampling_ratio/min": 0.6446306109428406, "sampling/sampling_logp_difference/max": 0.4632749557495117, "sampling/sampling_logp_difference/mean": 0.015613934025168419, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 339.984375, "completions/mean_terminated_length": 339.984375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.4759123623371124, "epoch": 0.2549019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.6835822848073662, "kl": 0.005242983344942331, "learning_rate": 8.414634146341463e-07, "loss": 0.0028, "num_tokens": 8930417.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.4805220365524292, "sampling/importance_sampling_ratio/mean": 0.9999938011169434, "sampling/importance_sampling_ratio/min": 0.6299000382423401, "sampling/sampling_logp_difference/max": 0.46219420433044434, "sampling/sampling_logp_difference/mean": 0.01565392129123211, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 256.734375, "completions/mean_terminated_length": 256.734375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.5525139570236206, "epoch": 0.25612745098039214, "frac_reward_zero_std": 0.25, "grad_norm": 1.3111775783524455, "kl": 0.007884347811341286, "learning_rate": 8.455284552845529e-07, "loss": -0.0129, "num_tokens": 8961888.0, "reward": 0.71875, "reward_std": 0.5457825064659119, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.4257937669754028, "sampling/importance_sampling_ratio/mean": 1.0002796649932861, "sampling/importance_sampling_ratio/min": 0.6614418029785156, "sampling/sampling_logp_difference/max": 0.41333329677581787, "sampling/sampling_logp_difference/mean": 0.018570387735962868, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 301.703125, "completions/mean_terminated_length": 301.703125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.3947408199310303, "epoch": 0.25735294117647056, "frac_reward_zero_std": 0.75, "grad_norm": 0.6454722711432224, "kl": 0.003253223840147257, "learning_rate": 8.495934959349593e-07, "loss": 0.0113, "num_tokens": 8999229.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4255321025848389, "sampling/importance_sampling_ratio/mean": 0.9994814395904541, "sampling/importance_sampling_ratio/min": 0.6056057214736938, "sampling/sampling_logp_difference/max": 0.5015261173248291, "sampling/sampling_logp_difference/mean": 0.013710642233490944, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 274.859375, "completions/mean_terminated_length": 274.859375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.5436346530914307, "epoch": 0.25857843137254904, "frac_reward_zero_std": 0.5, "grad_norm": 0.9235867522864423, "kl": 0.005823261104524136, "learning_rate": 8.536585365853657e-07, "loss": -0.0167, "num_tokens": 9034804.0, "reward": 0.28125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.7462549209594727, "sampling/importance_sampling_ratio/mean": 1.0001451969146729, "sampling/importance_sampling_ratio/min": 0.5452821254730225, "sampling/sampling_logp_difference/max": 0.6064519882202148, "sampling/sampling_logp_difference/mean": 0.019334815442562103, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 334.453125, "completions/mean_terminated_length": 334.453125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.4834514856338501, "epoch": 0.25980392156862747, "frac_reward_zero_std": 0.75, "grad_norm": 0.6723004881996534, "kl": 0.004506836645305157, "learning_rate": 8.577235772357723e-07, "loss": -0.0142, "num_tokens": 9070657.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4465776681900024, "sampling/importance_sampling_ratio/mean": 1.0001462697982788, "sampling/importance_sampling_ratio/min": 0.6087289452552795, "sampling/sampling_logp_difference/max": 0.4963822364807129, "sampling/sampling_logp_difference/mean": 0.015772415325045586, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1383.0, "completions/max_terminated_length": 1383.0, "completions/mean_length": 469.703125, "completions/mean_terminated_length": 469.703125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.46233388781547546, "epoch": 0.2610294117647059, "frac_reward_zero_std": 0.75, "grad_norm": 0.7108717325145066, "kl": 0.003683686489239335, "learning_rate": 8.617886178861788e-07, "loss": 0.0198, "num_tokens": 9122830.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.539827585220337, "sampling/importance_sampling_ratio/mean": 0.9997385740280151, "sampling/importance_sampling_ratio/min": 0.5865123271942139, "sampling/sampling_logp_difference/max": 0.5335615873336792, "sampling/sampling_logp_difference/mean": 0.01475541852414608, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1689.0, "completions/max_terminated_length": 1689.0, "completions/mean_length": 518.40625, "completions/mean_terminated_length": 518.40625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.41643911600112915, "epoch": 0.2622549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.008330473011950463, "kl": 0.0031126798130571842, "learning_rate": 8.658536585365853e-07, "loss": 0.0, "num_tokens": 9180664.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5509802103042603, "sampling/importance_sampling_ratio/mean": 0.9998952746391296, "sampling/importance_sampling_ratio/min": 0.591347873210907, "sampling/sampling_logp_difference/max": 0.5253508687019348, "sampling/sampling_logp_difference/mean": 0.014208003878593445, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 423.703125, "completions/mean_terminated_length": 423.703125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.3201414942741394, "epoch": 0.26348039215686275, "frac_reward_zero_std": 1.0, "grad_norm": 0.008311063623481522, "kl": 0.0027530700899660587, "learning_rate": 8.699186991869918e-07, "loss": 0.0, "num_tokens": 9235477.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997621178627014, "sampling/importance_sampling_ratio/min": 0.5695708990097046, "sampling/sampling_logp_difference/max": 0.8375595808029175, "sampling/sampling_logp_difference/mean": 0.01148461177945137, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 931.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 386.921875, "completions/mean_terminated_length": 386.921875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.38359034061431885, "epoch": 0.2647058823529412, "frac_reward_zero_std": 0.75, "grad_norm": 0.492377844894546, "kl": 0.004112003371119499, "learning_rate": 8.739837398373984e-07, "loss": -0.045, "num_tokens": 9278848.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.463629961013794, "sampling/importance_sampling_ratio/mean": 1.0001704692840576, "sampling/importance_sampling_ratio/min": 0.691211998462677, "sampling/sampling_logp_difference/max": 0.38091957569122314, "sampling/sampling_logp_difference/mean": 0.012968108057975769, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1789.0, "completions/max_terminated_length": 1789.0, "completions/mean_length": 460.15625, "completions/mean_terminated_length": 460.15625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.4212179481983185, "epoch": 0.2659313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.6423987827663491, "kl": 0.0033663909416645765, "learning_rate": 8.780487804878048e-07, "loss": 0.12, "num_tokens": 9327370.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4415700435638428, "sampling/importance_sampling_ratio/mean": 0.9995126128196716, "sampling/importance_sampling_ratio/min": 0.6015902757644653, "sampling/sampling_logp_difference/max": 0.5081787109375, "sampling/sampling_logp_difference/mean": 0.013313958421349525, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1319.0, "completions/max_terminated_length": 1319.0, "completions/mean_length": 437.25, "completions/mean_terminated_length": 437.25, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.31653136014938354, "epoch": 0.26715686274509803, "frac_reward_zero_std": 1.0, "grad_norm": 0.007640823684835177, "kl": 0.0025680765975266695, "learning_rate": 8.821138211382113e-07, "loss": 0.0, "num_tokens": 9376570.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3473955392837524, "sampling/importance_sampling_ratio/mean": 0.9996888637542725, "sampling/importance_sampling_ratio/min": 0.4982426166534424, "sampling/sampling_logp_difference/max": 0.6966681480407715, "sampling/sampling_logp_difference/mean": 0.010345378890633583, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 363.3125, "completions/mean_terminated_length": 363.3125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.433807373046875, "epoch": 0.26838235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.012086751925003116, "kl": 0.004698028787970543, "learning_rate": 8.861788617886179e-07, "loss": 0.0, "num_tokens": 9416334.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6051712036132812, "sampling/importance_sampling_ratio/mean": 0.9994518160820007, "sampling/importance_sampling_ratio/min": 0.5025330185890198, "sampling/sampling_logp_difference/max": 0.688093900680542, "sampling/sampling_logp_difference/mean": 0.014945242553949356, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1489.0, "completions/max_terminated_length": 1489.0, "completions/mean_length": 427.71875, "completions/mean_terminated_length": 427.71875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.4216124713420868, "epoch": 0.2696078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.5913809027230166, "kl": 0.00386065524071455, "learning_rate": 8.902439024390244e-07, "loss": 0.0046, "num_tokens": 9461436.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4394358396530151, "sampling/importance_sampling_ratio/mean": 1.0001566410064697, "sampling/importance_sampling_ratio/min": 0.4416671693325043, "sampling/sampling_logp_difference/max": 0.8171987533569336, "sampling/sampling_logp_difference/mean": 0.014225799590349197, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1469.0, "completions/max_terminated_length": 1469.0, "completions/mean_length": 390.828125, "completions/mean_terminated_length": 390.828125, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.5868004560470581, "epoch": 0.2708333333333333, "frac_reward_zero_std": 0.75, "grad_norm": 0.5494231751139271, "kl": 0.005832800175994635, "learning_rate": 8.943089430894308e-07, "loss": -0.0078, "num_tokens": 9502113.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.4796297550201416, "sampling/importance_sampling_ratio/mean": 1.0003875494003296, "sampling/importance_sampling_ratio/min": 0.6785179376602173, "sampling/sampling_logp_difference/max": 0.3917919397354126, "sampling/sampling_logp_difference/mean": 0.01645105704665184, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 309.21875, "completions/mean_terminated_length": 309.21875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.3350669741630554, "epoch": 0.27205882352941174, "frac_reward_zero_std": 1.0, "grad_norm": 0.011103999429334358, "kl": 0.003944728523492813, "learning_rate": 8.983739837398373e-07, "loss": 0.0, "num_tokens": 9541711.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.660025715827942, "sampling/importance_sampling_ratio/mean": 0.9996508955955505, "sampling/importance_sampling_ratio/min": 0.6957373023033142, "sampling/sampling_logp_difference/max": 0.5068330764770508, "sampling/sampling_logp_difference/mean": 0.011814197525382042, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 833.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 359.875, "completions/mean_terminated_length": 359.875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.40930959582328796, "epoch": 0.27328431372549017, "frac_reward_zero_std": 1.0, "grad_norm": 0.014122203012559195, "kl": 0.004193365573883057, "learning_rate": 9.024390243902439e-07, "loss": 0.0, "num_tokens": 9583991.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6273785829544067, "sampling/importance_sampling_ratio/mean": 1.000042200088501, "sampling/importance_sampling_ratio/min": 0.5911235809326172, "sampling/sampling_logp_difference/max": 0.5257301926612854, "sampling/sampling_logp_difference/mean": 0.013948352076113224, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1160.0, "completions/max_terminated_length": 1160.0, "completions/mean_length": 379.6875, "completions/mean_terminated_length": 379.6875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.46815726161003113, "epoch": 0.27450980392156865, "frac_reward_zero_std": 1.0, "grad_norm": 0.010528925178720136, "kl": 0.004483724944293499, "learning_rate": 9.065040650406503e-07, "loss": 0.0, "num_tokens": 9627283.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5099003314971924, "sampling/importance_sampling_ratio/mean": 0.9998564720153809, "sampling/importance_sampling_ratio/min": 0.6927226781845093, "sampling/sampling_logp_difference/max": 0.41204357147216797, "sampling/sampling_logp_difference/mean": 0.014761611819267273, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 402.09375, "completions/mean_terminated_length": 402.09375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.42520254850387573, "epoch": 0.2757352941176471, "frac_reward_zero_std": 0.75, "grad_norm": 0.6645486497249684, "kl": 0.004336307756602764, "learning_rate": 9.105691056910569e-07, "loss": 0.0265, "num_tokens": 9668281.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5925328731536865, "sampling/importance_sampling_ratio/mean": 1.0000450611114502, "sampling/importance_sampling_ratio/min": 0.5571359992027283, "sampling/sampling_logp_difference/max": 0.5849459171295166, "sampling/sampling_logp_difference/mean": 0.013885562308132648, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 691.0, "completions/max_terminated_length": 691.0, "completions/mean_length": 291.40625, "completions/mean_terminated_length": 291.40625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.5267449021339417, "epoch": 0.2769607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 0.6805200554104797, "kl": 0.008622783236205578, "learning_rate": 9.146341463414634e-07, "loss": -0.0174, "num_tokens": 9703107.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.7520668506622314, "sampling/importance_sampling_ratio/mean": 0.9996801614761353, "sampling/importance_sampling_ratio/min": 0.6630117297172546, "sampling/sampling_logp_difference/max": 0.5607961416244507, "sampling/sampling_logp_difference/mean": 0.01669890247285366, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 316.28125, "completions/mean_terminated_length": 316.28125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.3557261526584625, "epoch": 0.27818627450980393, "frac_reward_zero_std": 1.0, "grad_norm": 0.014309930106245684, "kl": 0.004655526485294104, "learning_rate": 9.186991869918699e-07, "loss": 0.0, "num_tokens": 9740773.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5467315912246704, "sampling/importance_sampling_ratio/mean": 1.000122308731079, "sampling/importance_sampling_ratio/min": 0.6570137143135071, "sampling/sampling_logp_difference/max": 0.4361441135406494, "sampling/sampling_logp_difference/mean": 0.01303093321621418, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 364.34375, "completions/mean_terminated_length": 364.34375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.35571926832199097, "epoch": 0.27941176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.010547798479601798, "kl": 0.0038934077601879835, "learning_rate": 9.227642276422763e-07, "loss": 0.0, "num_tokens": 9786203.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7720345258712769, "sampling/importance_sampling_ratio/mean": 0.9997571706771851, "sampling/importance_sampling_ratio/min": 0.7035456299781799, "sampling/sampling_logp_difference/max": 0.5721282958984375, "sampling/sampling_logp_difference/mean": 0.012102827429771423, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 349.6875, "completions/mean_terminated_length": 349.6875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.4121597111225128, "epoch": 0.2806372549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.009281061741533347, "kl": 0.004506619647145271, "learning_rate": 9.26829268292683e-07, "loss": 0.0, "num_tokens": 9828215.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5467199087142944, "sampling/importance_sampling_ratio/mean": 0.999993622303009, "sampling/importance_sampling_ratio/min": 0.612779974937439, "sampling/sampling_logp_difference/max": 0.4897494316101074, "sampling/sampling_logp_difference/mean": 0.015029371716082096, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1435.0, "completions/max_terminated_length": 1435.0, "completions/mean_length": 458.15625, "completions/mean_terminated_length": 458.15625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.47298547625541687, "epoch": 0.2818627450980392, "frac_reward_zero_std": 0.75, "grad_norm": 0.6281173070152755, "kl": 0.005311484914273024, "learning_rate": 9.308943089430894e-07, "loss": 0.0461, "num_tokens": 9888113.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6218538284301758, "sampling/importance_sampling_ratio/mean": 0.9998378753662109, "sampling/importance_sampling_ratio/min": 0.493777871131897, "sampling/sampling_logp_difference/max": 0.7056695222854614, "sampling/sampling_logp_difference/mean": 0.01617797091603279, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 366.8125, "completions/mean_terminated_length": 366.8125, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 0.3003723621368408, "epoch": 0.28308823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.009142309686612301, "kl": 0.0036167511716485023, "learning_rate": 9.349593495934958e-07, "loss": 0.0, "num_tokens": 9929301.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.616797685623169, "sampling/importance_sampling_ratio/mean": 0.9999996423721313, "sampling/importance_sampling_ratio/min": 0.6298378109931946, "sampling/sampling_logp_difference/max": 0.4804474115371704, "sampling/sampling_logp_difference/mean": 0.010030673816800117, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1621.0, "completions/max_terminated_length": 1621.0, "completions/mean_length": 348.578125, "completions/mean_terminated_length": 348.578125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.45386460423469543, "epoch": 0.28431372549019607, "frac_reward_zero_std": 0.75, "grad_norm": 0.9238819570839728, "kl": 0.006151631474494934, "learning_rate": 9.390243902439024e-07, "loss": -0.1943, "num_tokens": 9971754.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5431654453277588, "sampling/importance_sampling_ratio/mean": 0.9996588230133057, "sampling/importance_sampling_ratio/min": 0.6105530858039856, "sampling/sampling_logp_difference/max": 0.4933900833129883, "sampling/sampling_logp_difference/mean": 0.015382247045636177, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1210.0, "completions/max_terminated_length": 1210.0, "completions/mean_length": 415.15625, "completions/mean_terminated_length": 415.15625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.4504861533641815, "epoch": 0.2855392156862745, "frac_reward_zero_std": 0.75, "grad_norm": 0.6134883065624449, "kl": 0.005751346237957478, "learning_rate": 9.430894308943089e-07, "loss": 0.0381, "num_tokens": 10013284.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.7173755168914795, "sampling/importance_sampling_ratio/mean": 1.0001498460769653, "sampling/importance_sampling_ratio/min": 0.6074577569961548, "sampling/sampling_logp_difference/max": 0.540797233581543, "sampling/sampling_logp_difference/mean": 0.015252334997057915, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1268.0, "completions/max_terminated_length": 1268.0, "completions/mean_length": 505.28125, "completions/mean_terminated_length": 505.28125, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "entropy": 0.44269686937332153, "epoch": 0.2867647058823529, "frac_reward_zero_std": 0.75, "grad_norm": 0.4314659655137182, "kl": 0.004137088544666767, "learning_rate": 9.471544715447154e-07, "loss": -0.0026, "num_tokens": 10067958.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000338077545166, "sampling/importance_sampling_ratio/min": 0.6262631416320801, "sampling/sampling_logp_difference/max": 1.2897999286651611, "sampling/sampling_logp_difference/mean": 0.012861356139183044, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 314.84375, "completions/mean_terminated_length": 314.84375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.3251887559890747, "epoch": 0.28799019607843135, "frac_reward_zero_std": 1.0, "grad_norm": 0.011832216322073878, "kl": 0.00416521867737174, "learning_rate": 9.512195121951218e-07, "loss": 0.0, "num_tokens": 10107484.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4490584135055542, "sampling/importance_sampling_ratio/mean": 0.9999005198478699, "sampling/importance_sampling_ratio/min": 0.6389105319976807, "sampling/sampling_logp_difference/max": 0.44799089431762695, "sampling/sampling_logp_difference/mean": 0.011882545426487923, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 252.109375, "completions/mean_terminated_length": 252.109375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.5039539337158203, "epoch": 0.28921568627450983, "frac_reward_zero_std": 1.0, "grad_norm": 0.022644643383816568, "kl": 0.01020767167210579, "learning_rate": 9.552845528455285e-07, "loss": 0.0001, "num_tokens": 10138147.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.613327980041504, "sampling/importance_sampling_ratio/mean": 1.000085711479187, "sampling/importance_sampling_ratio/min": 0.6173760294914246, "sampling/sampling_logp_difference/max": 0.48227691650390625, "sampling/sampling_logp_difference/mean": 0.018074192106723785, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 364.953125, "completions/mean_terminated_length": 364.953125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.4040769338607788, "epoch": 0.29044117647058826, "frac_reward_zero_std": 1.0, "grad_norm": 0.011957923576877194, "kl": 0.0042405580170452595, "learning_rate": 9.59349593495935e-07, "loss": 0.0, "num_tokens": 10196096.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8378251791000366, "sampling/importance_sampling_ratio/mean": 1.0003762245178223, "sampling/importance_sampling_ratio/min": 0.42732876539230347, "sampling/sampling_logp_difference/max": 0.8502016067504883, "sampling/sampling_logp_difference/mean": 0.016121212393045425, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 310.125, "completions/mean_terminated_length": 310.125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.5144382119178772, "epoch": 0.2916666666666667, "frac_reward_zero_std": 0.25, "grad_norm": 1.1604382491073426, "kl": 0.008663758635520935, "learning_rate": 9.634146341463414e-07, "loss": 0.0004, "num_tokens": 10230168.0, "reward": 0.46875, "reward_std": 0.46656501293182373, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5409520864486694, "sampling/importance_sampling_ratio/mean": 0.9997280836105347, "sampling/importance_sampling_ratio/min": 0.6259968280792236, "sampling/sampling_logp_difference/max": 0.46841001510620117, "sampling/sampling_logp_difference/mean": 0.01703379862010479, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1505.0, "completions/max_terminated_length": 1505.0, "completions/mean_length": 352.796875, "completions/mean_terminated_length": 352.796875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.43276238441467285, "epoch": 0.2928921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.6820390123243861, "kl": 0.00801345519721508, "learning_rate": 9.67479674796748e-07, "loss": -0.0063, "num_tokens": 10272811.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.575048565864563, "sampling/importance_sampling_ratio/mean": 0.9999696016311646, "sampling/importance_sampling_ratio/min": 0.4848375916481018, "sampling/sampling_logp_difference/max": 0.7239413261413574, "sampling/sampling_logp_difference/mean": 0.01476423628628254, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 315.015625, "completions/mean_terminated_length": 315.015625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.43508949875831604, "epoch": 0.29411764705882354, "frac_reward_zero_std": 0.75, "grad_norm": 0.7475494340450964, "kl": 0.005802095867693424, "learning_rate": 9.715447154471544e-07, "loss": -0.0217, "num_tokens": 10310220.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.562896728515625, "sampling/importance_sampling_ratio/mean": 0.9998626112937927, "sampling/importance_sampling_ratio/min": 0.6799696683883667, "sampling/sampling_logp_difference/max": 0.44654107093811035, "sampling/sampling_logp_difference/mean": 0.01419808343052864, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1408.0, "completions/max_terminated_length": 1408.0, "completions/mean_length": 350.46875, "completions/mean_terminated_length": 350.46875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.46074721217155457, "epoch": 0.29534313725490197, "frac_reward_zero_std": 0.75, "grad_norm": 0.6802506128592076, "kl": 0.00559465866535902, "learning_rate": 9.756097560975609e-07, "loss": -0.0338, "num_tokens": 10348682.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.646485447883606, "sampling/importance_sampling_ratio/mean": 0.9999402761459351, "sampling/importance_sampling_ratio/min": 0.7217565178871155, "sampling/sampling_logp_difference/max": 0.4986429214477539, "sampling/sampling_logp_difference/mean": 0.014125732704997063, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 428.71875, "completions/mean_terminated_length": 428.71875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.4052489995956421, "epoch": 0.2965686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.01306308646246238, "kl": 0.005257242824882269, "learning_rate": 9.796747967479673e-07, "loss": 0.0, "num_tokens": 10401944.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5467331409454346, "sampling/importance_sampling_ratio/mean": 1.0004154443740845, "sampling/importance_sampling_ratio/min": 0.6295267343521118, "sampling/sampling_logp_difference/max": 0.4627869129180908, "sampling/sampling_logp_difference/mean": 0.013822273351252079, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1461.0, "completions/max_terminated_length": 1461.0, "completions/mean_length": 376.046875, "completions/mean_terminated_length": 376.046875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.4546189606189728, "epoch": 0.2977941176470588, "frac_reward_zero_std": 0.75, "grad_norm": 0.7912834100061582, "kl": 0.005774803459644318, "learning_rate": 9.83739837398374e-07, "loss": 0.1003, "num_tokens": 10441899.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6133626699447632, "sampling/importance_sampling_ratio/mean": 0.9996986985206604, "sampling/importance_sampling_ratio/min": 0.5424333810806274, "sampling/sampling_logp_difference/max": 0.6116900444030762, "sampling/sampling_logp_difference/mean": 0.015671471133828163, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 268.0625, "completions/mean_terminated_length": 268.0625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.4850078523159027, "epoch": 0.29901960784313725, "frac_reward_zero_std": 0.75, "grad_norm": 0.8794628676859257, "kl": 0.008190421387553215, "learning_rate": 9.878048780487804e-07, "loss": -0.0117, "num_tokens": 10476287.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4257193803787231, "sampling/importance_sampling_ratio/mean": 0.9998325109481812, "sampling/importance_sampling_ratio/min": 0.633163571357727, "sampling/sampling_logp_difference/max": 0.45702648162841797, "sampling/sampling_logp_difference/mean": 0.016241561621427536, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 440.625, "completions/mean_terminated_length": 440.625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.44624072313308716, "epoch": 0.3002450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.6425129475818805, "kl": 0.004389941226691008, "learning_rate": 9.918699186991869e-07, "loss": 0.0364, "num_tokens": 10530711.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6945854425430298, "sampling/importance_sampling_ratio/mean": 1.00020432472229, "sampling/importance_sampling_ratio/min": 0.6298462748527527, "sampling/sampling_logp_difference/max": 0.5274381637573242, "sampling/sampling_logp_difference/mean": 0.014506272971630096, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1309.0, "completions/max_terminated_length": 1309.0, "completions/mean_length": 524.375, "completions/mean_terminated_length": 524.375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.49369558691978455, "epoch": 0.3014705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.009392338618878006, "kl": 0.004467587452381849, "learning_rate": 9.959349593495935e-07, "loss": 0.0, "num_tokens": 10585839.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6007341146469116, "sampling/importance_sampling_ratio/mean": 1.000148892402649, "sampling/importance_sampling_ratio/min": 0.6233722567558289, "sampling/sampling_logp_difference/max": 0.4726114273071289, "sampling/sampling_logp_difference/mean": 0.015174762345850468, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 342.6875, "completions/mean_terminated_length": 342.6875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.24563083052635193, "epoch": 0.30269607843137253, "frac_reward_zero_std": 1.0, "grad_norm": 0.011353673244639909, "kl": 0.003573143156245351, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 10623067.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.375269889831543, "sampling/importance_sampling_ratio/mean": 1.0000427961349487, "sampling/importance_sampling_ratio/min": 0.6933198571205139, "sampling/sampling_logp_difference/max": 0.36626386642456055, "sampling/sampling_logp_difference/mean": 0.0092755276709795, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 286.40625, "completions/mean_terminated_length": 286.40625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.4605855643749237, "epoch": 0.30392156862745096, "frac_reward_zero_std": 1.0, "grad_norm": 0.019777025232276254, "kl": 0.007960245013237, "learning_rate": 9.99999492515838e-07, "loss": 0.0001, "num_tokens": 10658917.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5835540294647217, "sampling/importance_sampling_ratio/mean": 1.0006576776504517, "sampling/importance_sampling_ratio/min": 0.626899242401123, "sampling/sampling_logp_difference/max": 0.46696949005126953, "sampling/sampling_logp_difference/mean": 0.016132820397615433, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1577.0, "completions/max_terminated_length": 1577.0, "completions/mean_length": 601.921875, "completions/mean_terminated_length": 601.921875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.535902738571167, "epoch": 0.30514705882352944, "frac_reward_zero_std": 0.5, "grad_norm": 0.6768067665084555, "kl": 0.0070672365836799145, "learning_rate": 9.99997970064382e-07, "loss": 0.0647, "num_tokens": 10718304.0, "reward": 0.3125, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.4967761039733887, "sampling/importance_sampling_ratio/mean": 0.9999873638153076, "sampling/importance_sampling_ratio/min": 0.24174155294895172, "sampling/sampling_logp_difference/max": 1.4198861122131348, "sampling/sampling_logp_difference/mean": 0.015520536340773106, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 769.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 402.59375, "completions/mean_terminated_length": 402.59375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.47650665044784546, "epoch": 0.30637254901960786, "frac_reward_zero_std": 1.0, "grad_norm": 0.011178878383545532, "kl": 0.005233863368630409, "learning_rate": 9.999954326487227e-07, "loss": 0.0, "num_tokens": 10759142.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4148826599121094, "sampling/importance_sampling_ratio/mean": 1.0001262426376343, "sampling/importance_sampling_ratio/min": 0.564049243927002, "sampling/sampling_logp_difference/max": 0.5726137161254883, "sampling/sampling_logp_difference/mean": 0.014487143605947495, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1088.0, "completions/max_terminated_length": 1088.0, "completions/mean_length": 384.03125, "completions/mean_terminated_length": 384.03125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.453128457069397, "epoch": 0.3075980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.5717424276865352, "kl": 0.006018123589456081, "learning_rate": 9.999918802740106e-07, "loss": -0.0102, "num_tokens": 10796104.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.3331857919692993, "sampling/importance_sampling_ratio/mean": 0.9996961355209351, "sampling/importance_sampling_ratio/min": 0.5539204478263855, "sampling/sampling_logp_difference/max": 0.5907341241836548, "sampling/sampling_logp_difference/mean": 0.014053301885724068, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 252.34375, "completions/mean_terminated_length": 252.34375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.38080328702926636, "epoch": 0.3088235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.7794919111906381, "kl": 0.007688506506383419, "learning_rate": 9.999873129474573e-07, "loss": -0.0014, "num_tokens": 10831662.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.343520998954773, "sampling/importance_sampling_ratio/mean": 1.0003948211669922, "sampling/importance_sampling_ratio/min": 0.6174754500389099, "sampling/sampling_logp_difference/max": 0.4821159839630127, "sampling/sampling_logp_difference/mean": 0.013652031309902668, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 931.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 404.25, "completions/mean_terminated_length": 404.25, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.5724977850914001, "epoch": 0.31004901960784315, "frac_reward_zero_std": 0.5, "grad_norm": 0.7941292337321143, "kl": 0.009670319966971874, "learning_rate": 9.999817306783336e-07, "loss": -0.0024, "num_tokens": 10872622.0, "reward": 0.46875, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5071560144424438, "sampling/importance_sampling_ratio/mean": 1.0000457763671875, "sampling/importance_sampling_ratio/min": 0.6962995529174805, "sampling/sampling_logp_difference/max": 0.41022443771362305, "sampling/sampling_logp_difference/mean": 0.016572710126638412, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 272.453125, "completions/mean_terminated_length": 272.453125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.42835021018981934, "epoch": 0.3112745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.017640777709851146, "kl": 0.008064229041337967, "learning_rate": 9.999751334779714e-07, "loss": 0.0001, "num_tokens": 10904635.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5810751914978027, "sampling/importance_sampling_ratio/mean": 1.000704288482666, "sampling/importance_sampling_ratio/min": 0.6547352075576782, "sampling/sampling_logp_difference/max": 0.45810508728027344, "sampling/sampling_logp_difference/mean": 0.015917841345071793, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1083.0, "completions/max_terminated_length": 1083.0, "completions/mean_length": 391.375, "completions/mean_terminated_length": 391.375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.4366636872291565, "epoch": 0.3125, "frac_reward_zero_std": 1.0, "grad_norm": 0.010154372449846932, "kl": 0.0052803014405071735, "learning_rate": 9.999675213597626e-07, "loss": 0.0, "num_tokens": 10949027.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000762939453125, "sampling/importance_sampling_ratio/min": 0.6637117266654968, "sampling/sampling_logp_difference/max": 0.7490854263305664, "sampling/sampling_logp_difference/mean": 0.01449318416416645, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 896.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 319.0625, "completions/mean_terminated_length": 319.0625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.5140831470489502, "epoch": 0.3137254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.018720589238457094, "kl": 0.009047593921422958, "learning_rate": 9.999588943391595e-07, "loss": 0.0001, "num_tokens": 10987399.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7340036630630493, "sampling/importance_sampling_ratio/mean": 0.999995231628418, "sampling/importance_sampling_ratio/min": 0.6396996378898621, "sampling/sampling_logp_difference/max": 0.5504329204559326, "sampling/sampling_logp_difference/mean": 0.01787625066936016, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 369.359375, "completions/mean_terminated_length": 369.359375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.526248574256897, "epoch": 0.31495098039215685, "frac_reward_zero_std": 0.5, "grad_norm": 0.9687214128636437, "kl": 0.007176979444921017, "learning_rate": 9.999492524336742e-07, "loss": 0.0255, "num_tokens": 11025342.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5873112678527832, "sampling/importance_sampling_ratio/mean": 0.9999411106109619, "sampling/importance_sampling_ratio/min": 0.4597007930278778, "sampling/sampling_logp_difference/max": 0.777179479598999, "sampling/sampling_logp_difference/mean": 0.015837356448173523, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 300.921875, "completions/mean_terminated_length": 300.921875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.472340852022171, "epoch": 0.3161764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.6263701161670777, "kl": 0.010344721376895905, "learning_rate": 9.999385956628792e-07, "loss": -0.0058, "num_tokens": 11059625.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.43520188331604, "sampling/importance_sampling_ratio/mean": 0.9999966621398926, "sampling/importance_sampling_ratio/min": 0.6100949645042419, "sampling/sampling_logp_difference/max": 0.494140625, "sampling/sampling_logp_difference/mean": 0.016531987115740776, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1362.0, "completions/max_terminated_length": 1362.0, "completions/mean_length": 426.96875, "completions/mean_terminated_length": 426.96875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.45004984736442566, "epoch": 0.3174019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.5553459110402027, "kl": 0.006115212570875883, "learning_rate": 9.999269240484069e-07, "loss": -0.0408, "num_tokens": 11104807.0, "reward": -0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000231146812439, "sampling/importance_sampling_ratio/min": 0.6524202823638916, "sampling/sampling_logp_difference/max": 0.7963438034057617, "sampling/sampling_logp_difference/mean": 0.0143682099878788, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1234.0, "completions/max_terminated_length": 1234.0, "completions/mean_length": 371.359375, "completions/mean_terminated_length": 371.359375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.3914976716041565, "epoch": 0.31862745098039214, "frac_reward_zero_std": 1.0, "grad_norm": 0.013741072302318604, "kl": 0.00652517331764102, "learning_rate": 9.999142376139503e-07, "loss": 0.0001, "num_tokens": 11148942.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5181559324264526, "sampling/importance_sampling_ratio/mean": 0.9999802112579346, "sampling/importance_sampling_ratio/min": 0.5068036913871765, "sampling/sampling_logp_difference/max": 0.6796314716339111, "sampling/sampling_logp_difference/mean": 0.014423957094550133, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 273.5625, "completions/mean_terminated_length": 273.5625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.3968122601509094, "epoch": 0.31985294117647056, "frac_reward_zero_std": 1.0, "grad_norm": 0.01582753561268836, "kl": 0.007336938753724098, "learning_rate": 9.999005363852617e-07, "loss": 0.0001, "num_tokens": 11182306.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6110678911209106, "sampling/importance_sampling_ratio/mean": 1.0002583265304565, "sampling/importance_sampling_ratio/min": 0.239310622215271, "sampling/sampling_logp_difference/max": 1.429992914199829, "sampling/sampling_logp_difference/mean": 0.014425508677959442, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 385.703125, "completions/mean_terminated_length": 385.703125, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.4064733684062958, "epoch": 0.32107843137254904, "frac_reward_zero_std": 1.0, "grad_norm": 0.012096501295540938, "kl": 0.00552629679441452, "learning_rate": 9.99885820390154e-07, "loss": 0.0001, "num_tokens": 11226111.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8356996774673462, "sampling/importance_sampling_ratio/mean": 0.9998568892478943, "sampling/importance_sampling_ratio/min": 0.5389136075973511, "sampling/sampling_logp_difference/max": 0.6182000637054443, "sampling/sampling_logp_difference/mean": 0.013913342729210854, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 300.125, "completions/mean_terminated_length": 300.125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.5236982703208923, "epoch": 0.32230392156862747, "frac_reward_zero_std": 0.75, "grad_norm": 0.8734390429342446, "kl": 0.009163947775959969, "learning_rate": 9.998700896584995e-07, "loss": 0.0036, "num_tokens": 11265607.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.5041520595550537, "sampling/importance_sampling_ratio/mean": 0.9998224973678589, "sampling/importance_sampling_ratio/min": 0.2653282880783081, "sampling/sampling_logp_difference/max": 1.3267874717712402, "sampling/sampling_logp_difference/mean": 0.01675572618842125, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 929.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 451.484375, "completions/mean_terminated_length": 451.484375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.4946002960205078, "epoch": 0.3235294117647059, "frac_reward_zero_std": 0.75, "grad_norm": 0.6585176413597268, "kl": 0.006031869910657406, "learning_rate": 9.998533442222308e-07, "loss": 0.0178, "num_tokens": 11313734.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5725140571594238, "sampling/importance_sampling_ratio/mean": 0.999994158744812, "sampling/importance_sampling_ratio/min": 0.5927926301956177, "sampling/sampling_logp_difference/max": 0.5229105949401855, "sampling/sampling_logp_difference/mean": 0.01599842496216297, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 326.125, "completions/mean_terminated_length": 326.125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.41472047567367554, "epoch": 0.3247549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 0.5438304974537151, "kl": 0.010181521065533161, "learning_rate": 9.9983558411534e-07, "loss": -0.0015, "num_tokens": 11349854.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.4288227558135986, "sampling/importance_sampling_ratio/mean": 0.9998899698257446, "sampling/importance_sampling_ratio/min": 0.6207209825515747, "sampling/sampling_logp_difference/max": 0.47687363624572754, "sampling/sampling_logp_difference/mean": 0.013577282428741455, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 418.90625, "completions/mean_terminated_length": 418.90625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.4121779799461365, "epoch": 0.32598039215686275, "frac_reward_zero_std": 1.0, "grad_norm": 0.010094143979247046, "kl": 0.004873153753578663, "learning_rate": 9.99816809373879e-07, "loss": 0.0, "num_tokens": 11396952.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8681329488754272, "sampling/importance_sampling_ratio/mean": 1.0000123977661133, "sampling/importance_sampling_ratio/min": 0.6142473816871643, "sampling/sampling_logp_difference/max": 0.6249395608901978, "sampling/sampling_logp_difference/mean": 0.012942689470946789, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1294.0, "completions/max_terminated_length": 1294.0, "completions/mean_length": 433.140625, "completions/mean_terminated_length": 433.140625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.5197969079017639, "epoch": 0.3272058823529412, "frac_reward_zero_std": 0.75, "grad_norm": 0.5273553780239081, "kl": 0.006781969219446182, "learning_rate": 9.99797020035959e-07, "loss": -0.0195, "num_tokens": 11444097.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.5467320680618286, "sampling/importance_sampling_ratio/mean": 1.0002578496932983, "sampling/importance_sampling_ratio/min": 0.6087985634803772, "sampling/sampling_logp_difference/max": 0.49626779556274414, "sampling/sampling_logp_difference/mean": 0.01568402349948883, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 420.671875, "completions/mean_terminated_length": 420.671875, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.39038383960723877, "epoch": 0.3284313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.7304668528018662, "kl": 0.006473686080425978, "learning_rate": 9.997762161417517e-07, "loss": 0.0746, "num_tokens": 11488716.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6158193349838257, "sampling/importance_sampling_ratio/mean": 1.0000941753387451, "sampling/importance_sampling_ratio/min": 0.6392062306404114, "sampling/sampling_logp_difference/max": 0.4798421859741211, "sampling/sampling_logp_difference/mean": 0.013103759847581387, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 478.34375, "completions/mean_terminated_length": 478.34375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.3793662488460541, "epoch": 0.32965686274509803, "frac_reward_zero_std": 1.0, "grad_norm": 0.012110913704401148, "kl": 0.005645375698804855, "learning_rate": 9.997543977334873e-07, "loss": 0.0001, "num_tokens": 11544754.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4719692468643188, "sampling/importance_sampling_ratio/mean": 0.9999769926071167, "sampling/importance_sampling_ratio/min": 0.6105214953422546, "sampling/sampling_logp_difference/max": 0.4934418201446533, "sampling/sampling_logp_difference/mean": 0.01258816383779049, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 742.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 335.390625, "completions/mean_terminated_length": 335.390625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.5340389609336853, "epoch": 0.33088235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.021240243532826324, "kl": 0.00902465358376503, "learning_rate": 9.99731564855456e-07, "loss": 0.0001, "num_tokens": 11582891.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.408657431602478, "sampling/importance_sampling_ratio/mean": 0.9997990131378174, "sampling/importance_sampling_ratio/min": 0.6144591569900513, "sampling/sampling_logp_difference/max": 0.4870128631591797, "sampling/sampling_logp_difference/mean": 0.01785307563841343, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 905.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 460.625, "completions/mean_terminated_length": 460.625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.3157837986946106, "epoch": 0.3321078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.011316478071849316, "kl": 0.005926870740950108, "learning_rate": 9.997077175540066e-07, "loss": 0.0, "num_tokens": 11631859.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999547004699707, "sampling/importance_sampling_ratio/min": 0.5944328308105469, "sampling/sampling_logp_difference/max": 0.70906662940979, "sampling/sampling_logp_difference/mean": 0.011131072416901588, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 366.046875, "completions/mean_terminated_length": 366.046875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.37685471773147583, "epoch": 0.3333333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.012293792161516584, "kl": 0.006027653813362122, "learning_rate": 9.996828558775485e-07, "loss": 0.0001, "num_tokens": 11676214.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.7520934343338013, "sampling/importance_sampling_ratio/mean": 0.9994595050811768, "sampling/importance_sampling_ratio/min": 0.5039157271385193, "sampling/sampling_logp_difference/max": 0.6853463649749756, "sampling/sampling_logp_difference/mean": 0.012954109348356724, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 332.265625, "completions/mean_terminated_length": 332.265625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.4931579828262329, "epoch": 0.33455882352941174, "frac_reward_zero_std": 0.75, "grad_norm": 0.6775020074990198, "kl": 0.012495461851358414, "learning_rate": 9.996569798765487e-07, "loss": 0.0843, "num_tokens": 11712199.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5486825704574585, "sampling/importance_sampling_ratio/mean": 0.9999171495437622, "sampling/importance_sampling_ratio/min": 0.6463443636894226, "sampling/sampling_logp_difference/max": 0.4374046325683594, "sampling/sampling_logp_difference/mean": 0.016278326511383057, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 324.65625, "completions/mean_terminated_length": 324.65625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.4076008200645447, "epoch": 0.33578431372549017, "frac_reward_zero_std": 0.75, "grad_norm": 0.6972937947780689, "kl": 0.008574405685067177, "learning_rate": 9.996300896035338e-07, "loss": -0.0296, "num_tokens": 11747825.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4543428421020508, "sampling/importance_sampling_ratio/mean": 1.000024437904358, "sampling/importance_sampling_ratio/min": 0.6813722848892212, "sampling/sampling_logp_difference/max": 0.38364648818969727, "sampling/sampling_logp_difference/mean": 0.014667466282844543, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 372.640625, "completions/mean_terminated_length": 372.640625, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.3959220051765442, "epoch": 0.33700980392156865, "frac_reward_zero_std": 1.0, "grad_norm": 0.015326209630199934, "kl": 0.006399261299520731, "learning_rate": 9.996021851130896e-07, "loss": 0.0001, "num_tokens": 11789722.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6009925603866577, "sampling/importance_sampling_ratio/mean": 0.9998877644538879, "sampling/importance_sampling_ratio/min": 0.19343671202659607, "sampling/sampling_logp_difference/max": 1.6428048610687256, "sampling/sampling_logp_difference/mean": 0.013061312958598137, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1132.0, "completions/max_terminated_length": 1132.0, "completions/mean_length": 356.8125, "completions/mean_terminated_length": 356.8125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.4652358293533325, "epoch": 0.3382352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.029056647558311166, "kl": 0.00901811383664608, "learning_rate": 9.995732664618603e-07, "loss": 0.0001, "num_tokens": 11845022.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998490810394287, "sampling/importance_sampling_ratio/min": 0.4370385706424713, "sampling/sampling_logp_difference/max": 0.8277338743209839, "sampling/sampling_logp_difference/mean": 0.015659213066101074, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2330.0, "completions/max_terminated_length": 2330.0, "completions/mean_length": 528.28125, "completions/mean_terminated_length": 528.28125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.458290696144104, "epoch": 0.3394607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 0.3987842070515677, "kl": 0.007810407318174839, "learning_rate": 9.99543333708549e-07, "loss": -0.001, "num_tokens": 11895648.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.7985732555389404, "sampling/importance_sampling_ratio/mean": 0.9999399781227112, "sampling/importance_sampling_ratio/min": 0.40161171555519104, "sampling/sampling_logp_difference/max": 0.9122695922851562, "sampling/sampling_logp_difference/mean": 0.015011485666036606, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 301.53125, "completions/mean_terminated_length": 301.53125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.43827176094055176, "epoch": 0.34068627450980393, "frac_reward_zero_std": 1.0, "grad_norm": 0.023853019802949627, "kl": 0.010565811768174171, "learning_rate": 9.995123869139176e-07, "loss": 0.0001, "num_tokens": 11928370.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4364597797393799, "sampling/importance_sampling_ratio/mean": 0.9996968507766724, "sampling/importance_sampling_ratio/min": 0.5515589714050293, "sampling/sampling_logp_difference/max": 0.5950064659118652, "sampling/sampling_logp_difference/mean": 0.016701238229870796, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 453.453125, "completions/mean_terminated_length": 453.453125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.538509726524353, "epoch": 0.34191176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.717002517937967, "kl": 0.009718159213662148, "learning_rate": 9.994804261407854e-07, "loss": 0.0418, "num_tokens": 11983231.0, "reward": 0.5625, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6307955980300903, "sampling/importance_sampling_ratio/mean": 1.0000951290130615, "sampling/importance_sampling_ratio/min": 0.6374204754829407, "sampling/sampling_logp_difference/max": 0.48906803131103516, "sampling/sampling_logp_difference/mean": 0.016552571207284927, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 272.5, "completions/mean_terminated_length": 272.5, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.4508207440376282, "epoch": 0.3431372549019608, "frac_reward_zero_std": 0.75, "grad_norm": 0.668154265684309, "kl": 0.00934848003089428, "learning_rate": 9.994474514540312e-07, "loss": 0.0092, "num_tokens": 12024191.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6463180780410767, "sampling/importance_sampling_ratio/mean": 1.0004639625549316, "sampling/importance_sampling_ratio/min": 0.6137672066688538, "sampling/sampling_logp_difference/max": 0.49854135513305664, "sampling/sampling_logp_difference/mean": 0.016092857345938683, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1731.0, "completions/max_terminated_length": 1731.0, "completions/mean_length": 455.421875, "completions/mean_terminated_length": 455.421875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.4277341663837433, "epoch": 0.3443627450980392, "frac_reward_zero_std": 0.75, "grad_norm": 0.5814743469768969, "kl": 0.008012944832444191, "learning_rate": 9.994134629205917e-07, "loss": -0.0093, "num_tokens": 12070426.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4348182678222656, "sampling/importance_sampling_ratio/mean": 0.9996310472488403, "sampling/importance_sampling_ratio/min": 0.380202054977417, "sampling/sampling_logp_difference/max": 0.9670524597167969, "sampling/sampling_logp_difference/mean": 0.014142225496470928, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 391.515625, "completions/mean_terminated_length": 391.515625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.34291574358940125, "epoch": 0.34558823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.027473693006806563, "kl": 0.008133836090564728, "learning_rate": 9.99378460609461e-07, "loss": 0.0001, "num_tokens": 12110027.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8718515634536743, "sampling/importance_sampling_ratio/mean": 0.9995509386062622, "sampling/importance_sampling_ratio/min": 0.5338816046714783, "sampling/sampling_logp_difference/max": 0.6275811195373535, "sampling/sampling_logp_difference/mean": 0.012578390538692474, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1110.0, "completions/max_terminated_length": 1110.0, "completions/mean_length": 448.828125, "completions/mean_terminated_length": 448.828125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.5282679796218872, "epoch": 0.34681372549019607, "frac_reward_zero_std": 0.75, "grad_norm": 0.6448287955350547, "kl": 0.009138301014900208, "learning_rate": 9.993424445916922e-07, "loss": 0.0192, "num_tokens": 12156160.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.2968796491622925, "sampling/importance_sampling_ratio/mean": 0.9994966387748718, "sampling/importance_sampling_ratio/min": 0.5780472755432129, "sampling/sampling_logp_difference/max": 0.5480996370315552, "sampling/sampling_logp_difference/mean": 0.01567963697016239, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 298.6875, "completions/mean_terminated_length": 298.6875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.43923360109329224, "epoch": 0.3480392156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.02400926405736424, "kl": 0.009320631623268127, "learning_rate": 9.993054149403949e-07, "loss": 0.0001, "num_tokens": 12191996.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5979321002960205, "sampling/importance_sampling_ratio/mean": 1.0002543926239014, "sampling/importance_sampling_ratio/min": 0.6147794723510742, "sampling/sampling_logp_difference/max": 0.48649168014526367, "sampling/sampling_logp_difference/mean": 0.016528140753507614, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 379.921875, "completions/mean_terminated_length": 379.921875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.41807466745376587, "epoch": 0.3492647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.777122448825079, "kl": 0.009721128270030022, "learning_rate": 9.992673717307372e-07, "loss": 0.0109, "num_tokens": 12233063.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998581409454346, "sampling/importance_sampling_ratio/min": 0.6313745379447937, "sampling/sampling_logp_difference/max": 0.8771946430206299, "sampling/sampling_logp_difference/mean": 0.014585594646632671, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 974.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 458.59375, "completions/mean_terminated_length": 458.59375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.513215184211731, "epoch": 0.35049019607843135, "frac_reward_zero_std": 0.75, "grad_norm": 0.405742625010024, "kl": 0.006400286685675383, "learning_rate": 9.992283150399446e-07, "loss": -0.0112, "num_tokens": 12282189.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5342351198196411, "sampling/importance_sampling_ratio/mean": 1.0003541707992554, "sampling/importance_sampling_ratio/min": 0.6520817279815674, "sampling/sampling_logp_difference/max": 0.42803192138671875, "sampling/sampling_logp_difference/mean": 0.015352083370089531, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 256.921875, "completions/mean_terminated_length": 256.921875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3392180800437927, "epoch": 0.35171568627450983, "frac_reward_zero_std": 1.0, "grad_norm": 0.02842084079987757, "kl": 0.010030429810285568, "learning_rate": 9.991882449472994e-07, "loss": 0.0001, "num_tokens": 12312520.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.699145793914795, "sampling/importance_sampling_ratio/mean": 0.99982750415802, "sampling/importance_sampling_ratio/min": 0.6013734340667725, "sampling/sampling_logp_difference/max": 0.530125617980957, "sampling/sampling_logp_difference/mean": 0.01373203657567501, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1554.0, "completions/max_terminated_length": 1554.0, "completions/mean_length": 434.484375, "completions/mean_terminated_length": 434.484375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.32547104358673096, "epoch": 0.35294117647058826, "frac_reward_zero_std": 1.0, "grad_norm": 0.011262596707021056, "kl": 0.004784740041941404, "learning_rate": 9.991471615341415e-07, "loss": 0.0, "num_tokens": 12358807.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4279887676239014, "sampling/importance_sampling_ratio/mean": 1.0000362396240234, "sampling/importance_sampling_ratio/min": 0.68781977891922, "sampling/sampling_logp_difference/max": 0.37422847747802734, "sampling/sampling_logp_difference/mean": 0.010886838659644127, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1191.0, "completions/max_terminated_length": 1191.0, "completions/mean_length": 475.65625, "completions/mean_terminated_length": 475.65625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.36578553915023804, "epoch": 0.3541666666666667, "frac_reward_zero_std": 0.75, "grad_norm": 0.5010226839275513, "kl": 0.005711100529879332, "learning_rate": 9.991050648838675e-07, "loss": -0.0218, "num_tokens": 12414993.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002543926239014, "sampling/importance_sampling_ratio/min": 0.5022750496864319, "sampling/sampling_logp_difference/max": 0.7195267677307129, "sampling/sampling_logp_difference/mean": 0.012987583875656128, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 352.25, "completions/mean_terminated_length": 352.25, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.45125269889831543, "epoch": 0.3553921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.712283233106377, "kl": 0.011799456551671028, "learning_rate": 9.990619550819312e-07, "loss": -0.0191, "num_tokens": 12458193.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.617409586906433, "sampling/importance_sampling_ratio/mean": 1.000082015991211, "sampling/importance_sampling_ratio/min": 0.6459353566169739, "sampling/sampling_logp_difference/max": 0.48082590103149414, "sampling/sampling_logp_difference/mean": 0.016624558717012405, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 369.28125, "completions/mean_terminated_length": 369.28125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.3470129072666168, "epoch": 0.35661764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.017589171772455517, "kl": 0.007577105890959501, "learning_rate": 9.990178322158424e-07, "loss": 0.0001, "num_tokens": 12497923.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8727912902832031, "sampling/importance_sampling_ratio/mean": 0.9999344348907471, "sampling/importance_sampling_ratio/min": 0.5515694618225098, "sampling/sampling_logp_difference/max": 0.6274299621582031, "sampling/sampling_logp_difference/mean": 0.012849211692810059, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1035.0, "completions/max_terminated_length": 1035.0, "completions/mean_length": 369.4375, "completions/mean_terminated_length": 369.4375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.40876254439353943, "epoch": 0.35784313725490197, "frac_reward_zero_std": 1.0, "grad_norm": 0.018446684304350057, "kl": 0.0077904583886265755, "learning_rate": 9.989726963751682e-07, "loss": 0.0001, "num_tokens": 12543583.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4480196237564087, "sampling/importance_sampling_ratio/mean": 1.0001870393753052, "sampling/importance_sampling_ratio/min": 0.693528950214386, "sampling/sampling_logp_difference/max": 0.3701968193054199, "sampling/sampling_logp_difference/mean": 0.014104874804615974, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 431.4375, "completions/mean_terminated_length": 431.4375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.325234591960907, "epoch": 0.3590686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.011985812900248798, "kl": 0.005029605235904455, "learning_rate": 9.989265476515309e-07, "loss": 0.0, "num_tokens": 12591163.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.369502305984497, "sampling/importance_sampling_ratio/mean": 0.9998431205749512, "sampling/importance_sampling_ratio/min": 0.5833713412284851, "sampling/sampling_logp_difference/max": 0.5389313697814941, "sampling/sampling_logp_difference/mean": 0.010473090223968029, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1859.0, "completions/max_terminated_length": 1859.0, "completions/mean_length": 598.09375, "completions/mean_terminated_length": 598.09375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.4085462689399719, "epoch": 0.3602941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.013339892059344093, "kl": 0.006725154351443052, "learning_rate": 9.9887938613861e-07, "loss": 0.0001, "num_tokens": 12650897.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6710412502288818, "sampling/importance_sampling_ratio/mean": 1.0000739097595215, "sampling/importance_sampling_ratio/min": 0.17536543309688568, "sampling/sampling_logp_difference/max": 1.7408833503723145, "sampling/sampling_logp_difference/mean": 0.014114665798842907, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 455.0, "completions/mean_terminated_length": 455.0, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "entropy": 0.5367239713668823, "epoch": 0.36151960784313725, "frac_reward_zero_std": 1.0, "grad_norm": 0.02031050884430324, "kl": 0.008807896636426449, "learning_rate": 9.988312119321402e-07, "loss": 0.0001, "num_tokens": 12694993.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4024665355682373, "sampling/importance_sampling_ratio/mean": 0.9997748732566833, "sampling/importance_sampling_ratio/min": 0.6492209434509277, "sampling/sampling_logp_difference/max": 0.431982159614563, "sampling/sampling_logp_difference/mean": 0.016321517527103424, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 401.53125, "completions/mean_terminated_length": 401.53125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.5166581273078918, "epoch": 0.3627450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.5668631124606451, "kl": 0.009334176778793335, "learning_rate": 9.98782025129912e-07, "loss": -0.0171, "num_tokens": 12737603.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.9516141414642334, "sampling/importance_sampling_ratio/mean": 1.0003881454467773, "sampling/importance_sampling_ratio/min": 0.6117785573005676, "sampling/sampling_logp_difference/max": 0.6686568260192871, "sampling/sampling_logp_difference/mean": 0.015941839665174484, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 353.65625, "completions/mean_terminated_length": 353.65625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.363652765750885, "epoch": 0.3639705882352941, "frac_reward_zero_std": 0.75, "grad_norm": 0.5981564902565938, "kl": 0.008375367149710655, "learning_rate": 9.987318258317715e-07, "loss": -0.0047, "num_tokens": 12775469.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.686222791671753, "sampling/importance_sampling_ratio/mean": 1.0000629425048828, "sampling/importance_sampling_ratio/min": 0.6828455328941345, "sampling/sampling_logp_difference/max": 0.5224909782409668, "sampling/sampling_logp_difference/mean": 0.012904006987810135, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 348.28125, "completions/mean_terminated_length": 348.28125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.5027517676353455, "epoch": 0.36519607843137253, "frac_reward_zero_std": 0.5, "grad_norm": 1.0935188356421421, "kl": 0.011659527197480202, "learning_rate": 9.986806141396205e-07, "loss": 0.0194, "num_tokens": 12815103.0, "reward": 0.5625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.3952319622039795, "sampling/importance_sampling_ratio/mean": 1.000213384628296, "sampling/importance_sampling_ratio/min": 0.15417538583278656, "sampling/sampling_logp_difference/max": 1.8696644306182861, "sampling/sampling_logp_difference/mean": 0.016838114708662033, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 331.15625, "completions/mean_terminated_length": 331.15625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.43869608640670776, "epoch": 0.36642156862745096, "frac_reward_zero_std": 1.0, "grad_norm": 0.02166812488406378, "kl": 0.009893905371427536, "learning_rate": 9.986283901574149e-07, "loss": 0.0001, "num_tokens": 12852441.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5559877157211304, "sampling/importance_sampling_ratio/mean": 0.9995791912078857, "sampling/importance_sampling_ratio/min": 0.48723095655441284, "sampling/sampling_logp_difference/max": 0.7190170288085938, "sampling/sampling_logp_difference/mean": 0.01529137697070837, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 355.15625, "completions/mean_terminated_length": 355.15625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.46426403522491455, "epoch": 0.36764705882352944, "frac_reward_zero_std": 0.75, "grad_norm": 0.5521686596420503, "kl": 0.007922716438770294, "learning_rate": 9.985751539911664e-07, "loss": 0.0007, "num_tokens": 12893379.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.3670470714569092, "sampling/importance_sampling_ratio/mean": 1.0002626180648804, "sampling/importance_sampling_ratio/min": 0.6171382665634155, "sampling/sampling_logp_difference/max": 0.4826622009277344, "sampling/sampling_logp_difference/mean": 0.014585580676794052, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 917.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 421.9375, "completions/mean_terminated_length": 421.9375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "entropy": 0.4536247253417969, "epoch": 0.36887254901960786, "frac_reward_zero_std": 0.75, "grad_norm": 0.6173638199456861, "kl": 0.008427445776760578, "learning_rate": 9.985209057489408e-07, "loss": 0.0042, "num_tokens": 12939119.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6288907527923584, "sampling/importance_sampling_ratio/mean": 1.0000323057174683, "sampling/importance_sampling_ratio/min": 0.5026019215583801, "sampling/sampling_logp_difference/max": 0.6879568099975586, "sampling/sampling_logp_difference/mean": 0.014504168182611465, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1331.0, "completions/max_terminated_length": 1331.0, "completions/mean_length": 492.15625, "completions/mean_terminated_length": 492.15625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.39561834931373596, "epoch": 0.3700980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.5630038274687309, "kl": 0.007504451088607311, "learning_rate": 9.98465645540859e-07, "loss": -0.0302, "num_tokens": 12987097.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5578343868255615, "sampling/importance_sampling_ratio/mean": 1.000136137008667, "sampling/importance_sampling_ratio/min": 0.5727870464324951, "sampling/sampling_logp_difference/max": 0.5572413206100464, "sampling/sampling_logp_difference/mean": 0.012591574341058731, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 429.640625, "completions/mean_terminated_length": 429.640625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.3929044008255005, "epoch": 0.3713235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.45951732983935806, "kl": 0.009980730712413788, "learning_rate": 9.984093734790954e-07, "loss": 0.001, "num_tokens": 13034818.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.507156252861023, "sampling/importance_sampling_ratio/mean": 0.9996682405471802, "sampling/importance_sampling_ratio/min": 0.4881964325904846, "sampling/sampling_logp_difference/max": 0.7170374393463135, "sampling/sampling_logp_difference/mean": 0.014606675133109093, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1734.0, "completions/max_terminated_length": 1734.0, "completions/mean_length": 526.828125, "completions/mean_terminated_length": 526.828125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.4194396734237671, "epoch": 0.37254901960784315, "frac_reward_zero_std": 1.0, "grad_norm": 0.016533251701879432, "kl": 0.008217718452215195, "learning_rate": 9.983520896778788e-07, "loss": 0.0001, "num_tokens": 13093415.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5951775312423706, "sampling/importance_sampling_ratio/mean": 1.0000388622283936, "sampling/importance_sampling_ratio/min": 0.6205462217330933, "sampling/sampling_logp_difference/max": 0.4771552085876465, "sampling/sampling_logp_difference/mean": 0.013638323172926903, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 847.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 296.90625, "completions/mean_terminated_length": 296.90625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.45182517170906067, "epoch": 0.3737745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 0.6925784100250523, "kl": 0.015090974979102612, "learning_rate": 9.982937942534917e-07, "loss": 0.0117, "num_tokens": 13129057.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4265079498291016, "sampling/importance_sampling_ratio/mean": 0.9998557567596436, "sampling/importance_sampling_ratio/min": 0.668265700340271, "sampling/sampling_logp_difference/max": 0.40306949615478516, "sampling/sampling_logp_difference/mean": 0.01587398163974285, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1180.0, "completions/max_terminated_length": 1180.0, "completions/mean_length": 368.046875, "completions/mean_terminated_length": 368.046875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.4087732434272766, "epoch": 0.375, "frac_reward_zero_std": 0.75, "grad_norm": 0.5513238439422787, "kl": 0.011980720795691013, "learning_rate": 9.982344873242701e-07, "loss": 0.0358, "num_tokens": 13168244.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5422356128692627, "sampling/importance_sampling_ratio/mean": 0.9998914003372192, "sampling/importance_sampling_ratio/min": 0.12312961369752884, "sampling/sampling_logp_difference/max": 2.094517707824707, "sampling/sampling_logp_difference/mean": 0.014269829727709293, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 354.25, "completions/mean_terminated_length": 354.25, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.39770442247390747, "epoch": 0.3762254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 0.5384992795771103, "kl": 0.010300671681761742, "learning_rate": 9.981741690106034e-07, "loss": 0.017, "num_tokens": 13210212.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.8809148073196411, "sampling/importance_sampling_ratio/mean": 1.0000584125518799, "sampling/importance_sampling_ratio/min": 0.7107168436050415, "sampling/sampling_logp_difference/max": 0.6317582130432129, "sampling/sampling_logp_difference/mean": 0.013473963364958763, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 403.65625, "completions/mean_terminated_length": 403.65625, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "entropy": 0.29514577984809875, "epoch": 0.37745098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.027147435788758206, "kl": 0.00740927504375577, "learning_rate": 9.981128394349337e-07, "loss": 0.0001, "num_tokens": 13253182.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6930053234100342, "sampling/importance_sampling_ratio/mean": 0.9996542930603027, "sampling/importance_sampling_ratio/min": 0.48819631338119507, "sampling/sampling_logp_difference/max": 0.7170376777648926, "sampling/sampling_logp_difference/mean": 0.011050916276872158, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 409.265625, "completions/mean_terminated_length": 409.265625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.3837679922580719, "epoch": 0.3786764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.021727153461431826, "kl": 0.011208648793399334, "learning_rate": 9.980504987217566e-07, "loss": 0.0001, "num_tokens": 13294239.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6049079895019531, "sampling/importance_sampling_ratio/mean": 1.000239372253418, "sampling/importance_sampling_ratio/min": 0.6173794865608215, "sampling/sampling_logp_difference/max": 0.4822714328765869, "sampling/sampling_logp_difference/mean": 0.013072416186332703, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1452.0, "completions/max_terminated_length": 1452.0, "completions/mean_length": 616.015625, "completions/mean_terminated_length": 616.015625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.3933092951774597, "epoch": 0.3799019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.017538490277218117, "kl": 0.009886260144412518, "learning_rate": 9.979871469976195e-07, "loss": 0.0001, "num_tokens": 13355264.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999862313270569, "sampling/importance_sampling_ratio/min": 0.5150355100631714, "sampling/sampling_logp_difference/max": 0.8242847919464111, "sampling/sampling_logp_difference/mean": 0.013695517554879189, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1691.0, "completions/max_terminated_length": 1691.0, "completions/mean_length": 519.828125, "completions/mean_terminated_length": 519.828125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.38938552141189575, "epoch": 0.38112745098039214, "frac_reward_zero_std": 1.0, "grad_norm": 0.014017354011001466, "kl": 0.008116151206195354, "learning_rate": 9.979227843911224e-07, "loss": 0.0001, "num_tokens": 13410773.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6196061372756958, "sampling/importance_sampling_ratio/mean": 1.0004019737243652, "sampling/importance_sampling_ratio/min": 0.6818975210189819, "sampling/sampling_logp_difference/max": 0.48218297958374023, "sampling/sampling_logp_difference/mean": 0.012811325490474701, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1255.0, "completions/max_terminated_length": 1255.0, "completions/mean_length": 446.109375, "completions/mean_terminated_length": 446.109375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.3657413423061371, "epoch": 0.38235294117647056, "frac_reward_zero_std": 0.75, "grad_norm": 0.5515582528044881, "kl": 0.009820051491260529, "learning_rate": 9.978574110329172e-07, "loss": -0.0157, "num_tokens": 13462412.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.520021915435791, "sampling/importance_sampling_ratio/mean": 1.0001370906829834, "sampling/importance_sampling_ratio/min": 0.6197339296340942, "sampling/sampling_logp_difference/max": 0.47846508026123047, "sampling/sampling_logp_difference/mean": 0.013433972373604774, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2008.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 564.421875, "completions/mean_terminated_length": 564.421875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.3767271935939789, "epoch": 0.38357843137254904, "frac_reward_zero_std": 1.0, "grad_norm": 0.03488140179451417, "kl": 0.01068820059299469, "learning_rate": 9.977910270557078e-07, "loss": 0.0001, "num_tokens": 13520023.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000051498413086, "sampling/importance_sampling_ratio/min": 0.22405000030994415, "sampling/sampling_logp_difference/max": 1.4958860874176025, "sampling/sampling_logp_difference/mean": 0.013243179768323898, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1308.0, "completions/max_terminated_length": 1308.0, "completions/mean_length": 444.8125, "completions/mean_terminated_length": 444.8125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.43546828627586365, "epoch": 0.38480392156862747, "frac_reward_zero_std": 1.0, "grad_norm": 0.018545326680841374, "kl": 0.009344382211565971, "learning_rate": 9.977236325942497e-07, "loss": 0.0001, "num_tokens": 13568955.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4527751207351685, "sampling/importance_sampling_ratio/mean": 1.0003941059112549, "sampling/importance_sampling_ratio/min": 0.39065560698509216, "sampling/sampling_logp_difference/max": 0.9399289488792419, "sampling/sampling_logp_difference/mean": 0.013672100380063057, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1708.0, "completions/max_terminated_length": 1708.0, "completions/mean_length": 527.359375, "completions/mean_terminated_length": 527.359375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.44667309522628784, "epoch": 0.3860294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.020736617878692475, "kl": 0.009554138407111168, "learning_rate": 9.97655227785349e-07, "loss": 0.0001, "num_tokens": 13620002.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4848815202713013, "sampling/importance_sampling_ratio/mean": 0.9998436570167542, "sampling/importance_sampling_ratio/min": 0.5581986308097839, "sampling/sampling_logp_difference/max": 0.5830403566360474, "sampling/sampling_logp_difference/mean": 0.0141693614423275, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1136.0, "completions/max_terminated_length": 1136.0, "completions/mean_length": 481.734375, "completions/mean_terminated_length": 481.734375, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.3508520722389221, "epoch": 0.3872549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.017226762078213817, "kl": 0.008662508800625801, "learning_rate": 9.975858127678633e-07, "loss": 0.0001, "num_tokens": 13670113.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999427199363708, "sampling/importance_sampling_ratio/min": 0.6345277428627014, "sampling/sampling_logp_difference/max": 0.8365395069122314, "sampling/sampling_logp_difference/mean": 0.011945986188948154, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1126.0, "completions/max_terminated_length": 1126.0, "completions/mean_length": 538.9375, "completions/mean_terminated_length": 538.9375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.4329877495765686, "epoch": 0.38848039215686275, "frac_reward_zero_std": 0.75, "grad_norm": 0.37081407817206735, "kl": 0.01127848494797945, "learning_rate": 9.975153876827007e-07, "loss": 0.0317, "num_tokens": 13722445.0, "reward": -0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002152919769287, "sampling/importance_sampling_ratio/min": 0.5477887988090515, "sampling/sampling_logp_difference/max": 1.2426128387451172, "sampling/sampling_logp_difference/mean": 0.014280378818511963, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 340.484375, "completions/mean_terminated_length": 340.484375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.2607770562171936, "epoch": 0.3897058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.022687503125395657, "kl": 0.009989684447646141, "learning_rate": 9.974439526728196e-07, "loss": 0.0001, "num_tokens": 13762780.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4195702075958252, "sampling/importance_sampling_ratio/mean": 0.999373197555542, "sampling/importance_sampling_ratio/min": 0.5479230284690857, "sampling/sampling_logp_difference/max": 0.6016204357147217, "sampling/sampling_logp_difference/mean": 0.0111830560490489, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 351.5625, "completions/mean_terminated_length": 351.5625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.3954874873161316, "epoch": 0.3909313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.025277203019882025, "kl": 0.011732909828424454, "learning_rate": 9.973715078832286e-07, "loss": 0.0001, "num_tokens": 13801264.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5124925374984741, "sampling/importance_sampling_ratio/mean": 0.9995311498641968, "sampling/importance_sampling_ratio/min": 0.5261453986167908, "sampling/sampling_logp_difference/max": 0.6421776413917542, "sampling/sampling_logp_difference/mean": 0.014720329083502293, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 909.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 424.0625, "completions/mean_terminated_length": 424.0625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.31637364625930786, "epoch": 0.39215686274509803, "frac_reward_zero_std": 1.0, "grad_norm": 0.02708067614644354, "kl": 0.012055089697241783, "learning_rate": 9.97298053460986e-07, "loss": 0.0001, "num_tokens": 13847380.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000877380371094, "sampling/importance_sampling_ratio/min": 0.5868601202964783, "sampling/sampling_logp_difference/max": 0.9466497898101807, "sampling/sampling_logp_difference/mean": 0.01237516850233078, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 786.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 303.921875, "completions/mean_terminated_length": 303.921875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.2878584563732147, "epoch": 0.39338235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.028153379970654407, "kl": 0.01241111196577549, "learning_rate": 9.972235895552e-07, "loss": 0.0001, "num_tokens": 13881327.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4330931901931763, "sampling/importance_sampling_ratio/mean": 1.0002539157867432, "sampling/importance_sampling_ratio/min": 0.586326003074646, "sampling/sampling_logp_difference/max": 0.533879280090332, "sampling/sampling_logp_difference/mean": 0.011896773241460323, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 448.015625, "completions/mean_terminated_length": 448.015625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.3956063985824585, "epoch": 0.3946078431372549, "frac_reward_zero_std": 0.5, "grad_norm": 0.7193693821722937, "kl": 0.01758223958313465, "learning_rate": 9.971481163170269e-07, "loss": 0.0579, "num_tokens": 13931280.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6256792545318604, "sampling/importance_sampling_ratio/mean": 0.9999731183052063, "sampling/importance_sampling_ratio/min": 0.6421686410903931, "sampling/sampling_logp_difference/max": 0.48592567443847656, "sampling/sampling_logp_difference/mean": 0.013555377721786499, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 387.96875, "completions/mean_terminated_length": 387.96875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.3299681544303894, "epoch": 0.3958333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.05612541403522332, "kl": 0.013185951858758926, "learning_rate": 9.97071633899673e-07, "loss": 0.0001, "num_tokens": 13971342.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.7520533800125122, "sampling/importance_sampling_ratio/mean": 1.0000653266906738, "sampling/importance_sampling_ratio/min": 0.49098291993141174, "sampling/sampling_logp_difference/max": 0.711345911026001, "sampling/sampling_logp_difference/mean": 0.012901096604764462, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1241.0, "completions/max_terminated_length": 1241.0, "completions/mean_length": 392.078125, "completions/mean_terminated_length": 392.078125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.33490121364593506, "epoch": 0.39705882352941174, "frac_reward_zero_std": 1.0, "grad_norm": 0.026787092489695217, "kl": 0.012691620737314224, "learning_rate": 9.969941424583925e-07, "loss": 0.0001, "num_tokens": 14017683.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6242562532424927, "sampling/importance_sampling_ratio/mean": 0.9999720454216003, "sampling/importance_sampling_ratio/min": 0.6484373807907104, "sampling/sampling_logp_difference/max": 0.4850499629974365, "sampling/sampling_logp_difference/mean": 0.01282031461596489, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 361.5625, "completions/mean_terminated_length": 361.5625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.3535000681877136, "epoch": 0.39828431372549017, "frac_reward_zero_std": 1.0, "grad_norm": 0.02416851638304603, "kl": 0.012164209969341755, "learning_rate": 9.969156421504887e-07, "loss": 0.0001, "num_tokens": 14059783.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5249487161636353, "sampling/importance_sampling_ratio/mean": 0.9996787905693054, "sampling/importance_sampling_ratio/min": 0.3417090177536011, "sampling/sampling_logp_difference/max": 1.0737957954406738, "sampling/sampling_logp_difference/mean": 0.014508511871099472, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 265.421875, "completions/mean_terminated_length": 265.421875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3898376226425171, "epoch": 0.39950980392156865, "frac_reward_zero_std": 0.75, "grad_norm": 0.7840442516300553, "kl": 0.021428652107715607, "learning_rate": 9.968361331353116e-07, "loss": 0.0382, "num_tokens": 14090434.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.378686785697937, "sampling/importance_sampling_ratio/mean": 1.000022292137146, "sampling/importance_sampling_ratio/min": 0.630874514579773, "sampling/sampling_logp_difference/max": 0.4606482982635498, "sampling/sampling_logp_difference/mean": 0.01512196660041809, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 398.9375, "completions/mean_terminated_length": 398.9375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.4144642949104309, "epoch": 0.4007352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.023423452836702243, "kl": 0.0088006891310215, "learning_rate": 9.9675561557426e-07, "loss": 0.0001, "num_tokens": 14136446.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.446859359741211, "sampling/importance_sampling_ratio/mean": 0.9995441436767578, "sampling/importance_sampling_ratio/min": 0.6130678057670593, "sampling/sampling_logp_difference/max": 0.48927974700927734, "sampling/sampling_logp_difference/mean": 0.01478845439851284, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1379.0, "completions/max_terminated_length": 1379.0, "completions/mean_length": 471.34375, "completions/mean_terminated_length": 471.34375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.416068971157074, "epoch": 0.4019607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 0.44642895018585144, "kl": 0.007422855123877525, "learning_rate": 9.966740896307791e-07, "loss": 0.012, "num_tokens": 14186484.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5031661987304688, "sampling/importance_sampling_ratio/mean": 0.9998141527175903, "sampling/importance_sampling_ratio/min": 0.43515825271606445, "sampling/sampling_logp_difference/max": 0.8320455551147461, "sampling/sampling_logp_difference/mean": 0.01243920624256134, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 882.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 357.59375, "completions/mean_terminated_length": 357.59375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.32247310876846313, "epoch": 0.40318627450980393, "frac_reward_zero_std": 1.0, "grad_norm": 0.022105159737606725, "kl": 0.01313629001379013, "learning_rate": 9.965915554703613e-07, "loss": 0.0001, "num_tokens": 14223482.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6528152227401733, "sampling/importance_sampling_ratio/mean": 1.0001882314682007, "sampling/importance_sampling_ratio/min": 0.3325205147266388, "sampling/sampling_logp_difference/max": 1.1010537147521973, "sampling/sampling_logp_difference/mean": 0.012866199016571045, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 321.203125, "completions/mean_terminated_length": 321.203125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.3625539541244507, "epoch": 0.40441176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.01642665853718244, "kl": 0.008433051407337189, "learning_rate": 9.965080132605461e-07, "loss": 0.0001, "num_tokens": 14261943.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5822275876998901, "sampling/importance_sampling_ratio/mean": 1.0000501871109009, "sampling/importance_sampling_ratio/min": 0.6134088039398193, "sampling/sampling_logp_difference/max": 0.4887237548828125, "sampling/sampling_logp_difference/mean": 0.013902903534471989, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1432.0, "completions/max_terminated_length": 1432.0, "completions/mean_length": 475.203125, "completions/mean_terminated_length": 475.203125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.39955011010169983, "epoch": 0.4056372549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.018077321322305905, "kl": 0.008903050795197487, "learning_rate": 9.964234631709185e-07, "loss": 0.0001, "num_tokens": 14312836.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.586427927017212, "sampling/importance_sampling_ratio/mean": 0.9998592734336853, "sampling/importance_sampling_ratio/min": 0.4448848366737366, "sampling/sampling_logp_difference/max": 0.8099398612976074, "sampling/sampling_logp_difference/mean": 0.013436764478683472, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 790.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 391.09375, "completions/mean_terminated_length": 391.09375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3514891564846039, "epoch": 0.4068627450980392, "frac_reward_zero_std": 1.0, "grad_norm": 0.026476223528054895, "kl": 0.009936319664120674, "learning_rate": 9.963379053731102e-07, "loss": 0.0001, "num_tokens": 14354186.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5117250680923462, "sampling/importance_sampling_ratio/mean": 0.9996531009674072, "sampling/importance_sampling_ratio/min": 0.6575544476509094, "sampling/sampling_logp_difference/max": 0.4192277193069458, "sampling/sampling_logp_difference/mean": 0.01271536760032177, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 305.453125, "completions/mean_terminated_length": 305.453125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.38945114612579346, "epoch": 0.40808823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.03462052591833793, "kl": 0.01252373680472374, "learning_rate": 9.96251340040798e-07, "loss": 0.0001, "num_tokens": 14390807.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5083720684051514, "sampling/importance_sampling_ratio/mean": 1.000170111656189, "sampling/importance_sampling_ratio/min": 0.5318205952644348, "sampling/sampling_logp_difference/max": 0.6314491033554077, "sampling/sampling_logp_difference/mean": 0.015917697921395302, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1667.0, "completions/max_terminated_length": 1667.0, "completions/mean_length": 405.609375, "completions/mean_terminated_length": 405.609375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.3895723819732666, "epoch": 0.40931372549019607, "frac_reward_zero_std": 1.0, "grad_norm": 0.016105757376146217, "kl": 0.006673391908407211, "learning_rate": 9.96163767349704e-07, "loss": 0.0001, "num_tokens": 14437422.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6812361478805542, "sampling/importance_sampling_ratio/mean": 0.9999213814735413, "sampling/importance_sampling_ratio/min": 0.6338549256324768, "sampling/sampling_logp_difference/max": 0.5195293426513672, "sampling/sampling_logp_difference/mean": 0.012851076200604439, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1181.0, "completions/max_terminated_length": 1181.0, "completions/mean_length": 457.625, "completions/mean_terminated_length": 457.625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.4651409089565277, "epoch": 0.4105392156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.036212445471804626, "kl": 0.010748028755187988, "learning_rate": 9.96075187477595e-07, "loss": 0.0001, "num_tokens": 14484406.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6240545511245728, "sampling/importance_sampling_ratio/mean": 0.999816358089447, "sampling/importance_sampling_ratio/min": 0.5170950889587402, "sampling/sampling_logp_difference/max": 0.6595284938812256, "sampling/sampling_logp_difference/mean": 0.015890110284090042, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 288.796875, "completions/mean_terminated_length": 288.796875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.2877309024333954, "epoch": 0.4117647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.020308054587971153, "kl": 0.0075369905680418015, "learning_rate": 9.959856006042828e-07, "loss": 0.0001, "num_tokens": 14520889.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8134900331497192, "sampling/importance_sampling_ratio/mean": 1.0001498460769653, "sampling/importance_sampling_ratio/min": 0.6090335249900818, "sampling/sampling_logp_difference/max": 0.5952532291412354, "sampling/sampling_logp_difference/mean": 0.011952281929552555, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 306.828125, "completions/mean_terminated_length": 306.828125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.45471590757369995, "epoch": 0.41299019607843135, "frac_reward_zero_std": 1.0, "grad_norm": 0.03078667343142889, "kl": 0.015055126510560513, "learning_rate": 9.95895006911623e-07, "loss": 0.0001, "num_tokens": 14562110.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0007987022399902, "sampling/importance_sampling_ratio/min": 0.6451879143714905, "sampling/sampling_logp_difference/max": 0.8809695243835449, "sampling/sampling_logp_difference/mean": 0.01711321994662285, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1508.0, "completions/max_terminated_length": 1508.0, "completions/mean_length": 323.640625, "completions/mean_terminated_length": 323.640625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.35282206535339355, "epoch": 0.41421568627450983, "frac_reward_zero_std": 0.75, "grad_norm": 0.5762364184084314, "kl": 0.011881256476044655, "learning_rate": 9.95803406583515e-07, "loss": 0.0101, "num_tokens": 14596599.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.7521339654922485, "sampling/importance_sampling_ratio/mean": 0.9998881816864014, "sampling/importance_sampling_ratio/min": 0.6546947956085205, "sampling/sampling_logp_difference/max": 0.5608344078063965, "sampling/sampling_logp_difference/mean": 0.013441571965813637, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 251.65625, "completions/mean_terminated_length": 251.65625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.5205357074737549, "epoch": 0.41544117647058826, "frac_reward_zero_std": 0.5, "grad_norm": 1.2185116574844135, "kl": 0.020023316144943237, "learning_rate": 9.957107998059018e-07, "loss": -0.0274, "num_tokens": 14628337.0, "reward": 0.28125, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.6128053665161133, "sampling/importance_sampling_ratio/mean": 1.0000221729278564, "sampling/importance_sampling_ratio/min": 0.6136941909790039, "sampling/sampling_logp_difference/max": 0.48825860023498535, "sampling/sampling_logp_difference/mean": 0.018551619723439217, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 846.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 362.4375, "completions/mean_terminated_length": 362.4375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.5136936902999878, "epoch": 0.4166666666666667, "frac_reward_zero_std": 0.75, "grad_norm": 0.6071064908531735, "kl": 0.009995874017477036, "learning_rate": 9.956171867667693e-07, "loss": -0.0203, "num_tokens": 14674685.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.8533201217651367, "sampling/importance_sampling_ratio/mean": 0.9999036192893982, "sampling/importance_sampling_ratio/min": 0.4742354452610016, "sampling/sampling_logp_difference/max": 0.7460513114929199, "sampling/sampling_logp_difference/mean": 0.01657097041606903, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 188.375, "completions/mean_terminated_length": 188.375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.4074804186820984, "epoch": 0.4178921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.036476834547099116, "kl": 0.015126178972423077, "learning_rate": 9.955225676561459e-07, "loss": 0.0001, "num_tokens": 14699477.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5468698740005493, "sampling/importance_sampling_ratio/mean": 1.0003670454025269, "sampling/importance_sampling_ratio/min": 0.6701616644859314, "sampling/sampling_logp_difference/max": 0.4362335205078125, "sampling/sampling_logp_difference/mean": 0.018005145713686943, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 318.34375, "completions/mean_terminated_length": 318.34375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.34958961606025696, "epoch": 0.41911764705882354, "frac_reward_zero_std": 0.75, "grad_norm": 0.7609959621550058, "kl": 0.007451679557561874, "learning_rate": 9.954269426661022e-07, "loss": 0.0211, "num_tokens": 14744011.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.471272349357605, "sampling/importance_sampling_ratio/mean": 1.0003349781036377, "sampling/importance_sampling_ratio/min": 0.5679161548614502, "sampling/sampling_logp_difference/max": 0.5657814741134644, "sampling/sampling_logp_difference/mean": 0.012452412396669388, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 339.0625, "completions/mean_terminated_length": 339.0625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.4025729298591614, "epoch": 0.42034313725490197, "frac_reward_zero_std": 0.75, "grad_norm": 0.5961014331480262, "kl": 0.007322858087718487, "learning_rate": 9.953303119907513e-07, "loss": 0.0135, "num_tokens": 14783343.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.8775856494903564, "sampling/importance_sampling_ratio/mean": 1.0002448558807373, "sampling/importance_sampling_ratio/min": 0.624758243560791, "sampling/sampling_logp_difference/max": 0.6299867630004883, "sampling/sampling_logp_difference/mean": 0.013631120324134827, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 309.125, "completions/mean_terminated_length": 309.125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.4073280096054077, "epoch": 0.4215686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.5716701063657293, "kl": 0.009497517719864845, "learning_rate": 9.952326758262472e-07, "loss": 0.0106, "num_tokens": 14822599.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.8617557287216187, "sampling/importance_sampling_ratio/mean": 0.9995983839035034, "sampling/importance_sampling_ratio/min": 0.6254897117614746, "sampling/sampling_logp_difference/max": 0.6215200424194336, "sampling/sampling_logp_difference/mean": 0.013475436717271805, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 242.140625, "completions/mean_terminated_length": 242.140625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.32578033208847046, "epoch": 0.4227941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.012096581527440477, "kl": 0.006686481647193432, "learning_rate": 9.95134034370785e-07, "loss": 0.0001, "num_tokens": 14851760.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5989513397216797, "sampling/importance_sampling_ratio/mean": 1.0001709461212158, "sampling/importance_sampling_ratio/min": 0.677209734916687, "sampling/sampling_logp_difference/max": 0.4693479537963867, "sampling/sampling_logp_difference/mean": 0.013052243739366531, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 266.109375, "completions/mean_terminated_length": 266.109375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.4384629428386688, "epoch": 0.42401960784313725, "frac_reward_zero_std": 1.0, "grad_norm": 0.014592412757529321, "kl": 0.007628190331161022, "learning_rate": 9.950343878246009e-07, "loss": 0.0001, "num_tokens": 14895015.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.399330973625183, "sampling/importance_sampling_ratio/mean": 0.9996635913848877, "sampling/importance_sampling_ratio/min": 0.5859622359275818, "sampling/sampling_logp_difference/max": 0.5344998836517334, "sampling/sampling_logp_difference/mean": 0.016006257385015488, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1039.0, "completions/max_terminated_length": 1039.0, "completions/mean_length": 342.9375, "completions/mean_terminated_length": 342.9375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.5310128927230835, "epoch": 0.4252450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.8972818911453753, "kl": 0.010398059152066708, "learning_rate": 9.949337363899708e-07, "loss": 0.0297, "num_tokens": 14932835.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.7108179330825806, "sampling/importance_sampling_ratio/mean": 0.999699056148529, "sampling/importance_sampling_ratio/min": 0.6263183951377869, "sampling/sampling_logp_difference/max": 0.5369715690612793, "sampling/sampling_logp_difference/mean": 0.017189286649227142, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 266.828125, "completions/mean_terminated_length": 266.828125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.5145455002784729, "epoch": 0.4264705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.018251934509387377, "kl": 0.010695382952690125, "learning_rate": 9.948320802712107e-07, "loss": 0.0001, "num_tokens": 14965192.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4491758346557617, "sampling/importance_sampling_ratio/mean": 1.000340223312378, "sampling/importance_sampling_ratio/min": 0.6676943898200989, "sampling/sampling_logp_difference/max": 0.40392470359802246, "sampling/sampling_logp_difference/mean": 0.016726354137063026, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 326.921875, "completions/mean_terminated_length": 326.921875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.3721088767051697, "epoch": 0.42769607843137253, "frac_reward_zero_std": 1.0, "grad_norm": 0.013724449477437693, "kl": 0.007321214769035578, "learning_rate": 9.947294196746762e-07, "loss": 0.0001, "num_tokens": 15004915.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6521241664886475, "sampling/importance_sampling_ratio/mean": 0.9995253086090088, "sampling/importance_sampling_ratio/min": 0.676246166229248, "sampling/sampling_logp_difference/max": 0.5020618438720703, "sampling/sampling_logp_difference/mean": 0.014097096398472786, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 253.96875, "completions/mean_terminated_length": 253.96875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.43177446722984314, "epoch": 0.42892156862745096, "frac_reward_zero_std": 1.0, "grad_norm": 0.016837544649757424, "kl": 0.00731238117441535, "learning_rate": 9.946257548087619e-07, "loss": 0.0001, "num_tokens": 15037025.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.331712007522583, "sampling/importance_sampling_ratio/mean": 1.0004875659942627, "sampling/importance_sampling_ratio/min": 0.7195606827735901, "sampling/sampling_logp_difference/max": 0.3291144371032715, "sampling/sampling_logp_difference/mean": 0.013819659128785133, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 324.5, "completions/mean_terminated_length": 324.5, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.41477513313293457, "epoch": 0.43014705882352944, "frac_reward_zero_std": 1.0, "grad_norm": 0.013251461421814064, "kl": 0.006891029886901379, "learning_rate": 9.945210858839008e-07, "loss": 0.0001, "num_tokens": 15074609.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5788918733596802, "sampling/importance_sampling_ratio/mean": 0.9999963641166687, "sampling/importance_sampling_ratio/min": 0.6069620251655579, "sampling/sampling_logp_difference/max": 0.49928903579711914, "sampling/sampling_logp_difference/mean": 0.013883974403142929, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 483.546875, "completions/mean_terminated_length": 483.546875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.4055541455745697, "epoch": 0.43137254901960786, "frac_reward_zero_std": 0.75, "grad_norm": 0.49560110477831015, "kl": 0.005117390304803848, "learning_rate": 9.944154131125642e-07, "loss": 0.0221, "num_tokens": 15123412.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.581242561340332, "sampling/importance_sampling_ratio/mean": 1.0000226497650146, "sampling/importance_sampling_ratio/min": 0.68509441614151, "sampling/sampling_logp_difference/max": 0.45821094512939453, "sampling/sampling_logp_difference/mean": 0.01238926686346531, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 323.484375, "completions/mean_terminated_length": 323.484375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.5161611437797546, "epoch": 0.4325980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.015533436491323327, "kl": 0.011319116689264774, "learning_rate": 9.94308736709261e-07, "loss": 0.0001, "num_tokens": 15164659.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9239755868911743, "sampling/importance_sampling_ratio/mean": 1.0002652406692505, "sampling/importance_sampling_ratio/min": 0.6609427332878113, "sampling/sampling_logp_difference/max": 0.6543936729431152, "sampling/sampling_logp_difference/mean": 0.016223888844251633, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 333.0625, "completions/mean_terminated_length": 333.0625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.3820299208164215, "epoch": 0.4338235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.012385636513313954, "kl": 0.006774532608687878, "learning_rate": 9.94201056890538e-07, "loss": 0.0001, "num_tokens": 15203191.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5673291683197021, "sampling/importance_sampling_ratio/mean": 0.9998567700386047, "sampling/importance_sampling_ratio/min": 0.3985017240047455, "sampling/sampling_logp_difference/max": 0.9200434684753418, "sampling/sampling_logp_difference/mean": 0.013704527169466019, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 864.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 414.453125, "completions/mean_terminated_length": 414.453125, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "entropy": 0.5273058414459229, "epoch": 0.43504901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.5271282147057225, "kl": 0.006991592235863209, "learning_rate": 9.940923738749777e-07, "loss": 0.0039, "num_tokens": 15246964.0, "reward": -0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.7437046766281128, "sampling/importance_sampling_ratio/mean": 0.9997588992118835, "sampling/importance_sampling_ratio/min": 0.5510984659194946, "sampling/sampling_logp_difference/max": 0.5958417654037476, "sampling/sampling_logp_difference/mean": 0.015264918096363544, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 310.78125, "completions/mean_terminated_length": 310.78125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.5642998218536377, "epoch": 0.4362745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.016890349827799963, "kl": 0.008994456380605698, "learning_rate": 9.939826878832003e-07, "loss": 0.0001, "num_tokens": 15282326.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5511773824691772, "sampling/importance_sampling_ratio/mean": 0.9999070167541504, "sampling/importance_sampling_ratio/min": 0.6636795401573181, "sampling/sampling_logp_difference/max": 0.439014196395874, "sampling/sampling_logp_difference/mean": 0.01801394298672676, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 374.15625, "completions/mean_terminated_length": 374.15625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.48606714606285095, "epoch": 0.4375, "frac_reward_zero_std": 0.75, "grad_norm": 0.5695870657943967, "kl": 0.008363102562725544, "learning_rate": 9.938719991378613e-07, "loss": 0.0136, "num_tokens": 15328256.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.527762532234192, "sampling/importance_sampling_ratio/mean": 1.0002577304840088, "sampling/importance_sampling_ratio/min": 0.6166898012161255, "sampling/sampling_logp_difference/max": 0.48338913917541504, "sampling/sampling_logp_difference/mean": 0.014546483755111694, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1381.0, "completions/max_terminated_length": 1381.0, "completions/mean_length": 430.53125, "completions/mean_terminated_length": 430.53125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.46838104724884033, "epoch": 0.4387254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0109215015772703, "kl": 0.00633890088647604, "learning_rate": 9.937603078636518e-07, "loss": 0.0001, "num_tokens": 15381362.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.9891951084136963, "sampling/importance_sampling_ratio/mean": 1.0002669095993042, "sampling/importance_sampling_ratio/min": 0.458951860666275, "sampling/sampling_logp_difference/max": 0.7788100242614746, "sampling/sampling_logp_difference/mean": 0.014039875939488411, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 224.640625, "completions/mean_terminated_length": 224.640625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.4758760631084442, "epoch": 0.43995098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 0.7249059470333209, "kl": 0.012955006211996078, "learning_rate": 9.936476142872977e-07, "loss": 0.0213, "num_tokens": 15409291.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.3744429349899292, "sampling/importance_sampling_ratio/mean": 1.0005557537078857, "sampling/importance_sampling_ratio/min": 0.6296499967575073, "sampling/sampling_logp_difference/max": 0.46259117126464844, "sampling/sampling_logp_difference/mean": 0.016263581812381744, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 358.015625, "completions/mean_terminated_length": 358.015625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.4982360005378723, "epoch": 0.4411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.013318113556384587, "kl": 0.006157570984214544, "learning_rate": 9.935339186375603e-07, "loss": 0.0001, "num_tokens": 15453036.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6019818782806396, "sampling/importance_sampling_ratio/mean": 1.000604510307312, "sampling/importance_sampling_ratio/min": 0.6174748539924622, "sampling/sampling_logp_difference/max": 0.4821169376373291, "sampling/sampling_logp_difference/mean": 0.015117994509637356, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 398.828125, "completions/mean_terminated_length": 398.828125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.47226619720458984, "epoch": 0.4424019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.6240947529725706, "kl": 0.005854130256921053, "learning_rate": 9.934192211452344e-07, "loss": -0.076, "num_tokens": 15504929.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6057943105697632, "sampling/importance_sampling_ratio/mean": 0.9999524354934692, "sampling/importance_sampling_ratio/min": 0.40564554929733276, "sampling/sampling_logp_difference/max": 0.902275562286377, "sampling/sampling_logp_difference/mean": 0.014106315560638905, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 359.390625, "completions/mean_terminated_length": 359.390625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.3958386182785034, "epoch": 0.44362745098039214, "frac_reward_zero_std": 1.0, "grad_norm": 0.012293118289202275, "kl": 0.0055130962282419205, "learning_rate": 9.933035220431487e-07, "loss": 0.0001, "num_tokens": 15545962.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6292108297348022, "sampling/importance_sampling_ratio/mean": 1.0000524520874023, "sampling/importance_sampling_ratio/min": 0.5615935921669006, "sampling/sampling_logp_difference/max": 0.5769767761230469, "sampling/sampling_logp_difference/mean": 0.012994674034416676, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 295.0, "completions/mean_terminated_length": 295.0, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.5115153789520264, "epoch": 0.44485294117647056, "frac_reward_zero_std": 0.75, "grad_norm": 0.9589372846327973, "kl": 0.0111757917329669, "learning_rate": 9.931868215661647e-07, "loss": -0.0063, "num_tokens": 15581162.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.488693118095398, "sampling/importance_sampling_ratio/mean": 1.0000672340393066, "sampling/importance_sampling_ratio/min": 0.6547616720199585, "sampling/sampling_logp_difference/max": 0.4234839677810669, "sampling/sampling_logp_difference/mean": 0.016424331814050674, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 295.1875, "completions/mean_terminated_length": 295.1875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.3777884840965271, "epoch": 0.44607843137254904, "frac_reward_zero_std": 1.0, "grad_norm": 0.016199156640608586, "kl": 0.007039773277938366, "learning_rate": 9.930691199511773e-07, "loss": 0.0001, "num_tokens": 15613094.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.437111258506775, "sampling/importance_sampling_ratio/mean": 0.9998798966407776, "sampling/importance_sampling_ratio/min": 0.6118602156639099, "sampling/sampling_logp_difference/max": 0.49125146865844727, "sampling/sampling_logp_difference/mean": 0.014503110200166702, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 299.796875, "completions/mean_terminated_length": 299.796875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.4404449760913849, "epoch": 0.44730392156862747, "frac_reward_zero_std": 0.75, "grad_norm": 0.5865475382138243, "kl": 0.011987416073679924, "learning_rate": 9.929504174371136e-07, "loss": 0.023, "num_tokens": 15650169.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5755946636199951, "sampling/importance_sampling_ratio/mean": 1.0001384019851685, "sampling/importance_sampling_ratio/min": 0.6437512636184692, "sampling/sampling_logp_difference/max": 0.4546327590942383, "sampling/sampling_logp_difference/mean": 0.014040719717741013, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 262.390625, "completions/mean_terminated_length": 262.390625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.5279446840286255, "epoch": 0.4485294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.03418981656062144, "kl": 0.011099225841462612, "learning_rate": 9.928307142649314e-07, "loss": 0.0001, "num_tokens": 15681602.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5103704929351807, "sampling/importance_sampling_ratio/mean": 0.9995712041854858, "sampling/importance_sampling_ratio/min": 0.3746173679828644, "sampling/sampling_logp_difference/max": 0.9818501472473145, "sampling/sampling_logp_difference/mean": 0.017491858452558517, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 267.171875, "completions/mean_terminated_length": 267.171875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.45025694370269775, "epoch": 0.4497549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 0.9676334677639602, "kl": 0.008700523525476456, "learning_rate": 9.927100106776212e-07, "loss": 0.0275, "num_tokens": 15713501.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5770145654678345, "sampling/importance_sampling_ratio/mean": 0.9994328022003174, "sampling/importance_sampling_ratio/min": 0.7568055987358093, "sampling/sampling_logp_difference/max": 0.455533504486084, "sampling/sampling_logp_difference/mean": 0.015321536920964718, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1389.0, "completions/max_terminated_length": 1389.0, "completions/mean_length": 321.921875, "completions/mean_terminated_length": 321.921875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.5642329454421997, "epoch": 0.45098039215686275, "frac_reward_zero_std": 0.75, "grad_norm": 0.8753902062327625, "kl": 0.009802030399441719, "learning_rate": 9.925883069202034e-07, "loss": -0.1366, "num_tokens": 15752680.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994522929191589, "sampling/importance_sampling_ratio/min": 0.6958832144737244, "sampling/sampling_logp_difference/max": 0.7540812492370605, "sampling/sampling_logp_difference/mean": 0.01814807578921318, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1582.0, "completions/max_terminated_length": 1582.0, "completions/mean_length": 452.109375, "completions/mean_terminated_length": 452.109375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.521472692489624, "epoch": 0.4522058823529412, "frac_reward_zero_std": 0.75, "grad_norm": 0.4885690823055149, "kl": 0.007102770730853081, "learning_rate": 9.92465603239729e-07, "loss": -0.0113, "num_tokens": 15800207.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.3944483995437622, "sampling/importance_sampling_ratio/mean": 0.9999794363975525, "sampling/importance_sampling_ratio/min": 0.47963574528694153, "sampling/sampling_logp_difference/max": 0.7347283363342285, "sampling/sampling_logp_difference/mean": 0.015743356198072433, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 313.640625, "completions/mean_terminated_length": 313.640625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.5323586463928223, "epoch": 0.4534313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.7234237232549712, "kl": 0.009263429790735245, "learning_rate": 9.923418998852787e-07, "loss": 0.0017, "num_tokens": 15834520.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6267601251602173, "sampling/importance_sampling_ratio/mean": 1.0000381469726562, "sampling/importance_sampling_ratio/min": 0.2013111710548401, "sampling/sampling_logp_difference/max": 1.6029033660888672, "sampling/sampling_logp_difference/mean": 0.017345821484923363, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 320.890625, "completions/mean_terminated_length": 320.890625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.4449070990085602, "epoch": 0.45465686274509803, "frac_reward_zero_std": 1.0, "grad_norm": 0.014706916409245901, "kl": 0.0072118304669857025, "learning_rate": 9.922171971079622e-07, "loss": 0.0001, "num_tokens": 15871713.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4552481174468994, "sampling/importance_sampling_ratio/mean": 1.0001744031906128, "sampling/importance_sampling_ratio/min": 0.6263210773468018, "sampling/sampling_logp_difference/max": 0.4678921699523926, "sampling/sampling_logp_difference/mean": 0.014359300024807453, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1044.0, "completions/max_terminated_length": 1044.0, "completions/mean_length": 289.046875, "completions/mean_terminated_length": 289.046875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.46369802951812744, "epoch": 0.45588235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.01657346841974805, "kl": 0.008313331753015518, "learning_rate": 9.920914951609186e-07, "loss": 0.0001, "num_tokens": 15908324.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4346879720687866, "sampling/importance_sampling_ratio/mean": 1.000383973121643, "sampling/importance_sampling_ratio/min": 0.536011815071106, "sampling/sampling_logp_difference/max": 0.6235990524291992, "sampling/sampling_logp_difference/mean": 0.014268613420426846, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 273.75, "completions/mean_terminated_length": 273.75, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.4555667042732239, "epoch": 0.4571078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.6166466764290625, "kl": 0.01220642402768135, "learning_rate": 9.919647942993147e-07, "loss": 0.0168, "num_tokens": 15943908.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3784596920013428, "sampling/importance_sampling_ratio/mean": 0.9999669790267944, "sampling/importance_sampling_ratio/min": 0.5652586817741394, "sampling/sampling_logp_difference/max": 0.5704717636108398, "sampling/sampling_logp_difference/mean": 0.01564379781484604, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 930.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 367.546875, "completions/mean_terminated_length": 367.546875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.49472859501838684, "epoch": 0.4583333333333333, "frac_reward_zero_std": 0.75, "grad_norm": 0.6505424838052759, "kl": 0.006950271315872669, "learning_rate": 9.918370947803455e-07, "loss": -0.0043, "num_tokens": 15990951.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6197224855422974, "sampling/importance_sampling_ratio/mean": 0.9999094605445862, "sampling/importance_sampling_ratio/min": 0.6800689697265625, "sampling/sampling_logp_difference/max": 0.4822547435760498, "sampling/sampling_logp_difference/mean": 0.01419869251549244, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 329.5, "completions/mean_terminated_length": 329.5, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.46280694007873535, "epoch": 0.45955882352941174, "frac_reward_zero_std": 0.75, "grad_norm": 0.510247972353449, "kl": 0.013187779113650322, "learning_rate": 9.917083968632326e-07, "loss": -0.0128, "num_tokens": 16027463.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4676592350006104, "sampling/importance_sampling_ratio/mean": 0.9999902248382568, "sampling/importance_sampling_ratio/min": 0.5435534715652466, "sampling/sampling_logp_difference/max": 0.6096272468566895, "sampling/sampling_logp_difference/mean": 0.014881093986332417, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 320.421875, "completions/mean_terminated_length": 320.421875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.6291946768760681, "epoch": 0.46078431372549017, "frac_reward_zero_std": 0.5, "grad_norm": 0.8568171825023735, "kl": 0.01314575970172882, "learning_rate": 9.915787008092246e-07, "loss": -0.0038, "num_tokens": 16070386.0, "reward": 0.40625, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4345569610595703, "sampling/importance_sampling_ratio/mean": 1.0000001192092896, "sampling/importance_sampling_ratio/min": 0.45025113224983215, "sampling/sampling_logp_difference/max": 0.7979497909545898, "sampling/sampling_logp_difference/mean": 0.016919022426009178, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 245.5, "completions/mean_terminated_length": 245.5, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.48392730951309204, "epoch": 0.46200980392156865, "frac_reward_zero_std": 0.75, "grad_norm": 0.8774399064329496, "kl": 0.011201918125152588, "learning_rate": 9.914480068815961e-07, "loss": -0.0211, "num_tokens": 16106002.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4221773147583008, "sampling/importance_sampling_ratio/mean": 0.9999245405197144, "sampling/importance_sampling_ratio/min": 0.6408695578575134, "sampling/sampling_logp_difference/max": 0.4449293613433838, "sampling/sampling_logp_difference/mean": 0.015163412317633629, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 273.03125, "completions/mean_terminated_length": 273.03125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.4214988946914673, "epoch": 0.4632352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.015767187515838013, "kl": 0.009089452214539051, "learning_rate": 9.913163153456482e-07, "loss": 0.0001, "num_tokens": 16138244.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001201629638672, "sampling/importance_sampling_ratio/min": 0.5881833434104919, "sampling/sampling_logp_difference/max": 0.8351068496704102, "sampling/sampling_logp_difference/mean": 0.013711215928196907, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 370.8125, "completions/mean_terminated_length": 370.8125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.4226321280002594, "epoch": 0.4644607843137255, "frac_reward_zero_std": 1.0, "grad_norm": 0.013942840227994123, "kl": 0.00788489356637001, "learning_rate": 9.91183626468706e-07, "loss": 0.0001, "num_tokens": 16180936.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.627266764640808, "sampling/importance_sampling_ratio/mean": 1.0001024007797241, "sampling/importance_sampling_ratio/min": 0.5282998085021973, "sampling/sampling_logp_difference/max": 0.6380913257598877, "sampling/sampling_logp_difference/mean": 0.012683962471783161, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1112.0, "completions/max_terminated_length": 1112.0, "completions/mean_length": 431.09375, "completions/mean_terminated_length": 431.09375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.5355674028396606, "epoch": 0.46568627450980393, "frac_reward_zero_std": 1.0, "grad_norm": 0.013894057963017632, "kl": 0.009171638637781143, "learning_rate": 9.910499405201193e-07, "loss": 0.0001, "num_tokens": 16227390.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2929408550262451, "sampling/importance_sampling_ratio/mean": 0.9996322393417358, "sampling/importance_sampling_ratio/min": 0.6960487365722656, "sampling/sampling_logp_difference/max": 0.36233556270599365, "sampling/sampling_logp_difference/mean": 0.0148024782538414, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 320.703125, "completions/mean_terminated_length": 320.703125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.6161620020866394, "epoch": 0.46691176470588236, "frac_reward_zero_std": 0.75, "grad_norm": 0.5585524525397322, "kl": 0.014791369438171387, "learning_rate": 9.909152577712625e-07, "loss": 0.0262, "num_tokens": 16262603.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4130288362503052, "sampling/importance_sampling_ratio/mean": 1.000399112701416, "sampling/importance_sampling_ratio/min": 0.6892130970954895, "sampling/sampling_logp_difference/max": 0.3722047805786133, "sampling/sampling_logp_difference/mean": 0.017356421798467636, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 372.703125, "completions/mean_terminated_length": 372.703125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.2847789525985718, "epoch": 0.4681372549019608, "frac_reward_zero_std": 0.75, "grad_norm": 0.7093035719028128, "kl": 0.006375766824930906, "learning_rate": 9.907795784955326e-07, "loss": 0.0021, "num_tokens": 16303096.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4866317510604858, "sampling/importance_sampling_ratio/mean": 1.0000295639038086, "sampling/importance_sampling_ratio/min": 0.615215539932251, "sampling/sampling_logp_difference/max": 0.4857826232910156, "sampling/sampling_logp_difference/mean": 0.00981362909078598, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 378.3125, "completions/mean_terminated_length": 378.3125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.5317325592041016, "epoch": 0.4693627450980392, "frac_reward_zero_std": 0.75, "grad_norm": 0.5464422251747819, "kl": 0.007729554083198309, "learning_rate": 9.906429029683504e-07, "loss": -0.0035, "num_tokens": 16344460.0, "reward": -0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.5744106769561768, "sampling/importance_sampling_ratio/mean": 1.0002042055130005, "sampling/importance_sampling_ratio/min": 0.6521392464637756, "sampling/sampling_logp_difference/max": 0.45388102531433105, "sampling/sampling_logp_difference/mean": 0.014293940737843513, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 456.71875, "completions/mean_terminated_length": 456.71875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.578429102897644, "epoch": 0.47058823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.6705251090099297, "kl": 0.008822070434689522, "learning_rate": 9.90505231467158e-07, "loss": -0.0249, "num_tokens": 16396122.0, "reward": 0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4041556119918823, "sampling/importance_sampling_ratio/mean": 1.0000476837158203, "sampling/importance_sampling_ratio/min": 0.6883706450462341, "sampling/sampling_logp_difference/max": 0.3734278678894043, "sampling/sampling_logp_difference/mean": 0.015498055145144463, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 361.203125, "completions/mean_terminated_length": 361.203125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.575154185295105, "epoch": 0.47181372549019607, "frac_reward_zero_std": 0.5, "grad_norm": 0.8226290032961374, "kl": 0.010125217959284782, "learning_rate": 9.903665642714204e-07, "loss": 0.0238, "num_tokens": 16436871.0, "reward": 0.71875, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.6273009777069092, "sampling/importance_sampling_ratio/mean": 1.0002424716949463, "sampling/importance_sampling_ratio/min": 0.5634684562683105, "sampling/sampling_logp_difference/max": 0.5736439228057861, "sampling/sampling_logp_difference/mean": 0.015197636559605598, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 456.890625, "completions/mean_terminated_length": 456.890625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.5371774435043335, "epoch": 0.4730392156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.014711009801852666, "kl": 0.010284563526511192, "learning_rate": 9.90226901662623e-07, "loss": 0.0001, "num_tokens": 16481744.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999085068702698, "sampling/importance_sampling_ratio/min": 0.6891399621963501, "sampling/sampling_logp_difference/max": 0.7020729780197144, "sampling/sampling_logp_difference/mean": 0.015175249427556992, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 407.59375, "completions/mean_terminated_length": 407.59375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.5156686305999756, "epoch": 0.4742647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.01624091301804241, "kl": 0.010845974087715149, "learning_rate": 9.900862439242718e-07, "loss": 0.0001, "num_tokens": 16525526.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.30564546585083, "sampling/importance_sampling_ratio/mean": 0.9999505877494812, "sampling/importance_sampling_ratio/min": 0.6970733404159546, "sampling/sampling_logp_difference/max": 0.36086463928222656, "sampling/sampling_logp_difference/mean": 0.014238346368074417, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 374.796875, "completions/mean_terminated_length": 374.796875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.6364955306053162, "epoch": 0.47549019607843135, "frac_reward_zero_std": 0.75, "grad_norm": 0.6914250548406201, "kl": 0.011504191905260086, "learning_rate": 9.899445913418935e-07, "loss": -0.0468, "num_tokens": 16571049.0, "reward": -0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.759347677230835, "sampling/importance_sampling_ratio/mean": 0.9999186992645264, "sampling/importance_sampling_ratio/min": 0.6382997035980225, "sampling/sampling_logp_difference/max": 0.5649430751800537, "sampling/sampling_logp_difference/mean": 0.016772475093603134, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 352.21875, "completions/mean_terminated_length": 352.21875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.531876802444458, "epoch": 0.47671568627450983, "frac_reward_zero_std": 0.75, "grad_norm": 0.6257454529250119, "kl": 0.01216909196227789, "learning_rate": 9.898019442030337e-07, "loss": -0.0383, "num_tokens": 16608231.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4391876459121704, "sampling/importance_sampling_ratio/mean": 0.9996486902236938, "sampling/importance_sampling_ratio/min": 0.6644465327262878, "sampling/sampling_logp_difference/max": 0.4088008403778076, "sampling/sampling_logp_difference/mean": 0.014866403304040432, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 286.484375, "completions/mean_terminated_length": 286.484375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.6964697241783142, "epoch": 0.47794117647058826, "frac_reward_zero_std": 0.75, "grad_norm": 0.5913164315156034, "kl": 0.021920951083302498, "learning_rate": 9.89658302797257e-07, "loss": 0.0133, "num_tokens": 16640678.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.4976394176483154, "sampling/importance_sampling_ratio/mean": 1.0000478029251099, "sampling/importance_sampling_ratio/min": 0.6214598417282104, "sampling/sampling_logp_difference/max": 0.47568392753601074, "sampling/sampling_logp_difference/mean": 0.019625462591648102, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 345.46875, "completions/mean_terminated_length": 345.46875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.5459054708480835, "epoch": 0.4791666666666667, "frac_reward_zero_std": 0.5, "grad_norm": 0.8931014305990419, "kl": 0.013694975525140762, "learning_rate": 9.895136674161464e-07, "loss": 0.1015, "num_tokens": 16680084.0, "reward": 0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.349234938621521, "sampling/importance_sampling_ratio/mean": 0.9996093511581421, "sampling/importance_sampling_ratio/min": 0.6643379926681519, "sampling/sampling_logp_difference/max": 0.4089641571044922, "sampling/sampling_logp_difference/mean": 0.015522184781730175, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1906.0, "completions/max_terminated_length": 1906.0, "completions/mean_length": 493.640625, "completions/mean_terminated_length": 493.640625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.6637204885482788, "epoch": 0.4803921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.4123863184327225, "kl": 0.013020217418670654, "learning_rate": 9.893680383533024e-07, "loss": 0.0171, "num_tokens": 16731357.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.4193106889724731, "sampling/importance_sampling_ratio/mean": 1.0000452995300293, "sampling/importance_sampling_ratio/min": 0.5995286107063293, "sampling/sampling_logp_difference/max": 0.5116115808486938, "sampling/sampling_logp_difference/mean": 0.017154892906546593, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1118.0, "completions/max_terminated_length": 1118.0, "completions/mean_length": 475.21875, "completions/mean_terminated_length": 475.21875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.5360798835754395, "epoch": 0.48161764705882354, "frac_reward_zero_std": 0.75, "grad_norm": 0.39576230302925547, "kl": 0.013842311687767506, "learning_rate": 9.892214159043433e-07, "loss": -0.0002, "num_tokens": 16781723.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.7142975330352783, "sampling/importance_sampling_ratio/mean": 1.0002368688583374, "sampling/importance_sampling_ratio/min": 0.5636467337608337, "sampling/sampling_logp_difference/max": 0.5733275413513184, "sampling/sampling_logp_difference/mean": 0.014451893977820873, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 343.84375, "completions/mean_terminated_length": 343.84375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.477566659450531, "epoch": 0.48284313725490197, "frac_reward_zero_std": 0.75, "grad_norm": 0.5475373018031643, "kl": 0.014531772583723068, "learning_rate": 9.890738003669027e-07, "loss": -0.0186, "num_tokens": 16819313.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3974531888961792, "sampling/importance_sampling_ratio/mean": 0.9999526739120483, "sampling/importance_sampling_ratio/min": 0.723064124584198, "sampling/sampling_logp_difference/max": 0.33465147018432617, "sampling/sampling_logp_difference/mean": 0.01394276786595583, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1467.0, "completions/max_terminated_length": 1467.0, "completions/mean_length": 472.40625, "completions/mean_terminated_length": 472.40625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.6431002020835876, "epoch": 0.4840686274509804, "frac_reward_zero_std": 0.5, "grad_norm": 0.7618640375380666, "kl": 0.014173690229654312, "learning_rate": 9.889251920406312e-07, "loss": -0.044, "num_tokens": 16864459.0, "reward": 0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.366355061531067, "sampling/importance_sampling_ratio/mean": 0.9999476671218872, "sampling/importance_sampling_ratio/min": 0.6993353962898254, "sampling/sampling_logp_difference/max": 0.35762476921081543, "sampling/sampling_logp_difference/mean": 0.0159437358379364, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1056.0, "completions/max_terminated_length": 1056.0, "completions/mean_length": 498.578125, "completions/mean_terminated_length": 498.578125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.34457606077194214, "epoch": 0.4852941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.00999804825394461, "kl": 0.0057158335112035275, "learning_rate": 9.887755912271942e-07, "loss": 0.0001, "num_tokens": 16913360.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3962302207946777, "sampling/importance_sampling_ratio/mean": 0.9996973276138306, "sampling/importance_sampling_ratio/min": 0.37642964720726013, "sampling/sampling_logp_difference/max": 0.9770241975784302, "sampling/sampling_logp_difference/mean": 0.010589707642793655, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 358.609375, "completions/mean_terminated_length": 358.609375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.5422418117523193, "epoch": 0.48651960784313725, "frac_reward_zero_std": 1.0, "grad_norm": 0.01720658858719129, "kl": 0.012366555631160736, "learning_rate": 9.886249982302718e-07, "loss": 0.0001, "num_tokens": 16954359.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4096999168395996, "sampling/importance_sampling_ratio/mean": 1.0001957416534424, "sampling/importance_sampling_ratio/min": 0.6254607439041138, "sampling/sampling_logp_difference/max": 0.4692666530609131, "sampling/sampling_logp_difference/mean": 0.01464086677879095, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2130.0, "completions/max_terminated_length": 2130.0, "completions/mean_length": 532.890625, "completions/mean_terminated_length": 532.890625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.5304117202758789, "epoch": 0.4877450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.34742046469266935, "kl": 0.014952598139643669, "learning_rate": 9.884734133555585e-07, "loss": 0.0089, "num_tokens": 17005472.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.5073752403259277, "sampling/importance_sampling_ratio/mean": 1.0000643730163574, "sampling/importance_sampling_ratio/min": 0.6476128697395325, "sampling/sampling_logp_difference/max": 0.43446218967437744, "sampling/sampling_logp_difference/mean": 0.014020275324583054, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 355.46875, "completions/mean_terminated_length": 355.46875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.49435314536094666, "epoch": 0.4889705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.022951951949991462, "kl": 0.016476020216941833, "learning_rate": 9.883208369107617e-07, "loss": 0.0002, "num_tokens": 17044110.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5089552402496338, "sampling/importance_sampling_ratio/mean": 0.9997061491012573, "sampling/importance_sampling_ratio/min": 0.7013763785362244, "sampling/sampling_logp_difference/max": 0.41141748428344727, "sampling/sampling_logp_difference/mean": 0.014618675224483013, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 310.75, "completions/mean_terminated_length": 310.75, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.4882148206233978, "epoch": 0.49019607843137253, "frac_reward_zero_std": 1.0, "grad_norm": 0.022565975419333813, "kl": 0.018462996929883957, "learning_rate": 9.88167269205602e-07, "loss": 0.0002, "num_tokens": 17077182.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3635746240615845, "sampling/importance_sampling_ratio/mean": 0.9999900460243225, "sampling/importance_sampling_ratio/min": 0.7561637163162231, "sampling/sampling_logp_difference/max": 0.31010961532592773, "sampling/sampling_logp_difference/mean": 0.013947639614343643, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 890.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 417.671875, "completions/mean_terminated_length": 417.671875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.4593542218208313, "epoch": 0.49142156862745096, "frac_reward_zero_std": 1.0, "grad_norm": 0.016609115035280588, "kl": 0.01179174892604351, "learning_rate": 9.880127105518122e-07, "loss": 0.0001, "num_tokens": 17121625.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5982648134231567, "sampling/importance_sampling_ratio/mean": 1.0001604557037354, "sampling/importance_sampling_ratio/min": 0.6899809241294861, "sampling/sampling_logp_difference/max": 0.4689185619354248, "sampling/sampling_logp_difference/mean": 0.013563069514930248, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1656.0, "completions/max_terminated_length": 1656.0, "completions/mean_length": 386.265625, "completions/mean_terminated_length": 386.265625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.6214208006858826, "epoch": 0.49264705882352944, "frac_reward_zero_std": 0.5, "grad_norm": 0.8384325921841563, "kl": 0.0199696384370327, "learning_rate": 9.878571612631363e-07, "loss": -0.0902, "num_tokens": 17161610.0, "reward": 0.09375, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.5209263563156128, "sampling/importance_sampling_ratio/mean": 1.0003373622894287, "sampling/importance_sampling_ratio/min": 0.5757555365562439, "sampling/sampling_logp_difference/max": 0.5520720481872559, "sampling/sampling_logp_difference/mean": 0.017345454543828964, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 366.09375, "completions/mean_terminated_length": 366.09375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.5026034116744995, "epoch": 0.49387254901960786, "frac_reward_zero_std": 1.0, "grad_norm": 0.01895487749320785, "kl": 0.014405404217541218, "learning_rate": 9.8770062165533e-07, "loss": 0.0001, "num_tokens": 17207424.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4571722745895386, "sampling/importance_sampling_ratio/mean": 0.9999120235443115, "sampling/importance_sampling_ratio/min": 0.6585052013397217, "sampling/sampling_logp_difference/max": 0.41778290271759033, "sampling/sampling_logp_difference/mean": 0.014270633459091187, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 279.203125, "completions/mean_terminated_length": 279.203125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.529657244682312, "epoch": 0.4950980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.02881161459281529, "kl": 0.026784248650074005, "learning_rate": 9.875430920461583e-07, "loss": 0.0002, "num_tokens": 17242589.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5313911437988281, "sampling/importance_sampling_ratio/mean": 1.0000829696655273, "sampling/importance_sampling_ratio/min": 0.6775246858596802, "sampling/sampling_logp_difference/max": 0.4261765480041504, "sampling/sampling_logp_difference/mean": 0.016510039567947388, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1432.0, "completions/max_terminated_length": 1432.0, "completions/mean_length": 450.96875, "completions/mean_terminated_length": 450.96875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.4007662534713745, "epoch": 0.4963235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.6625691866417871, "kl": 0.00870803464204073, "learning_rate": 9.873845727553965e-07, "loss": 0.0956, "num_tokens": 17293771.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.7259007692337036, "sampling/importance_sampling_ratio/mean": 0.9999662637710571, "sampling/importance_sampling_ratio/min": 0.6534259915351868, "sampling/sampling_logp_difference/max": 0.5457490682601929, "sampling/sampling_logp_difference/mean": 0.0112020093947649, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 781.0, "completions/max_terminated_length": 781.0, "completions/mean_length": 416.109375, "completions/mean_terminated_length": 416.109375, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.5932797789573669, "epoch": 0.49754901960784315, "frac_reward_zero_std": 1.0, "grad_norm": 0.02260320205505566, "kl": 0.017917141318321228, "learning_rate": 9.87225064104829e-07, "loss": 0.0002, "num_tokens": 17335330.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6954232454299927, "sampling/importance_sampling_ratio/mean": 0.9999051094055176, "sampling/importance_sampling_ratio/min": 0.5895090699195862, "sampling/sampling_logp_difference/max": 0.5284651517868042, "sampling/sampling_logp_difference/mean": 0.01656300202012062, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 317.140625, "completions/mean_terminated_length": 317.140625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.4130098521709442, "epoch": 0.4987745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.019549894057575068, "kl": 0.013659714721143246, "learning_rate": 9.870645664182476e-07, "loss": 0.0001, "num_tokens": 17372027.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2927042245864868, "sampling/importance_sampling_ratio/mean": 1.0000004768371582, "sampling/importance_sampling_ratio/min": 0.6424325704574585, "sampling/sampling_logp_difference/max": 0.4424934387207031, "sampling/sampling_logp_difference/mean": 0.012468937784433365, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 828.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 313.0625, "completions/mean_terminated_length": 313.0625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.43560439348220825, "epoch": 0.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.020242542216486102, "kl": 0.013805761933326721, "learning_rate": 9.86903080021453e-07, "loss": 0.0001, "num_tokens": 17410447.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.668404459953308, "sampling/importance_sampling_ratio/mean": 1.00040864944458, "sampling/importance_sampling_ratio/min": 0.6312558054924011, "sampling/sampling_logp_difference/max": 0.5118677616119385, "sampling/sampling_logp_difference/mean": 0.013869582675397396, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 902.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 387.921875, "completions/mean_terminated_length": 387.921875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.5084930658340454, "epoch": 0.5012254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 0.7706370768487162, "kl": 0.011641206219792366, "learning_rate": 9.867406052422523e-07, "loss": 0.0129, "num_tokens": 17457434.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4210155010223389, "sampling/importance_sampling_ratio/mean": 1.0001786947250366, "sampling/importance_sampling_ratio/min": 0.5099390745162964, "sampling/sampling_logp_difference/max": 0.6734640598297119, "sampling/sampling_logp_difference/mean": 0.015187649056315422, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 355.78125, "completions/mean_terminated_length": 355.78125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.5451535582542419, "epoch": 0.5024509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 0.5877595068436049, "kl": 0.011499369516968727, "learning_rate": 9.865771424104587e-07, "loss": -0.0013, "num_tokens": 17497292.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.556929588317871, "sampling/importance_sampling_ratio/mean": 1.0000817775726318, "sampling/importance_sampling_ratio/min": 0.7067108750343323, "sampling/sampling_logp_difference/max": 0.4427156448364258, "sampling/sampling_logp_difference/mean": 0.014264975674450397, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1110.0, "completions/max_terminated_length": 1110.0, "completions/mean_length": 467.1875, "completions/mean_terminated_length": 467.1875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.5624011754989624, "epoch": 0.5036764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.7225931314485432, "kl": 0.010640924796462059, "learning_rate": 9.864126918578919e-07, "loss": -0.0556, "num_tokens": 17545272.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6156482696533203, "sampling/importance_sampling_ratio/mean": 1.0000306367874146, "sampling/importance_sampling_ratio/min": 0.40122684836387634, "sampling/sampling_logp_difference/max": 0.9132283329963684, "sampling/sampling_logp_difference/mean": 0.014856145717203617, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1873.0, "completions/max_terminated_length": 1873.0, "completions/mean_length": 379.03125, "completions/mean_terminated_length": 379.03125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.5414950251579285, "epoch": 0.5049019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.4765193162576344, "kl": 0.01409471407532692, "learning_rate": 9.862472539183755e-07, "loss": 0.0303, "num_tokens": 17585018.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.444906234741211, "sampling/importance_sampling_ratio/mean": 0.9997316598892212, "sampling/importance_sampling_ratio/min": 0.689365565776825, "sampling/sampling_logp_difference/max": 0.37198352813720703, "sampling/sampling_logp_difference/mean": 0.015769779682159424, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 302.34375, "completions/mean_terminated_length": 302.34375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.48937544226646423, "epoch": 0.5061274509803921, "frac_reward_zero_std": 1.0, "grad_norm": 0.02248400848042363, "kl": 0.015292640775442123, "learning_rate": 9.860808289277385e-07, "loss": 0.0001, "num_tokens": 17621344.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4632993936538696, "sampling/importance_sampling_ratio/mean": 1.0001461505889893, "sampling/importance_sampling_ratio/min": 0.26966604590415955, "sampling/sampling_logp_difference/max": 1.3105709552764893, "sampling/sampling_logp_difference/mean": 0.014968957751989365, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 308.96875, "completions/mean_terminated_length": 308.96875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.3994520306587219, "epoch": 0.5073529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.6779978430743736, "kl": 0.010735863819718361, "learning_rate": 9.859134172238128e-07, "loss": 0.0237, "num_tokens": 17655758.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5137768983840942, "sampling/importance_sampling_ratio/mean": 1.0001378059387207, "sampling/importance_sampling_ratio/min": 0.712831974029541, "sampling/sampling_logp_difference/max": 0.4146077632904053, "sampling/sampling_logp_difference/mean": 0.012119371443986893, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 313.796875, "completions/mean_terminated_length": 313.796875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.4439709782600403, "epoch": 0.508578431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.9179577968537204, "kl": 0.014490782283246517, "learning_rate": 9.857450191464337e-07, "loss": 0.0788, "num_tokens": 17691793.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5226970911026, "sampling/importance_sampling_ratio/mean": 0.9997588396072388, "sampling/importance_sampling_ratio/min": 0.45969530940055847, "sampling/sampling_logp_difference/max": 0.7771914005279541, "sampling/sampling_logp_difference/mean": 0.014258937910199165, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1082.0, "completions/max_terminated_length": 1082.0, "completions/mean_length": 445.09375, "completions/mean_terminated_length": 445.09375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "entropy": 0.5256571769714355, "epoch": 0.5098039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.012993733043711665, "kl": 0.010102033615112305, "learning_rate": 9.855756350374386e-07, "loss": 0.0001, "num_tokens": 17748903.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4279029369354248, "sampling/importance_sampling_ratio/mean": 0.9999684691429138, "sampling/importance_sampling_ratio/min": 0.6408191323280334, "sampling/sampling_logp_difference/max": 0.4450080394744873, "sampling/sampling_logp_difference/mean": 0.015364844352006912, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1171.0, "completions/max_terminated_length": 1171.0, "completions/mean_length": 442.671875, "completions/mean_terminated_length": 442.671875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "entropy": 0.4873322546482086, "epoch": 0.5110294117647058, "frac_reward_zero_std": 0.75, "grad_norm": 0.5429835802887981, "kl": 0.009810942225158215, "learning_rate": 9.854052652406665e-07, "loss": 0.0055, "num_tokens": 17798210.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6446665525436401, "sampling/importance_sampling_ratio/mean": 0.9997557997703552, "sampling/importance_sampling_ratio/min": 0.5059256553649902, "sampling/sampling_logp_difference/max": 0.6813654899597168, "sampling/sampling_logp_difference/mean": 0.01432083174586296, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1385.0, "completions/max_terminated_length": 1385.0, "completions/mean_length": 466.5, "completions/mean_terminated_length": 466.5, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.49410387873649597, "epoch": 0.5122549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.010538711262332249, "kl": 0.011043190024793148, "learning_rate": 9.852339101019572e-07, "loss": 0.0001, "num_tokens": 17845234.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000226497650146, "sampling/importance_sampling_ratio/min": 0.5049200654029846, "sampling/sampling_logp_difference/max": 0.7731021642684937, "sampling/sampling_logp_difference/mean": 0.014621015638113022, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 340.296875, "completions/mean_terminated_length": 340.296875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.5282124876976013, "epoch": 0.5134803921568627, "frac_reward_zero_std": 1.0, "grad_norm": 0.017160593644474843, "kl": 0.012833751738071442, "learning_rate": 9.85061569969151e-07, "loss": 0.0001, "num_tokens": 17888261.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6409162282943726, "sampling/importance_sampling_ratio/mean": 1.00032639503479, "sampling/importance_sampling_ratio/min": 0.3901737630367279, "sampling/sampling_logp_difference/max": 0.9411630630493164, "sampling/sampling_logp_difference/mean": 0.014918262138962746, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1210.0, "completions/max_terminated_length": 1210.0, "completions/mean_length": 407.9375, "completions/mean_terminated_length": 407.9375, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.4433687925338745, "epoch": 0.5147058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.008658709771006424, "kl": 0.005597110837697983, "learning_rate": 9.848882451920875e-07, "loss": 0.0001, "num_tokens": 17932737.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4387460947036743, "sampling/importance_sampling_ratio/mean": 0.9999854564666748, "sampling/importance_sampling_ratio/min": 0.6221131682395935, "sampling/sampling_logp_difference/max": 0.47463321685791016, "sampling/sampling_logp_difference/mean": 0.012759259901940823, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 326.1875, "completions/mean_terminated_length": 326.1875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.42033079266548157, "epoch": 0.5159313725490197, "frac_reward_zero_std": 1.0, "grad_norm": 0.014837746183438543, "kl": 0.010691646486520767, "learning_rate": 9.847139361226046e-07, "loss": 0.0001, "num_tokens": 17968333.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9906790256500244, "sampling/importance_sampling_ratio/mean": 0.9999564290046692, "sampling/importance_sampling_ratio/min": 0.6311607956886292, "sampling/sampling_logp_difference/max": 0.6884758472442627, "sampling/sampling_logp_difference/mean": 0.013186516240239143, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 296.15625, "completions/mean_terminated_length": 296.15625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.6118624806404114, "epoch": 0.5171568627450981, "frac_reward_zero_std": 1.0, "grad_norm": 0.021326189407088577, "kl": 0.016667809337377548, "learning_rate": 9.84538643114539e-07, "loss": 0.0002, "num_tokens": 17999767.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.474505066871643, "sampling/importance_sampling_ratio/mean": 1.0001413822174072, "sampling/importance_sampling_ratio/min": 0.690427839756012, "sampling/sampling_logp_difference/max": 0.3883223533630371, "sampling/sampling_logp_difference/mean": 0.017754338681697845, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1060.0, "completions/max_terminated_length": 1060.0, "completions/mean_length": 323.890625, "completions/mean_terminated_length": 323.890625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.4755157232284546, "epoch": 0.5183823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.01616481531782975, "kl": 0.01253415085375309, "learning_rate": 9.843623665237242e-07, "loss": 0.0001, "num_tokens": 18039824.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5040522813796997, "sampling/importance_sampling_ratio/mean": 1.0001437664031982, "sampling/importance_sampling_ratio/min": 0.5364373922348022, "sampling/sampling_logp_difference/max": 0.6228054165840149, "sampling/sampling_logp_difference/mean": 0.01538270153105259, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1160.0, "completions/max_terminated_length": 1160.0, "completions/mean_length": 417.78125, "completions/mean_terminated_length": 417.78125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.4744122624397278, "epoch": 0.5196078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.0096098774232414, "kl": 0.007309806998819113, "learning_rate": 9.841851067079908e-07, "loss": 0.0001, "num_tokens": 18083058.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4378365278244019, "sampling/importance_sampling_ratio/mean": 0.9999535083770752, "sampling/importance_sampling_ratio/min": 0.662236213684082, "sampling/sampling_logp_difference/max": 0.41213297843933105, "sampling/sampling_logp_difference/mean": 0.013659223914146423, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 909.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 361.375, "completions/mean_terminated_length": 361.375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.4486447870731354, "epoch": 0.5208333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.009699274891573786, "kl": 0.007319447584450245, "learning_rate": 9.840068640271647e-07, "loss": 0.0001, "num_tokens": 18121146.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.609108567237854, "sampling/importance_sampling_ratio/mean": 0.9998490810394287, "sampling/importance_sampling_ratio/min": 0.6916384696960449, "sampling/sampling_logp_difference/max": 0.4756803512573242, "sampling/sampling_logp_difference/mean": 0.013014143332839012, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 269.609375, "completions/mean_terminated_length": 269.609375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.370256632566452, "epoch": 0.5220588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.01292605657042224, "kl": 0.008275941014289856, "learning_rate": 9.838276388430675e-07, "loss": 0.0001, "num_tokens": 18153185.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.365869164466858, "sampling/importance_sampling_ratio/mean": 0.9999139308929443, "sampling/importance_sampling_ratio/min": 0.48008766770362854, "sampling/sampling_logp_difference/max": 0.7337865829467773, "sampling/sampling_logp_difference/mean": 0.012244380079209805, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 872.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 371.09375, "completions/mean_terminated_length": 371.09375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.5753024816513062, "epoch": 0.5232843137254902, "frac_reward_zero_std": 0.5, "grad_norm": 0.8286198138258921, "kl": 0.01294616423547268, "learning_rate": 9.836474315195147e-07, "loss": 0.033, "num_tokens": 18199271.0, "reward": 0.21875, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.5700253248214722, "sampling/importance_sampling_ratio/mean": 0.9999474287033081, "sampling/importance_sampling_ratio/min": 0.7082139849662781, "sampling/sampling_logp_difference/max": 0.4510917663574219, "sampling/sampling_logp_difference/mean": 0.01625710539519787, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1131.0, "completions/max_terminated_length": 1131.0, "completions/mean_length": 348.40625, "completions/mean_terminated_length": 348.40625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.44378286600112915, "epoch": 0.5245098039215687, "frac_reward_zero_std": 0.75, "grad_norm": 0.6446379273416699, "kl": 0.009088552556931973, "learning_rate": 9.83466242422316e-07, "loss": 0.0098, "num_tokens": 18236273.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.3645559549331665, "sampling/importance_sampling_ratio/mean": 0.9998284578323364, "sampling/importance_sampling_ratio/min": 0.7430564761161804, "sampling/sampling_logp_difference/max": 0.3108290433883667, "sampling/sampling_logp_difference/mean": 0.01364219095557928, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 416.078125, "completions/mean_terminated_length": 416.078125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.35826367139816284, "epoch": 0.5257352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.00848062506099253, "kl": 0.0060065677389502525, "learning_rate": 9.832840719192735e-07, "loss": 0.0001, "num_tokens": 18281510.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4691907167434692, "sampling/importance_sampling_ratio/mean": 1.0000654458999634, "sampling/importance_sampling_ratio/min": 0.6423425078392029, "sampling/sampling_logp_difference/max": 0.44263362884521484, "sampling/sampling_logp_difference/mean": 0.010820208117365837, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1593.0, "completions/max_terminated_length": 1593.0, "completions/mean_length": 382.78125, "completions/mean_terminated_length": 382.78125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.4758942723274231, "epoch": 0.5269607843137255, "frac_reward_zero_std": 1.0, "grad_norm": 0.009911820702528178, "kl": 0.009016171097755432, "learning_rate": 9.831009203801822e-07, "loss": 0.0001, "num_tokens": 18326328.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4273873567581177, "sampling/importance_sampling_ratio/mean": 0.9999033212661743, "sampling/importance_sampling_ratio/min": 0.5262091755867004, "sampling/sampling_logp_difference/max": 0.6420564651489258, "sampling/sampling_logp_difference/mean": 0.014082267880439758, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 429.171875, "completions/mean_terminated_length": 429.171875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.4669371545314789, "epoch": 0.5281862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.6907200510674834, "kl": 0.0094781294465065, "learning_rate": 9.829167881768277e-07, "loss": -0.0244, "num_tokens": 18374195.0, "reward": -0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": -0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996737241744995, "sampling/importance_sampling_ratio/min": 0.2538463771343231, "sampling/sampling_logp_difference/max": 1.3710259199142456, "sampling/sampling_logp_difference/mean": 0.01350916363298893, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 405.703125, "completions/mean_terminated_length": 405.703125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.5928594470024109, "epoch": 0.5294117647058824, "frac_reward_zero_std": 0.75, "grad_norm": 0.6950791831084212, "kl": 0.008725784718990326, "learning_rate": 9.82731675682987e-07, "loss": 0.0171, "num_tokens": 18420096.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.3959887027740479, "sampling/importance_sampling_ratio/mean": 0.9994189739227295, "sampling/importance_sampling_ratio/min": 0.6030636429786682, "sampling/sampling_logp_difference/max": 0.505732536315918, "sampling/sampling_logp_difference/mean": 0.016530536115169525, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1475.0, "completions/max_terminated_length": 1475.0, "completions/mean_length": 469.421875, "completions/mean_terminated_length": 469.421875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.4766601324081421, "epoch": 0.5306372549019608, "frac_reward_zero_std": 0.75, "grad_norm": 0.5228286106025906, "kl": 0.006615672260522842, "learning_rate": 9.825455832744266e-07, "loss": 0.0074, "num_tokens": 18470907.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.948551893234253, "sampling/importance_sampling_ratio/mean": 1.0001471042633057, "sampling/importance_sampling_ratio/min": 0.44628918170928955, "sampling/sampling_logp_difference/max": 0.8067882061004639, "sampling/sampling_logp_difference/mean": 0.013530303724110126, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 929.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 482.984375, "completions/mean_terminated_length": 482.984375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.5816645622253418, "epoch": 0.5318627450980392, "frac_reward_zero_std": 1.0, "grad_norm": 0.010799697026994166, "kl": 0.007967125624418259, "learning_rate": 9.823585113289023e-07, "loss": 0.0001, "num_tokens": 18530522.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.616102933883667, "sampling/importance_sampling_ratio/mean": 0.9999399185180664, "sampling/importance_sampling_ratio/min": 0.5400082468986511, "sampling/sampling_logp_difference/max": 0.6161708831787109, "sampling/sampling_logp_difference/mean": 0.016719266772270203, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 907.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 416.4375, "completions/mean_terminated_length": 416.4375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.5502423048019409, "epoch": 0.5330882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.01140423300642042, "kl": 0.010603120550513268, "learning_rate": 9.821704602261585e-07, "loss": 0.0001, "num_tokens": 18579542.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4649577140808105, "sampling/importance_sampling_ratio/mean": 0.9998550415039062, "sampling/importance_sampling_ratio/min": 0.372272789478302, "sampling/sampling_logp_difference/max": 0.9881284236907959, "sampling/sampling_logp_difference/mean": 0.016059566289186478, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 320.0, "completions/mean_terminated_length": 320.0, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.4394180178642273, "epoch": 0.5343137254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.6105013074847713, "kl": 0.011131037026643753, "learning_rate": 9.819814303479267e-07, "loss": 0.0071, "num_tokens": 18616742.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.7304489612579346, "sampling/importance_sampling_ratio/mean": 1.0000858306884766, "sampling/importance_sampling_ratio/min": 0.7045102715492249, "sampling/sampling_logp_difference/max": 0.5483808517456055, "sampling/sampling_logp_difference/mean": 0.013144023716449738, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 314.1875, "completions/mean_terminated_length": 314.1875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.5403032302856445, "epoch": 0.5355392156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.015937994877750345, "kl": 0.01106307003647089, "learning_rate": 9.817914220779256e-07, "loss": 0.0001, "num_tokens": 18654882.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6274499893188477, "sampling/importance_sampling_ratio/mean": 0.9997000694274902, "sampling/importance_sampling_ratio/min": 0.6577688455581665, "sampling/sampling_logp_difference/max": 0.4870142936706543, "sampling/sampling_logp_difference/mean": 0.016290977597236633, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 296.078125, "completions/mean_terminated_length": 296.078125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.5009875297546387, "epoch": 0.5367647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.013979013049918351, "kl": 0.011295182630419731, "learning_rate": 9.816004358018603e-07, "loss": 0.0001, "num_tokens": 18691239.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4236782789230347, "sampling/importance_sampling_ratio/mean": 0.9995468854904175, "sampling/importance_sampling_ratio/min": 0.6771499514579773, "sampling/sampling_logp_difference/max": 0.3898625373840332, "sampling/sampling_logp_difference/mean": 0.015363456681370735, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 238.828125, "completions/mean_terminated_length": 238.828125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.43725284934043884, "epoch": 0.5379901960784313, "frac_reward_zero_std": 1.0, "grad_norm": 0.020062066874179583, "kl": 0.0133253438398242, "learning_rate": 9.814084719074204e-07, "loss": 0.0001, "num_tokens": 18722748.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6363584995269775, "sampling/importance_sampling_ratio/mean": 1.0005732774734497, "sampling/importance_sampling_ratio/min": 0.700526237487793, "sampling/sampling_logp_difference/max": 0.4924733638763428, "sampling/sampling_logp_difference/mean": 0.015090974979102612, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 363.78125, "completions/mean_terminated_length": 363.78125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.5925343036651611, "epoch": 0.5392156862745098, "frac_reward_zero_std": 0.5, "grad_norm": 0.7337069052815051, "kl": 0.012147855013608932, "learning_rate": 9.81215530784281e-07, "loss": -0.0257, "num_tokens": 18761886.0, "reward": -0.1875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.435200810432434, "sampling/importance_sampling_ratio/mean": 1.0001330375671387, "sampling/importance_sampling_ratio/min": 0.6841229200363159, "sampling/sampling_logp_difference/max": 0.37961769104003906, "sampling/sampling_logp_difference/mean": 0.015470972284674644, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 225.640625, "completions/mean_terminated_length": 225.640625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.41190338134765625, "epoch": 0.5404411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.018889641130865845, "kl": 0.012833962216973305, "learning_rate": 9.810216128240996e-07, "loss": 0.0001, "num_tokens": 18790615.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9285204410552979, "sampling/importance_sampling_ratio/mean": 1.000483751296997, "sampling/importance_sampling_ratio/min": 0.6157411336898804, "sampling/sampling_logp_difference/max": 0.6567530632019043, "sampling/sampling_logp_difference/mean": 0.01476193405687809, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 924.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 316.890625, "completions/mean_terminated_length": 316.890625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.4268978238105774, "epoch": 0.5416666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.01242367423727755, "kl": 0.007485729642212391, "learning_rate": 9.808267184205181e-07, "loss": 0.0001, "num_tokens": 18832528.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5071556568145752, "sampling/importance_sampling_ratio/mean": 1.0008069276809692, "sampling/importance_sampling_ratio/min": 0.6111128926277161, "sampling/sampling_logp_difference/max": 0.4924736022949219, "sampling/sampling_logp_difference/mean": 0.012631865218281746, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 375.828125, "completions/mean_terminated_length": 375.828125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.5734411478042603, "epoch": 0.5428921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 0.887667277516138, "kl": 0.01094148214906454, "learning_rate": 9.806308479691594e-07, "loss": -0.0266, "num_tokens": 18877813.0, "reward": -0.0625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.6141189336776733, "sampling/importance_sampling_ratio/mean": 1.0005214214324951, "sampling/importance_sampling_ratio/min": 0.6889285445213318, "sampling/sampling_logp_difference/max": 0.4787893295288086, "sampling/sampling_logp_difference/mean": 0.015951339155435562, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 783.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 389.21875, "completions/mean_terminated_length": 389.21875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.517313539981842, "epoch": 0.5441176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.8491596577337914, "kl": 0.009967934340238571, "learning_rate": 9.80434001867628e-07, "loss": 0.0497, "num_tokens": 18927619.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.938145637512207, "sampling/importance_sampling_ratio/mean": 0.9998151063919067, "sampling/importance_sampling_ratio/min": 0.5180969834327698, "sampling/sampling_logp_difference/max": 0.6617317199707031, "sampling/sampling_logp_difference/mean": 0.016022633761167526, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 274.796875, "completions/mean_terminated_length": 274.796875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.4159596562385559, "epoch": 0.5453431372549019, "frac_reward_zero_std": 1.0, "grad_norm": 0.012214114518990671, "kl": 0.007587377913296223, "learning_rate": 9.802361805155097e-07, "loss": 0.0001, "num_tokens": 18958182.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3555891513824463, "sampling/importance_sampling_ratio/mean": 0.9999185800552368, "sampling/importance_sampling_ratio/min": 0.6925557255744934, "sampling/sampling_logp_difference/max": 0.3673665523529053, "sampling/sampling_logp_difference/mean": 0.013740738853812218, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1148.0, "completions/max_terminated_length": 1148.0, "completions/mean_length": 443.53125, "completions/mean_terminated_length": 443.53125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.48682963848114014, "epoch": 0.5465686274509803, "frac_reward_zero_std": 0.5, "grad_norm": 0.7397401921582077, "kl": 0.007162432186305523, "learning_rate": 9.800373843143683e-07, "loss": 0.0387, "num_tokens": 19014312.0, "reward": 0.375, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999200105667114, "sampling/importance_sampling_ratio/min": 0.3135911822319031, "sampling/sampling_logp_difference/max": 1.1596651077270508, "sampling/sampling_logp_difference/mean": 0.014190856367349625, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 317.59375, "completions/mean_terminated_length": 317.59375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.480175644159317, "epoch": 0.5477941176470589, "frac_reward_zero_std": 0.75, "grad_norm": 0.6462339680358771, "kl": 0.009109245613217354, "learning_rate": 9.798376136677484e-07, "loss": -0.0204, "num_tokens": 19051598.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.673612356185913, "sampling/importance_sampling_ratio/mean": 1.000412940979004, "sampling/importance_sampling_ratio/min": 0.6532645225524902, "sampling/sampling_logp_difference/max": 0.5149843692779541, "sampling/sampling_logp_difference/mean": 0.014607889577746391, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 386.765625, "completions/mean_terminated_length": 386.765625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.4678851366043091, "epoch": 0.5490196078431373, "frac_reward_zero_std": 1.0, "grad_norm": 0.01293398881973557, "kl": 0.009703919291496277, "learning_rate": 9.796368689811712e-07, "loss": 0.0001, "num_tokens": 19094335.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3913686275482178, "sampling/importance_sampling_ratio/mean": 0.9996761679649353, "sampling/importance_sampling_ratio/min": 0.627776026725769, "sampling/sampling_logp_difference/max": 0.46557188034057617, "sampling/sampling_logp_difference/mean": 0.014894812367856503, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 264.46875, "completions/mean_terminated_length": 264.46875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.5280132293701172, "epoch": 0.5502450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.016894525082821316, "kl": 0.010450221598148346, "learning_rate": 9.79435150662136e-07, "loss": 0.0001, "num_tokens": 19124445.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4352926015853882, "sampling/importance_sampling_ratio/mean": 1.0003700256347656, "sampling/importance_sampling_ratio/min": 0.616044819355011, "sampling/sampling_logp_difference/max": 0.4844355583190918, "sampling/sampling_logp_difference/mean": 0.015505093149840832, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 790.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 332.34375, "completions/mean_terminated_length": 332.34375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.4909639358520508, "epoch": 0.5514705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.012588097725780723, "kl": 0.00781976617872715, "learning_rate": 9.792324591201177e-07, "loss": 0.0001, "num_tokens": 19169299.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.400328278541565, "sampling/importance_sampling_ratio/mean": 0.9999714493751526, "sampling/importance_sampling_ratio/min": 0.7089029550552368, "sampling/sampling_logp_difference/max": 0.3440365791320801, "sampling/sampling_logp_difference/mean": 0.01451482530683279, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1121.0, "completions/max_terminated_length": 1121.0, "completions/mean_length": 398.75, "completions/mean_terminated_length": 398.75, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.4690524935722351, "epoch": 0.5526960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.01563789973197071, "kl": 0.009900645352900028, "learning_rate": 9.790287947665681e-07, "loss": 0.0001, "num_tokens": 19216707.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5984046459197998, "sampling/importance_sampling_ratio/mean": 1.0006173849105835, "sampling/importance_sampling_ratio/min": 0.5900133848190308, "sampling/sampling_logp_difference/max": 0.5276100635528564, "sampling/sampling_logp_difference/mean": 0.01505946833640337, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 360.9375, "completions/mean_terminated_length": 360.9375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.48329469561576843, "epoch": 0.553921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.7508300546029819, "kl": 0.006920414976775646, "learning_rate": 9.788241580149122e-07, "loss": 0.0159, "num_tokens": 19262271.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4718111753463745, "sampling/importance_sampling_ratio/mean": 0.9999725222587585, "sampling/importance_sampling_ratio/min": 0.6947948336601257, "sampling/sampling_logp_difference/max": 0.3864936828613281, "sampling/sampling_logp_difference/mean": 0.014035481959581375, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 270.78125, "completions/mean_terminated_length": 270.78125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.524167001247406, "epoch": 0.5551470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.6873190513635767, "kl": 0.01436869241297245, "learning_rate": 9.786185492805501e-07, "loss": -0.0101, "num_tokens": 19295057.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.00006103515625, "sampling/importance_sampling_ratio/min": 0.6737567186355591, "sampling/sampling_logp_difference/max": 0.7390550374984741, "sampling/sampling_logp_difference/mean": 0.016703473404049873, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 338.125, "completions/mean_terminated_length": 338.125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.45524275302886963, "epoch": 0.5563725490196079, "frac_reward_zero_std": 0.5, "grad_norm": 0.7684671718819818, "kl": 0.010827132500708103, "learning_rate": 9.784119689808542e-07, "loss": -0.0124, "num_tokens": 19336393.0, "reward": 0.4375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5796773433685303, "sampling/importance_sampling_ratio/mean": 1.000125527381897, "sampling/importance_sampling_ratio/min": 0.6954967975616455, "sampling/sampling_logp_difference/max": 0.45722055435180664, "sampling/sampling_logp_difference/mean": 0.013932979665696621, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1153.0, "completions/max_terminated_length": 1153.0, "completions/mean_length": 376.890625, "completions/mean_terminated_length": 376.890625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.45477816462516785, "epoch": 0.5575980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.664348313455457, "kl": 0.010480504482984543, "learning_rate": 9.782044175351699e-07, "loss": -0.0217, "num_tokens": 19380658.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.460005760192871, "sampling/importance_sampling_ratio/mean": 0.9999960660934448, "sampling/importance_sampling_ratio/min": 0.572995662689209, "sampling/sampling_logp_difference/max": 0.5568771362304688, "sampling/sampling_logp_difference/mean": 0.014640471898019314, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 360.15625, "completions/mean_terminated_length": 360.15625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.5603446960449219, "epoch": 0.5588235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.013663673633796336, "kl": 0.008531229570508003, "learning_rate": 9.779958953648129e-07, "loss": 0.0001, "num_tokens": 19422940.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4075660705566406, "sampling/importance_sampling_ratio/mean": 1.00009024143219, "sampling/importance_sampling_ratio/min": 0.6884828805923462, "sampling/sampling_logp_difference/max": 0.37326478958129883, "sampling/sampling_logp_difference/mean": 0.014874497428536415, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 252.765625, "completions/mean_terminated_length": 252.765625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.5780113935470581, "epoch": 0.5600490196078431, "frac_reward_zero_std": 0.75, "grad_norm": 0.7759419502144755, "kl": 0.016921930015087128, "learning_rate": 9.777864028930705e-07, "loss": 0.0085, "num_tokens": 19454925.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4065632820129395, "sampling/importance_sampling_ratio/mean": 1.0001938343048096, "sampling/importance_sampling_ratio/min": 0.5963423848152161, "sampling/sampling_logp_difference/max": 0.5169403553009033, "sampling/sampling_logp_difference/mean": 0.017642803490161896, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 310.9375, "completions/mean_terminated_length": 310.9375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.2957494854927063, "epoch": 0.5612745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.011004529079387051, "kl": 0.0068382686004042625, "learning_rate": 9.775759405451986e-07, "loss": 0.0001, "num_tokens": 19495513.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5578750371932983, "sampling/importance_sampling_ratio/mean": 1.0000064373016357, "sampling/importance_sampling_ratio/min": 0.5854133367538452, "sampling/sampling_logp_difference/max": 0.5354371070861816, "sampling/sampling_logp_difference/mean": 0.010777350515127182, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 384.1875, "completions/mean_terminated_length": 384.1875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.5290184617042542, "epoch": 0.5625, "frac_reward_zero_std": 1.0, "grad_norm": 0.011894776172345146, "kl": 0.008571705780923367, "learning_rate": 9.773645087484228e-07, "loss": 0.0001, "num_tokens": 19536997.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4352070093154907, "sampling/importance_sampling_ratio/mean": 0.9998887181282043, "sampling/importance_sampling_ratio/min": 0.6276262998580933, "sampling/sampling_logp_difference/max": 0.4658103585243225, "sampling/sampling_logp_difference/mean": 0.015627212822437286, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 332.875, "completions/mean_terminated_length": 332.875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.3707161545753479, "epoch": 0.5637254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.011890233270510059, "kl": 0.0069488706067204475, "learning_rate": 9.771521079319363e-07, "loss": 0.0001, "num_tokens": 19579293.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000385046005249, "sampling/importance_sampling_ratio/min": 0.5437952280044556, "sampling/sampling_logp_difference/max": 0.9685087203979492, "sampling/sampling_logp_difference/mean": 0.01225197035819292, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 316.890625, "completions/mean_terminated_length": 316.890625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.4821564555168152, "epoch": 0.5649509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.012002035015850692, "kl": 0.00849146768450737, "learning_rate": 9.76938738526899e-07, "loss": 0.0001, "num_tokens": 19620838.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5404114723205566, "sampling/importance_sampling_ratio/mean": 1.0001474618911743, "sampling/importance_sampling_ratio/min": 0.6804428696632385, "sampling/sampling_logp_difference/max": 0.4320495128631592, "sampling/sampling_logp_difference/mean": 0.01465106662362814, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 369.015625, "completions/mean_terminated_length": 369.015625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.32366156578063965, "epoch": 0.5661764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.00950497378063026, "kl": 0.0062925685197114944, "learning_rate": 9.767244009664376e-07, "loss": 0.0001, "num_tokens": 19667111.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.459479808807373, "sampling/importance_sampling_ratio/mean": 1.000180721282959, "sampling/importance_sampling_ratio/min": 0.6117788553237915, "sampling/sampling_logp_difference/max": 0.49138450622558594, "sampling/sampling_logp_difference/mean": 0.011026697233319283, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 379.65625, "completions/mean_terminated_length": 379.65625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.43677449226379395, "epoch": 0.5674019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.010173814858493113, "kl": 0.006671119015663862, "learning_rate": 9.765090956856435e-07, "loss": 0.0001, "num_tokens": 19709809.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6244935989379883, "sampling/importance_sampling_ratio/mean": 0.9996877908706665, "sampling/importance_sampling_ratio/min": 0.6916855573654175, "sampling/sampling_logp_difference/max": 0.4851961135864258, "sampling/sampling_logp_difference/mean": 0.013114294037222862, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 394.75, "completions/mean_terminated_length": 394.75, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.525558352470398, "epoch": 0.5686274509803921, "frac_reward_zero_std": 0.5, "grad_norm": 0.8387592423797123, "kl": 0.011691516265273094, "learning_rate": 9.76292823121573e-07, "loss": 0.0183, "num_tokens": 19755521.0, "reward": 0.375, "reward_std": 0.481805682182312, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001029968261719, "sampling/importance_sampling_ratio/min": 0.49625363945961, "sampling/sampling_logp_difference/max": 0.8018059730529785, "sampling/sampling_logp_difference/mean": 0.016435515135526657, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 392.671875, "completions/mean_terminated_length": 392.671875, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.4251392185688019, "epoch": 0.5698529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.6557449229408352, "kl": 0.007682814262807369, "learning_rate": 9.760755837132457e-07, "loss": 0.0211, "num_tokens": 19806252.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4006872177124023, "sampling/importance_sampling_ratio/mean": 0.9997063875198364, "sampling/importance_sampling_ratio/min": 0.6078288555145264, "sampling/sampling_logp_difference/max": 0.4978618621826172, "sampling/sampling_logp_difference/mean": 0.01323317177593708, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 217.953125, "completions/mean_terminated_length": 217.953125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.4774248003959656, "epoch": 0.571078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.7695632237732828, "kl": 0.014072461053729057, "learning_rate": 9.758573779016436e-07, "loss": 0.0318, "num_tokens": 19832233.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.453607201576233, "sampling/importance_sampling_ratio/mean": 0.9996403455734253, "sampling/importance_sampling_ratio/min": 0.6263987421989441, "sampling/sampling_logp_difference/max": 0.46776819229125977, "sampling/sampling_logp_difference/mean": 0.016757836565375328, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 349.8125, "completions/mean_terminated_length": 349.8125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.40837669372558594, "epoch": 0.5723039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 0.540093571166408, "kl": 0.010147717781364918, "learning_rate": 9.75638206129711e-07, "loss": -0.003, "num_tokens": 19869917.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4613875150680542, "sampling/importance_sampling_ratio/mean": 0.9995831251144409, "sampling/importance_sampling_ratio/min": 0.6909335255622864, "sampling/sampling_logp_difference/max": 0.379386305809021, "sampling/sampling_logp_difference/mean": 0.013787595555186272, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 316.515625, "completions/mean_terminated_length": 316.515625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.6752997636795044, "epoch": 0.5735294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.818889376823017, "kl": 0.01571234129369259, "learning_rate": 9.754180688423524e-07, "loss": 0.0089, "num_tokens": 19909214.0, "reward": 0.625, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.5467478036880493, "sampling/importance_sampling_ratio/mean": 0.9999995827674866, "sampling/importance_sampling_ratio/min": 0.6551179885864258, "sampling/sampling_logp_difference/max": 0.4361546039581299, "sampling/sampling_logp_difference/mean": 0.01868288218975067, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 356.34375, "completions/mean_terminated_length": 356.34375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.5488570928573608, "epoch": 0.5747549019607843, "frac_reward_zero_std": 0.25, "grad_norm": 0.912046629234056, "kl": 0.010272404178977013, "learning_rate": 9.751969664864326e-07, "loss": 0.0207, "num_tokens": 19951348.0, "reward": 0.40625, "reward_std": 0.519389271736145, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.3740123510360718, "sampling/importance_sampling_ratio/mean": 0.999782919883728, "sampling/importance_sampling_ratio/min": 0.7134128212928772, "sampling/sampling_logp_difference/max": 0.3376951217651367, "sampling/sampling_logp_difference/mean": 0.015747036784887314, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 343.796875, "completions/mean_terminated_length": 343.796875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.5533717274665833, "epoch": 0.5759803921568627, "frac_reward_zero_std": 0.5, "grad_norm": 0.8687031525182312, "kl": 0.011935630813241005, "learning_rate": 9.749748995107756e-07, "loss": 0.0007, "num_tokens": 19990823.0, "reward": 0.0625, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4351999759674072, "sampling/importance_sampling_ratio/mean": 0.9996095895767212, "sampling/importance_sampling_ratio/min": 0.6060752868652344, "sampling/sampling_logp_difference/max": 0.5007510185241699, "sampling/sampling_logp_difference/mean": 0.016084488481283188, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 716.0, "completions/max_terminated_length": 716.0, "completions/mean_length": 360.796875, "completions/mean_terminated_length": 360.796875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.42497777938842773, "epoch": 0.5772058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.012791595001087626, "kl": 0.00913158617913723, "learning_rate": 9.74751868366163e-07, "loss": 0.0001, "num_tokens": 20031898.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4040508270263672, "sampling/importance_sampling_ratio/mean": 1.000016689300537, "sampling/importance_sampling_ratio/min": 0.6204608678817749, "sampling/sampling_logp_difference/max": 0.4772927761077881, "sampling/sampling_logp_difference/mean": 0.013213417492806911, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 383.15625, "completions/mean_terminated_length": 383.15625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.6443812847137451, "epoch": 0.5784313725490197, "frac_reward_zero_std": 0.5, "grad_norm": 0.7662417021657064, "kl": 0.011527059599757195, "learning_rate": 9.745278735053343e-07, "loss": -0.0466, "num_tokens": 20084692.0, "reward": -0.21875, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": -0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.4837896823883057, "sampling/importance_sampling_ratio/mean": 0.9996339082717896, "sampling/importance_sampling_ratio/min": 0.5914270281791687, "sampling/sampling_logp_difference/max": 0.5252169370651245, "sampling/sampling_logp_difference/mean": 0.017904652282595634, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 339.296875, "completions/mean_terminated_length": 339.296875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.4992023706436157, "epoch": 0.5796568627450981, "frac_reward_zero_std": 0.5, "grad_norm": 0.7131284932478158, "kl": 0.01318102516233921, "learning_rate": 9.743029153829845e-07, "loss": -0.0454, "num_tokens": 20125831.0, "reward": 0.53125, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6543093919754028, "sampling/importance_sampling_ratio/mean": 0.9996520280838013, "sampling/importance_sampling_ratio/min": 0.6160585880279541, "sampling/sampling_logp_difference/max": 0.5033836364746094, "sampling/sampling_logp_difference/mean": 0.014915497973561287, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 296.125, "completions/mean_terminated_length": 296.125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.5474790930747986, "epoch": 0.5808823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 1.110513472256323, "kl": 0.012099158018827438, "learning_rate": 9.740769944557644e-07, "loss": -0.0432, "num_tokens": 20168639.0, "reward": -0.125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.2867412567138672, "sampling/importance_sampling_ratio/mean": 1.0001749992370605, "sampling/importance_sampling_ratio/min": 0.6303616762161255, "sampling/sampling_logp_difference/max": 0.46146154403686523, "sampling/sampling_logp_difference/mean": 0.016308285295963287, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 320.65625, "completions/mean_terminated_length": 320.65625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.49292558431625366, "epoch": 0.5821078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.7447775187058583, "kl": 0.011737293563783169, "learning_rate": 9.738501111822792e-07, "loss": -0.0165, "num_tokens": 20211833.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5968154668807983, "sampling/importance_sampling_ratio/mean": 1.0000004768371582, "sampling/importance_sampling_ratio/min": 0.30434948205947876, "sampling/sampling_logp_difference/max": 1.1895785331726074, "sampling/sampling_logp_difference/mean": 0.01492564007639885, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 237.4375, "completions/mean_terminated_length": 237.4375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.376506507396698, "epoch": 0.5833333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.019949849278521083, "kl": 0.012760442681610584, "learning_rate": 9.736222660230878e-07, "loss": 0.0001, "num_tokens": 20248917.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4234800338745117, "sampling/importance_sampling_ratio/mean": 0.9998863339424133, "sampling/importance_sampling_ratio/min": 0.6644997596740723, "sampling/sampling_logp_difference/max": 0.4087207317352295, "sampling/sampling_logp_difference/mean": 0.014670666307210922, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 368.59375, "completions/mean_terminated_length": 368.59375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.570227861404419, "epoch": 0.5845588235294118, "frac_reward_zero_std": 0.75, "grad_norm": 0.5791186397403194, "kl": 0.013337528333067894, "learning_rate": 9.73393459440701e-07, "loss": 0.0024, "num_tokens": 20292395.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.6155208349227905, "sampling/importance_sampling_ratio/mean": 1.0003381967544556, "sampling/importance_sampling_ratio/min": 0.6470731496810913, "sampling/sampling_logp_difference/max": 0.4796574115753174, "sampling/sampling_logp_difference/mean": 0.017941348254680634, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 432.359375, "completions/mean_terminated_length": 432.359375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.4905966818332672, "epoch": 0.5857843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.5093736477628529, "kl": 0.009135225787758827, "learning_rate": 9.73163691899582e-07, "loss": -0.0071, "num_tokens": 20340178.0, "reward": -0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5745823383331299, "sampling/importance_sampling_ratio/mean": 0.9997084736824036, "sampling/importance_sampling_ratio/min": 0.6924284100532532, "sampling/sampling_logp_difference/max": 0.45398998260498047, "sampling/sampling_logp_difference/mean": 0.014028344303369522, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 228.84375, "completions/mean_terminated_length": 228.84375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.4381697177886963, "epoch": 0.5870098039215687, "frac_reward_zero_std": 0.75, "grad_norm": 0.8471655383539695, "kl": 0.01867147907614708, "learning_rate": 9.729329638661444e-07, "loss": 0.0243, "num_tokens": 20372888.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.7855929136276245, "sampling/importance_sampling_ratio/mean": 1.0006465911865234, "sampling/importance_sampling_ratio/min": 0.6651729941368103, "sampling/sampling_logp_difference/max": 0.5797505378723145, "sampling/sampling_logp_difference/mean": 0.015415986999869347, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1200.0, "completions/max_terminated_length": 1200.0, "completions/mean_length": 463.03125, "completions/mean_terminated_length": 463.03125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.404219388961792, "epoch": 0.5882352941176471, "frac_reward_zero_std": 0.75, "grad_norm": 0.5049451962486377, "kl": 0.009356936439871788, "learning_rate": 9.727012758087512e-07, "loss": 0.0033, "num_tokens": 20422170.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.435335636138916, "sampling/importance_sampling_ratio/mean": 1.0001070499420166, "sampling/importance_sampling_ratio/min": 0.5255594849586487, "sampling/sampling_logp_difference/max": 0.6432919502258301, "sampling/sampling_logp_difference/mean": 0.013228632509708405, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 812.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 373.015625, "completions/mean_terminated_length": 373.015625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.45835280418395996, "epoch": 0.5894607843137255, "frac_reward_zero_std": 0.25, "grad_norm": 0.9851747439224965, "kl": 0.0118996761739254, "learning_rate": 9.724686281977146e-07, "loss": 0.0342, "num_tokens": 20466283.0, "reward": 0.4375, "reward_std": 0.4973389506340027, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.546732783317566, "sampling/importance_sampling_ratio/mean": 0.9996258616447449, "sampling/importance_sampling_ratio/min": 0.5180447697639465, "sampling/sampling_logp_difference/max": 0.65769362449646, "sampling/sampling_logp_difference/mean": 0.013937268406152725, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 357.984375, "completions/mean_terminated_length": 357.984375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.593146562576294, "epoch": 0.5906862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.014314226449441986, "kl": 0.014483312144875526, "learning_rate": 9.722350215052946e-07, "loss": 0.0001, "num_tokens": 20511626.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6447120904922485, "sampling/importance_sampling_ratio/mean": 1.0001804828643799, "sampling/importance_sampling_ratio/min": 0.6681034564971924, "sampling/sampling_logp_difference/max": 0.49756526947021484, "sampling/sampling_logp_difference/mean": 0.017075400799512863, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 309.859375, "completions/mean_terminated_length": 309.859375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.40712323784828186, "epoch": 0.5919117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.9676199930404759, "kl": 0.011711872182786465, "learning_rate": 9.720004562056979e-07, "loss": -0.019, "num_tokens": 20549345.0, "reward": 0.09375, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.354978084564209, "sampling/importance_sampling_ratio/mean": 1.0001581907272339, "sampling/importance_sampling_ratio/min": 0.5250301957130432, "sampling/sampling_logp_difference/max": 0.6442995071411133, "sampling/sampling_logp_difference/mean": 0.013158833608031273, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 758.0, "completions/max_terminated_length": 758.0, "completions/mean_length": 334.375, "completions/mean_terminated_length": 334.375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.49657291173934937, "epoch": 0.5931372549019608, "frac_reward_zero_std": 0.5, "grad_norm": 0.7935961119350425, "kl": 0.014093760401010513, "learning_rate": 9.717649327750773e-07, "loss": -0.041, "num_tokens": 20589945.0, "reward": 0.8125, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.6125301122665405, "sampling/importance_sampling_ratio/mean": 1.0003741979599, "sampling/importance_sampling_ratio/min": 0.6577131748199463, "sampling/sampling_logp_difference/max": 0.47780442237854004, "sampling/sampling_logp_difference/mean": 0.015174554660916328, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1416.0, "completions/max_terminated_length": 1416.0, "completions/mean_length": 386.84375, "completions/mean_terminated_length": 386.84375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.3591594398021698, "epoch": 0.5943627450980392, "frac_reward_zero_std": 0.75, "grad_norm": 0.7034945835421502, "kl": 0.008480289950966835, "learning_rate": 9.7152845169153e-07, "loss": -0.1078, "num_tokens": 20631903.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6007341146469116, "sampling/importance_sampling_ratio/mean": 0.9999156594276428, "sampling/importance_sampling_ratio/min": 0.6171379685401917, "sampling/sampling_logp_difference/max": 0.4826626777648926, "sampling/sampling_logp_difference/mean": 0.011368841864168644, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 769.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 410.0625, "completions/mean_terminated_length": 410.0625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.5623461604118347, "epoch": 0.5955882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9306369428648532, "kl": 0.011310016736388206, "learning_rate": 9.712910134350984e-07, "loss": 0.1024, "num_tokens": 20675651.0, "reward": 0.84375, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.4166650772094727, "sampling/importance_sampling_ratio/mean": 0.9996798634529114, "sampling/importance_sampling_ratio/min": 0.5822422504425049, "sampling/sampling_logp_difference/max": 0.5408686399459839, "sampling/sampling_logp_difference/mean": 0.016486983746290207, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 941.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 336.15625, "completions/mean_terminated_length": 336.15625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.4990158677101135, "epoch": 0.5968137254901961, "frac_reward_zero_std": 0.5, "grad_norm": 0.7797609598104052, "kl": 0.014600454829633236, "learning_rate": 9.710526184877666e-07, "loss": -0.0364, "num_tokens": 20711437.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.615415334701538, "sampling/importance_sampling_ratio/mean": 1.000476598739624, "sampling/importance_sampling_ratio/min": 0.6469404101371765, "sampling/sampling_logp_difference/max": 0.47959208488464355, "sampling/sampling_logp_difference/mean": 0.015240306034684181, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 302.40625, "completions/mean_terminated_length": 302.40625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.4607468843460083, "epoch": 0.5980392156862745, "frac_reward_zero_std": 0.75, "grad_norm": 0.619314919220392, "kl": 0.015611632727086544, "learning_rate": 9.708132673334615e-07, "loss": 0.0081, "num_tokens": 20747015.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6977907419204712, "sampling/importance_sampling_ratio/mean": 0.9998382329940796, "sampling/importance_sampling_ratio/min": 0.5348140597343445, "sampling/sampling_logp_difference/max": 0.6258361339569092, "sampling/sampling_logp_difference/mean": 0.014138374477624893, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1082.0, "completions/max_terminated_length": 1082.0, "completions/mean_length": 445.78125, "completions/mean_terminated_length": 445.78125, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.5838571190834045, "epoch": 0.5992647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.6708070197714049, "kl": 0.012384839355945587, "learning_rate": 9.705729604580505e-07, "loss": -0.0336, "num_tokens": 20792393.0, "reward": 0.5625, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4032156467437744, "sampling/importance_sampling_ratio/mean": 1.0002527236938477, "sampling/importance_sampling_ratio/min": 0.5765946507453918, "sampling/sampling_logp_difference/max": 0.5506157875061035, "sampling/sampling_logp_difference/mean": 0.015412995591759682, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 365.59375, "completions/mean_terminated_length": 365.59375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.4130157232284546, "epoch": 0.6004901960784313, "frac_reward_zero_std": 0.25, "grad_norm": 0.8899361774031538, "kl": 0.011389223858714104, "learning_rate": 9.703316983493412e-07, "loss": 0.042, "num_tokens": 20832495.0, "reward": 0.3125, "reward_std": 0.5, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.4332804679870605, "sampling/importance_sampling_ratio/mean": 1.0001184940338135, "sampling/importance_sampling_ratio/min": 0.6472539305686951, "sampling/sampling_logp_difference/max": 0.4350166320800781, "sampling/sampling_logp_difference/mean": 0.012384315952658653, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1245.0, "completions/max_terminated_length": 1245.0, "completions/mean_length": 420.125, "completions/mean_terminated_length": 420.125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.4221573770046234, "epoch": 0.6017156862745098, "frac_reward_zero_std": 0.25, "grad_norm": 0.7574714434855864, "kl": 0.013254991732537746, "learning_rate": 9.700894814970808e-07, "loss": 0.0051, "num_tokens": 20873879.0, "reward": 0.21875, "reward_std": 0.5457825064659119, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.6629348993301392, "sampling/importance_sampling_ratio/mean": 1.000283122062683, "sampling/importance_sampling_ratio/min": 0.5589829683303833, "sampling/sampling_logp_difference/max": 0.5816363096237183, "sampling/sampling_logp_difference/mean": 0.012358050793409348, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 402.046875, "completions/mean_terminated_length": 402.046875, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "entropy": 0.3929465115070343, "epoch": 0.6029411764705882, "frac_reward_zero_std": 0.75, "grad_norm": 0.4405770822553843, "kl": 0.012394443154335022, "learning_rate": 9.698463103929541e-07, "loss": 0.0095, "num_tokens": 20916010.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4110153913497925, "sampling/importance_sampling_ratio/mean": 0.9998313784599304, "sampling/importance_sampling_ratio/min": 0.6333394646644592, "sampling/sampling_logp_difference/max": 0.45674872398376465, "sampling/sampling_logp_difference/mean": 0.01177979912608862, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 346.25, "completions/mean_terminated_length": 346.25, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.34366485476493835, "epoch": 0.6041666666666666, "frac_reward_zero_std": 0.75, "grad_norm": 0.5626801048768034, "kl": 0.015693364664912224, "learning_rate": 9.69602185530583e-07, "loss": 0.0127, "num_tokens": 20956362.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.3671519756317139, "sampling/importance_sampling_ratio/mean": 0.9996634721755981, "sampling/importance_sampling_ratio/min": 0.5665221810340881, "sampling/sampling_logp_difference/max": 0.5682390332221985, "sampling/sampling_logp_difference/mean": 0.011747309938073158, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1141.0, "completions/max_terminated_length": 1141.0, "completions/mean_length": 456.21875, "completions/mean_terminated_length": 456.21875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.5867520570755005, "epoch": 0.6053921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 0.7119615752719796, "kl": 0.01387823186814785, "learning_rate": 9.693571074055254e-07, "loss": 0.0492, "num_tokens": 21001704.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4893393516540527, "sampling/importance_sampling_ratio/mean": 1.0000396966934204, "sampling/importance_sampling_ratio/min": 0.6369433403015137, "sampling/sampling_logp_difference/max": 0.45107460021972656, "sampling/sampling_logp_difference/mean": 0.01601700484752655, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1654.0, "completions/max_terminated_length": 1654.0, "completions/mean_length": 483.8125, "completions/mean_terminated_length": 483.8125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.5275751352310181, "epoch": 0.6066176470588235, "frac_reward_zero_std": 0.5, "grad_norm": 0.7277639011606737, "kl": 0.016475841403007507, "learning_rate": 9.691110765152744e-07, "loss": -0.0004, "num_tokens": 21050636.0, "reward": -0.71875, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": -0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.609750509262085, "sampling/importance_sampling_ratio/mean": 1.0003583431243896, "sampling/importance_sampling_ratio/min": 0.47200924158096313, "sampling/sampling_logp_difference/max": 0.7507567405700684, "sampling/sampling_logp_difference/mean": 0.014974809251725674, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 446.8125, "completions/mean_terminated_length": 446.8125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.3733007609844208, "epoch": 0.6078431372549019, "frac_reward_zero_std": 0.75, "grad_norm": 0.5118963765177298, "kl": 0.011317054741084576, "learning_rate": 9.688640933592572e-07, "loss": 0.005, "num_tokens": 21094528.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.2939987182617188, "sampling/importance_sampling_ratio/mean": 0.9997740983963013, "sampling/importance_sampling_ratio/min": 0.387348473072052, "sampling/sampling_logp_difference/max": 0.9484305381774902, "sampling/sampling_logp_difference/mean": 0.010323213413357735, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 546.859375, "completions/mean_terminated_length": 546.859375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.6766536235809326, "epoch": 0.6090686274509803, "frac_reward_zero_std": 0.0, "grad_norm": 0.8519335455781267, "kl": 0.016889993101358414, "learning_rate": 9.686161584388339e-07, "loss": 0.0318, "num_tokens": 21146535.0, "reward": -0.53125, "reward_std": 0.7535127401351929, "rewards/decision_reward_func/mean": -0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.367415189743042, "sampling/importance_sampling_ratio/mean": 0.999870240688324, "sampling/importance_sampling_ratio/min": 0.69861900806427, "sampling/sampling_logp_difference/max": 0.35864973068237305, "sampling/sampling_logp_difference/mean": 0.017139490693807602, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 868.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 468.921875, "completions/mean_terminated_length": 468.921875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.6287610530853271, "epoch": 0.6102941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.677614120530804, "kl": 0.016639644280076027, "learning_rate": 9.683672722572966e-07, "loss": 0.0461, "num_tokens": 21193362.0, "reward": 0.15625, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.3555666208267212, "sampling/importance_sampling_ratio/mean": 1.0003186464309692, "sampling/importance_sampling_ratio/min": 0.7089166641235352, "sampling/sampling_logp_difference/max": 0.34401726722717285, "sampling/sampling_logp_difference/mean": 0.015504148788750172, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 868.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 402.953125, "completions/mean_terminated_length": 402.953125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.543073296546936, "epoch": 0.6115196078431373, "frac_reward_zero_std": 0.5, "grad_norm": 0.8398591146550262, "kl": 0.020147666335105896, "learning_rate": 9.681174353198686e-07, "loss": 0.0135, "num_tokens": 21237183.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.3680572509765625, "sampling/importance_sampling_ratio/mean": 0.9997414350509644, "sampling/importance_sampling_ratio/min": 0.47535887360572815, "sampling/sampling_logp_difference/max": 0.743685245513916, "sampling/sampling_logp_difference/mean": 0.01554950326681137, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1047.0, "completions/max_terminated_length": 1047.0, "completions/mean_length": 353.640625, "completions/mean_terminated_length": 353.640625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.633564293384552, "epoch": 0.6127450980392157, "frac_reward_zero_std": 0.25, "grad_norm": 1.2491709137134575, "kl": 0.031656257808208466, "learning_rate": 9.678666481337031e-07, "loss": 0.1351, "num_tokens": 21278280.0, "reward": 0.84375, "reward_std": 0.46656501293182373, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002896785736084, "sampling/importance_sampling_ratio/min": 0.6937450766563416, "sampling/sampling_logp_difference/max": 1.046295166015625, "sampling/sampling_logp_difference/mean": 0.01819945126771927, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 466.09375, "completions/mean_terminated_length": 466.09375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.4996642470359802, "epoch": 0.6139705882352942, "frac_reward_zero_std": 0.25, "grad_norm": 0.750197295629183, "kl": 0.017038879916071892, "learning_rate": 9.67614911207882e-07, "loss": -0.0271, "num_tokens": 21324750.0, "reward": 0.125, "reward_std": 0.6285127401351929, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.4032411575317383, "sampling/importance_sampling_ratio/mean": 0.9999557733535767, "sampling/importance_sampling_ratio/min": 0.6378746032714844, "sampling/sampling_logp_difference/max": 0.4496135711669922, "sampling/sampling_logp_difference/mean": 0.013633529655635357, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 457.421875, "completions/mean_terminated_length": 457.421875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.3832704424858093, "epoch": 0.6151960784313726, "frac_reward_zero_std": 0.5, "grad_norm": 0.8176424486837744, "kl": 0.013402925804257393, "learning_rate": 9.673622250534155e-07, "loss": 0.0457, "num_tokens": 21373593.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5278794765472412, "sampling/importance_sampling_ratio/mean": 0.9998066425323486, "sampling/importance_sampling_ratio/min": 0.6498814821243286, "sampling/sampling_logp_difference/max": 0.4309651851654053, "sampling/sampling_logp_difference/mean": 0.010751089081168175, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.4678614139556885, "epoch": 0.616421568627451, "frac_reward_zero_std": 0.5, "grad_norm": 1.3325200988581203, "kl": 0.031888678669929504, "learning_rate": 9.671085901832404e-07, "loss": 0.0136, "num_tokens": 21404025.0, "reward": 0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.3969424962997437, "sampling/importance_sampling_ratio/mean": 1.0003552436828613, "sampling/importance_sampling_ratio/min": 0.4375641942024231, "sampling/sampling_logp_difference/max": 0.8265318870544434, "sampling/sampling_logp_difference/mean": 0.014766569249331951, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 425.890625, "completions/mean_terminated_length": 425.890625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.5149611234664917, "epoch": 0.6176470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.7124946647524734, "kl": 0.021874921396374702, "learning_rate": 9.668540071122195e-07, "loss": 0.0616, "num_tokens": 21447250.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.428485631942749, "sampling/importance_sampling_ratio/mean": 0.9999268651008606, "sampling/importance_sampling_ratio/min": 0.5755301117897034, "sampling/sampling_logp_difference/max": 0.5524637699127197, "sampling/sampling_logp_difference/mean": 0.014709859155118465, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 346.765625, "completions/mean_terminated_length": 346.765625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.5040950775146484, "epoch": 0.6188725490196079, "frac_reward_zero_std": 0.75, "grad_norm": 0.4685806475401511, "kl": 0.020670205354690552, "learning_rate": 9.665984763571402e-07, "loss": -0.009, "num_tokens": 21487811.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.5302246809005737, "sampling/importance_sampling_ratio/mean": 0.9998977184295654, "sampling/importance_sampling_ratio/min": 0.6670789122581482, "sampling/sampling_logp_difference/max": 0.4254145622253418, "sampling/sampling_logp_difference/mean": 0.013146307319402695, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 454.546875, "completions/mean_terminated_length": 454.546875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.4359341859817505, "epoch": 0.6200980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.5398082519514399, "kl": 0.020259380340576172, "learning_rate": 9.663419984367137e-07, "loss": -0.018, "num_tokens": 21535830.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5733730792999268, "sampling/importance_sampling_ratio/mean": 1.0001991987228394, "sampling/importance_sampling_ratio/min": 0.40676799416542053, "sampling/sampling_logp_difference/max": 0.8995122909545898, "sampling/sampling_logp_difference/mean": 0.012344993650913239, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 430.40625, "completions/mean_terminated_length": 430.40625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "entropy": 0.5192615389823914, "epoch": 0.6213235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.817777387971958, "kl": 0.01912645809352398, "learning_rate": 9.660845738715742e-07, "loss": 0.0154, "num_tokens": 21579264.0, "reward": 0.5, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.384622573852539, "sampling/importance_sampling_ratio/mean": 0.9998804330825806, "sampling/importance_sampling_ratio/min": 0.46197131276130676, "sampling/sampling_logp_difference/max": 0.7722525596618652, "sampling/sampling_logp_difference/mean": 0.0137818967923522, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 391.765625, "completions/mean_terminated_length": 391.765625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.6357569694519043, "epoch": 0.6225490196078431, "frac_reward_zero_std": 0.0, "grad_norm": 1.0336682859581021, "kl": 0.0180780328810215, "learning_rate": 9.658262031842769e-07, "loss": -0.0169, "num_tokens": 21622737.0, "reward": 0.5, "reward_std": 0.7059217691421509, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999750852584839, "sampling/importance_sampling_ratio/min": 0.6072848439216614, "sampling/sampling_logp_difference/max": 0.773681640625, "sampling/sampling_logp_difference/mean": 0.015597140416502953, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 420.859375, "completions/mean_terminated_length": 420.859375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.6357181668281555, "epoch": 0.6237745098039216, "frac_reward_zero_std": 0.5, "grad_norm": 0.8120496916132142, "kl": 0.026201242581009865, "learning_rate": 9.655668868992983e-07, "loss": -0.0318, "num_tokens": 21669256.0, "reward": 0.5625, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6660155057907104, "sampling/importance_sampling_ratio/mean": 1.000308871269226, "sampling/importance_sampling_ratio/min": 0.6932603120803833, "sampling/sampling_logp_difference/max": 0.5104348659515381, "sampling/sampling_logp_difference/mean": 0.015852190554142, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 297.671875, "completions/mean_terminated_length": 297.671875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.5518385767936707, "epoch": 0.625, "frac_reward_zero_std": 1.0, "grad_norm": 0.032627329375102494, "kl": 0.03265194594860077, "learning_rate": 9.653066255430338e-07, "loss": 0.0003, "num_tokens": 21703795.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5586049556732178, "sampling/importance_sampling_ratio/mean": 0.9992661476135254, "sampling/importance_sampling_ratio/min": 0.6118167638778687, "sampling/sampling_logp_difference/max": 0.49132251739501953, "sampling/sampling_logp_difference/mean": 0.01678563468158245, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1027.0, "completions/max_terminated_length": 1027.0, "completions/mean_length": 409.53125, "completions/mean_terminated_length": 409.53125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.511876106262207, "epoch": 0.6262254901960784, "frac_reward_zero_std": 0.5, "grad_norm": 0.6246223608266159, "kl": 0.019001290202140808, "learning_rate": 9.650454196437973e-07, "loss": 0.0052, "num_tokens": 21745509.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4583449363708496, "sampling/importance_sampling_ratio/mean": 1.0003107786178589, "sampling/importance_sampling_ratio/min": 0.6052066683769226, "sampling/sampling_logp_difference/max": 0.5021853446960449, "sampling/sampling_logp_difference/mean": 0.01391204260289669, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 452.3125, "completions/mean_terminated_length": 452.3125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.3959510624408722, "epoch": 0.6274509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 0.378868473240597, "kl": 0.016375333070755005, "learning_rate": 9.647832697318206e-07, "loss": -0.022, "num_tokens": 21794249.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.8459519147872925, "sampling/importance_sampling_ratio/mean": 1.0000736713409424, "sampling/importance_sampling_ratio/min": 0.5532311201095581, "sampling/sampling_logp_difference/max": 0.6129951477050781, "sampling/sampling_logp_difference/mean": 0.011700225993990898, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 422.328125, "completions/mean_terminated_length": 422.328125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.4188688397407532, "epoch": 0.6286764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.7762747807743928, "kl": 0.01593124121427536, "learning_rate": 9.645201763392513e-07, "loss": -0.0269, "num_tokens": 21839054.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4021224975585938, "sampling/importance_sampling_ratio/mean": 1.000046968460083, "sampling/importance_sampling_ratio/min": 0.6196773648262024, "sampling/sampling_logp_difference/max": 0.47855639457702637, "sampling/sampling_logp_difference/mean": 0.012365646660327911, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 279.796875, "completions/mean_terminated_length": 279.796875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.44530823826789856, "epoch": 0.6299019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.026986499491856265, "kl": 0.028601959347724915, "learning_rate": 9.64256140000152e-07, "loss": 0.0003, "num_tokens": 21875601.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4480172395706177, "sampling/importance_sampling_ratio/mean": 1.0000555515289307, "sampling/importance_sampling_ratio/min": 0.6960031986236572, "sampling/sampling_logp_difference/max": 0.3701951503753662, "sampling/sampling_logp_difference/mean": 0.014470323920249939, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 379.734375, "completions/mean_terminated_length": 379.734375, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.428574800491333, "epoch": 0.6311274509803921, "frac_reward_zero_std": 1.0, "grad_norm": 0.023097332275899152, "kl": 0.020401127636432648, "learning_rate": 9.639911612505003e-07, "loss": 0.0002, "num_tokens": 21920736.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4377331733703613, "sampling/importance_sampling_ratio/mean": 0.999815821647644, "sampling/importance_sampling_ratio/min": 0.6796164512634277, "sampling/sampling_logp_difference/max": 0.3862266540527344, "sampling/sampling_logp_difference/mean": 0.012874692678451538, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1075.0, "completions/max_terminated_length": 1075.0, "completions/mean_length": 372.078125, "completions/mean_terminated_length": 372.078125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.5065062642097473, "epoch": 0.6323529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.5369748495004193, "kl": 0.02640634775161743, "learning_rate": 9.63725240628186e-07, "loss": 0.0196, "num_tokens": 21962437.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000227689743042, "sampling/importance_sampling_ratio/min": 0.7082078456878662, "sampling/sampling_logp_difference/max": 0.7951180934906006, "sampling/sampling_logp_difference/mean": 0.014405781403183937, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 314.328125, "completions/mean_terminated_length": 314.328125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.5035575032234192, "epoch": 0.633578431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.02041856411111397, "kl": 0.021476712077856064, "learning_rate": 9.634583786730108e-07, "loss": 0.0002, "num_tokens": 22000842.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5469943284988403, "sampling/importance_sampling_ratio/mean": 0.9997060298919678, "sampling/importance_sampling_ratio/min": 0.6973562240600586, "sampling/sampling_logp_difference/max": 0.4363138675689697, "sampling/sampling_logp_difference/mean": 0.01439275685697794, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 338.234375, "completions/mean_terminated_length": 338.234375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.435118168592453, "epoch": 0.6348039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.019656064876507146, "kl": 0.023213863372802734, "learning_rate": 9.63190575926688e-07, "loss": 0.0002, "num_tokens": 22040489.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.969374656677246, "sampling/importance_sampling_ratio/mean": 0.999826192855835, "sampling/importance_sampling_ratio/min": 0.6166771650314331, "sampling/sampling_logp_difference/max": 0.6777160167694092, "sampling/sampling_logp_difference/mean": 0.013315270654857159, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1608.0, "completions/max_terminated_length": 1608.0, "completions/mean_length": 435.234375, "completions/mean_terminated_length": 435.234375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.5735545754432678, "epoch": 0.6360294117647058, "frac_reward_zero_std": 0.25, "grad_norm": 0.9047269501935339, "kl": 0.0185175109654665, "learning_rate": 9.6292183293284e-07, "loss": -0.0519, "num_tokens": 22087912.0, "reward": -0.28125, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": -0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.4295918941497803, "sampling/importance_sampling_ratio/mean": 1.000097632408142, "sampling/importance_sampling_ratio/min": 0.7661705613136292, "sampling/sampling_logp_difference/max": 0.357388973236084, "sampling/sampling_logp_difference/mean": 0.013957493007183075, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 423.71875, "completions/mean_terminated_length": 423.71875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.47148168087005615, "epoch": 0.6372549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 0.7006178595450491, "kl": 0.018485475331544876, "learning_rate": 9.626521502369983e-07, "loss": 0.0189, "num_tokens": 22128598.0, "reward": 0.40625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.654662847518921, "sampling/importance_sampling_ratio/mean": 1.0000654458999634, "sampling/importance_sampling_ratio/min": 0.549578070640564, "sampling/sampling_logp_difference/max": 0.5986044406890869, "sampling/sampling_logp_difference/mean": 0.01272359024733305, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 382.296875, "completions/mean_terminated_length": 382.296875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.484565794467926, "epoch": 0.6384803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 0.4693818397728542, "kl": 0.018903281539678574, "learning_rate": 9.623815283866015e-07, "loss": 0.0125, "num_tokens": 22169257.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.340274691581726, "sampling/importance_sampling_ratio/mean": 0.9999719262123108, "sampling/importance_sampling_ratio/min": 0.7161155343055725, "sampling/sampling_logp_difference/max": 0.33391380310058594, "sampling/sampling_logp_difference/mean": 0.01233886182308197, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 907.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 303.453125, "completions/mean_terminated_length": 303.453125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.5343384742736816, "epoch": 0.6397058823529411, "frac_reward_zero_std": 0.75, "grad_norm": 0.5133127622622239, "kl": 0.03019445203244686, "learning_rate": 9.621099679309946e-07, "loss": -0.039, "num_tokens": 22205222.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4702454805374146, "sampling/importance_sampling_ratio/mean": 0.9999204277992249, "sampling/importance_sampling_ratio/min": 0.7050918340682983, "sampling/sampling_logp_difference/max": 0.38542938232421875, "sampling/sampling_logp_difference/mean": 0.015871543437242508, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 307.5, "completions/mean_terminated_length": 307.5, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.4860917627811432, "epoch": 0.6409313725490197, "frac_reward_zero_std": 0.5, "grad_norm": 0.8712880417483513, "kl": 0.03121115081012249, "learning_rate": 9.618374694214285e-07, "loss": -0.0266, "num_tokens": 22238998.0, "reward": 0.25, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.4663472175598145, "sampling/importance_sampling_ratio/mean": 0.9999791383743286, "sampling/importance_sampling_ratio/min": 0.6546947956085205, "sampling/sampling_logp_difference/max": 0.4235861301422119, "sampling/sampling_logp_difference/mean": 0.013401854783296585, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 479.796875, "completions/mean_terminated_length": 479.796875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.588794469833374, "epoch": 0.6421568627450981, "frac_reward_zero_std": 0.0, "grad_norm": 0.9653181973611036, "kl": 0.018110765144228935, "learning_rate": 9.615640334110578e-07, "loss": -0.0082, "num_tokens": 22292057.0, "reward": -0.15625, "reward_std": 0.625, "rewards/decision_reward_func/mean": -0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.5898159742355347, "sampling/importance_sampling_ratio/mean": 0.9999878406524658, "sampling/importance_sampling_ratio/min": 0.6093446016311646, "sampling/sampling_logp_difference/max": 0.49537134170532227, "sampling/sampling_logp_difference/mean": 0.014567906968295574, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1166.0, "completions/max_terminated_length": 1166.0, "completions/mean_length": 446.078125, "completions/mean_terminated_length": 446.078125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.5163763761520386, "epoch": 0.6433823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 0.7994200314279192, "kl": 0.025508200749754906, "learning_rate": 9.612896604549401e-07, "loss": -0.0707, "num_tokens": 22335470.0, "reward": 0.0, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.554340124130249, "sampling/importance_sampling_ratio/mean": 1.0000594854354858, "sampling/importance_sampling_ratio/min": 0.69683837890625, "sampling/sampling_logp_difference/max": 0.44105100631713867, "sampling/sampling_logp_difference/mean": 0.013873875141143799, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 853.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 481.640625, "completions/mean_terminated_length": 481.640625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.48791414499282837, "epoch": 0.6446078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.024352429886521414, "kl": 0.026691341772675514, "learning_rate": 9.610143511100354e-07, "loss": 0.0003, "num_tokens": 22381175.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4352011680603027, "sampling/importance_sampling_ratio/mean": 1.000247597694397, "sampling/importance_sampling_ratio/min": 0.6208049058914185, "sampling/sampling_logp_difference/max": 0.47673845291137695, "sampling/sampling_logp_difference/mean": 0.013716398738324642, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 392.984375, "completions/mean_terminated_length": 392.984375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.5679036378860474, "epoch": 0.6458333333333334, "frac_reward_zero_std": 0.5, "grad_norm": 0.6495232241860538, "kl": 0.03857613354921341, "learning_rate": 9.607381059352038e-07, "loss": 0.0332, "num_tokens": 22426518.0, "reward": -0.25, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": -0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.550405740737915, "sampling/importance_sampling_ratio/mean": 1.0000765323638916, "sampling/importance_sampling_ratio/min": 0.6482210755348206, "sampling/sampling_logp_difference/max": 0.43851661682128906, "sampling/sampling_logp_difference/mean": 0.015368265099823475, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 486.265625, "completions/mean_terminated_length": 486.265625, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "entropy": 0.43796950578689575, "epoch": 0.6470588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.02039325301519705, "kl": 0.023471936583518982, "learning_rate": 9.60460925491206e-07, "loss": 0.0002, "num_tokens": 22478055.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4586842060089111, "sampling/importance_sampling_ratio/mean": 0.9999660849571228, "sampling/importance_sampling_ratio/min": 0.734658420085907, "sampling/sampling_logp_difference/max": 0.3775348663330078, "sampling/sampling_logp_difference/mean": 0.011706223711371422, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 804.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 348.140625, "completions/mean_terminated_length": 348.140625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.5358662605285645, "epoch": 0.6482843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.6136724466859764, "kl": 0.031138675287365913, "learning_rate": 9.601828103407004e-07, "loss": -0.0389, "num_tokens": 22521216.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.4432358741760254, "sampling/importance_sampling_ratio/mean": 1.0001251697540283, "sampling/importance_sampling_ratio/min": 0.677302360534668, "sampling/sampling_logp_difference/max": 0.38963747024536133, "sampling/sampling_logp_difference/mean": 0.014512967318296432, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 444.34375, "completions/mean_terminated_length": 444.34375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.4174489676952362, "epoch": 0.6495098039215687, "frac_reward_zero_std": 0.75, "grad_norm": 0.537307196644949, "kl": 0.03059738129377365, "learning_rate": 9.599037610482433e-07, "loss": 0.0485, "num_tokens": 22567462.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5640616416931152, "sampling/importance_sampling_ratio/mean": 0.9999585151672363, "sampling/importance_sampling_ratio/min": 0.5811601281166077, "sampling/sampling_logp_difference/max": 0.5427289009094238, "sampling/sampling_logp_difference/mean": 0.012129819951951504, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 360.15625, "completions/mean_terminated_length": 360.15625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.5003799200057983, "epoch": 0.6507352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.032774878939498424, "kl": 0.031570933759212494, "learning_rate": 9.59623778180287e-07, "loss": 0.0003, "num_tokens": 22608896.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2727454900741577, "sampling/importance_sampling_ratio/mean": 1.0001881122589111, "sampling/importance_sampling_ratio/min": 0.6459628343582153, "sampling/sampling_logp_difference/max": 0.4370133876800537, "sampling/sampling_logp_difference/mean": 0.014224998652935028, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1509.0, "completions/max_terminated_length": 1509.0, "completions/mean_length": 589.8125, "completions/mean_terminated_length": 589.8125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.4944005608558655, "epoch": 0.6519607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 0.4132792151066846, "kl": 0.027611345052719116, "learning_rate": 9.593428623051791e-07, "loss": -0.0163, "num_tokens": 22664596.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5880495309829712, "sampling/importance_sampling_ratio/mean": 1.0002539157867432, "sampling/importance_sampling_ratio/min": 0.6459794640541077, "sampling/sampling_logp_difference/max": 0.4625065326690674, "sampling/sampling_logp_difference/mean": 0.013256685808300972, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1092.0, "completions/max_terminated_length": 1092.0, "completions/mean_length": 410.46875, "completions/mean_terminated_length": 410.46875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.46653473377227783, "epoch": 0.6531862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.02071457308723315, "kl": 0.027000848203897476, "learning_rate": 9.59061013993161e-07, "loss": 0.0002, "num_tokens": 22712530.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5972484350204468, "sampling/importance_sampling_ratio/mean": 0.9996984004974365, "sampling/importance_sampling_ratio/min": 0.5396425724029541, "sampling/sampling_logp_difference/max": 0.6168482303619385, "sampling/sampling_logp_difference/mean": 0.013666179031133652, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 985.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 451.828125, "completions/mean_terminated_length": 451.828125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.4429158568382263, "epoch": 0.6544117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.6133920907728567, "kl": 0.026823662221431732, "learning_rate": 9.587782338163667e-07, "loss": -0.0368, "num_tokens": 22760423.0, "reward": 0.5, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2914811372756958, "sampling/importance_sampling_ratio/mean": 0.9999823570251465, "sampling/importance_sampling_ratio/min": 0.042425040155649185, "sampling/sampling_logp_difference/max": 3.1600165367126465, "sampling/sampling_logp_difference/mean": 0.012915397062897682, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1364.0, "completions/max_terminated_length": 1364.0, "completions/mean_length": 650.0, "completions/mean_terminated_length": 650.0, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "entropy": 0.5773513317108154, "epoch": 0.6556372549019608, "frac_reward_zero_std": 0.5, "grad_norm": 0.5637700472452856, "kl": 0.022749483585357666, "learning_rate": 9.584945223488226e-07, "loss": -0.0009, "num_tokens": 22822967.0, "reward": -0.1875, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.5252439975738525, "sampling/importance_sampling_ratio/mean": 1.0000197887420654, "sampling/importance_sampling_ratio/min": 0.7124374508857727, "sampling/sampling_logp_difference/max": 0.42215442657470703, "sampling/sampling_logp_difference/mean": 0.01395493745803833, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 468.8125, "completions/mean_terminated_length": 468.8125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.47090595960617065, "epoch": 0.6568627450980392, "frac_reward_zero_std": 1.0, "grad_norm": 0.019781311435643604, "kl": 0.029313715174794197, "learning_rate": 9.582098801664443e-07, "loss": 0.0003, "num_tokens": 22872395.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.341235637664795, "sampling/importance_sampling_ratio/mean": 1.0000468492507935, "sampling/importance_sampling_ratio/min": 0.7749277353286743, "sampling/sampling_logp_difference/max": 0.2935912609100342, "sampling/sampling_logp_difference/mean": 0.012354792095720768, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1207.0, "completions/max_terminated_length": 1207.0, "completions/mean_length": 498.421875, "completions/mean_terminated_length": 498.421875, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "entropy": 0.5697422027587891, "epoch": 0.6580882352941176, "frac_reward_zero_std": 0.75, "grad_norm": 0.60672799839657, "kl": 0.02526729740202427, "learning_rate": 9.579243078470378e-07, "loss": -0.0138, "num_tokens": 22925206.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4830189943313599, "sampling/importance_sampling_ratio/mean": 0.9995970129966736, "sampling/importance_sampling_ratio/min": 0.29544225335121155, "sampling/sampling_logp_difference/max": 1.2192819118499756, "sampling/sampling_logp_difference/mean": 0.01448576245456934, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1118.0, "completions/max_terminated_length": 1118.0, "completions/mean_length": 514.640625, "completions/mean_terminated_length": 514.640625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.6678169965744019, "epoch": 0.6593137254901961, "frac_reward_zero_std": 0.5, "grad_norm": 0.7112679055671031, "kl": 0.03352707624435425, "learning_rate": 9.576378059702968e-07, "loss": 0.0521, "num_tokens": 22979263.0, "reward": 0.71875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.497582197189331, "sampling/importance_sampling_ratio/mean": 0.9998127222061157, "sampling/importance_sampling_ratio/min": 0.6231463551521301, "sampling/sampling_logp_difference/max": 0.4729738235473633, "sampling/sampling_logp_difference/mean": 0.016786841675639153, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 954.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 487.578125, "completions/mean_terminated_length": 487.578125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "entropy": 0.5274693965911865, "epoch": 0.6605392156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.022746537350492847, "kl": 0.028436612337827682, "learning_rate": 9.573503751178018e-07, "loss": 0.0003, "num_tokens": 23029764.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4536162614822388, "sampling/importance_sampling_ratio/mean": 0.9998650550842285, "sampling/importance_sampling_ratio/min": 0.6229181885719299, "sampling/sampling_logp_difference/max": 0.4733400344848633, "sampling/sampling_logp_difference/mean": 0.014275242574512959, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1160.0, "completions/max_terminated_length": 1160.0, "completions/mean_length": 493.21875, "completions/mean_terminated_length": 493.21875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.6968632936477661, "epoch": 0.6617647058823529, "frac_reward_zero_std": 0.75, "grad_norm": 0.4286484830151705, "kl": 0.03280634433031082, "learning_rate": 9.570620158730194e-07, "loss": -0.0137, "num_tokens": 23088226.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.4631199836730957, "sampling/importance_sampling_ratio/mean": 0.9998626112937927, "sampling/importance_sampling_ratio/min": 0.6574424505233765, "sampling/sampling_logp_difference/max": 0.41939806938171387, "sampling/sampling_logp_difference/mean": 0.017662307247519493, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 460.71875, "completions/mean_terminated_length": 460.71875, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "entropy": 0.45224708318710327, "epoch": 0.6629901960784313, "frac_reward_zero_std": 1.0, "grad_norm": 0.02000327819838919, "kl": 0.027944475412368774, "learning_rate": 9.567727288213004e-07, "loss": 0.0003, "num_tokens": 23135184.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6251780986785889, "sampling/importance_sampling_ratio/mean": 1.0000793933868408, "sampling/importance_sampling_ratio/min": 0.5567717552185059, "sampling/sampling_logp_difference/max": 0.5855998992919922, "sampling/sampling_logp_difference/mean": 0.013088848441839218, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 416.578125, "completions/mean_terminated_length": 416.578125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.5076165199279785, "epoch": 0.6642156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.5316746965962581, "kl": 0.02737901732325554, "learning_rate": 9.564825145498793e-07, "loss": 0.001, "num_tokens": 23179621.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5070008039474487, "sampling/importance_sampling_ratio/mean": 1.0000522136688232, "sampling/importance_sampling_ratio/min": 0.6197693943977356, "sampling/sampling_logp_difference/max": 0.4784078598022461, "sampling/sampling_logp_difference/mean": 0.013599265366792679, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1462.0, "completions/max_terminated_length": 1462.0, "completions/mean_length": 678.703125, "completions/mean_terminated_length": 678.703125, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "entropy": 0.5846179723739624, "epoch": 0.6654411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 0.4799620769732908, "kl": 0.02414129674434662, "learning_rate": 9.561913736478728e-07, "loss": 0.0129, "num_tokens": 23243458.0, "reward": -0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4143550395965576, "sampling/importance_sampling_ratio/mean": 0.9999186992645264, "sampling/importance_sampling_ratio/min": 0.6329386830329895, "sampling/sampling_logp_difference/max": 0.4573817253112793, "sampling/sampling_logp_difference/mean": 0.014912068843841553, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1127.0, "completions/max_terminated_length": 1127.0, "completions/mean_length": 473.953125, "completions/mean_terminated_length": 473.953125, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "entropy": 0.4980081617832184, "epoch": 0.6666666666666666, "frac_reward_zero_std": 0.75, "grad_norm": 0.4389172509776869, "kl": 0.02681986428797245, "learning_rate": 9.558993067062784e-07, "loss": -0.012, "num_tokens": 23290799.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4630790948867798, "sampling/importance_sampling_ratio/mean": 1.0000104904174805, "sampling/importance_sampling_ratio/min": 0.745302677154541, "sampling/sampling_logp_difference/max": 0.38054323196411133, "sampling/sampling_logp_difference/mean": 0.013975268229842186, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1632.0, "completions/max_terminated_length": 1632.0, "completions/mean_length": 692.1875, "completions/mean_terminated_length": 692.1875, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "entropy": 0.4292387068271637, "epoch": 0.6678921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.4657892149544558, "kl": 0.020169369876384735, "learning_rate": 9.556063143179735e-07, "loss": 0.0008, "num_tokens": 23359051.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6610376834869385, "sampling/importance_sampling_ratio/mean": 1.0000238418579102, "sampling/importance_sampling_ratio/min": 0.3846254348754883, "sampling/sampling_logp_difference/max": 0.9554853439331055, "sampling/sampling_logp_difference/mean": 0.012021131813526154, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 868.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 470.640625, "completions/mean_terminated_length": 470.640625, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "entropy": 0.573645293712616, "epoch": 0.6691176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.044962818744495245, "kl": 0.041726116091012955, "learning_rate": 9.55312397077714e-07, "loss": 0.0004, "num_tokens": 23405764.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3000352382659912, "sampling/importance_sampling_ratio/mean": 0.9999016523361206, "sampling/importance_sampling_ratio/min": 0.6008990406990051, "sampling/sampling_logp_difference/max": 0.5093283653259277, "sampling/sampling_logp_difference/mean": 0.01602035015821457, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2340.0, "completions/max_terminated_length": 2340.0, "completions/mean_length": 659.609375, "completions/mean_terminated_length": 659.609375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.5088626742362976, "epoch": 0.6703431372549019, "frac_reward_zero_std": 0.5, "grad_norm": 0.5697736567003489, "kl": 0.0258655846118927, "learning_rate": 9.550175555821334e-07, "loss": 0.0735, "num_tokens": 23467387.0, "reward": 0.46875, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4982061386108398, "sampling/importance_sampling_ratio/mean": 0.99997478723526, "sampling/importance_sampling_ratio/min": 0.6496109366416931, "sampling/sampling_logp_difference/max": 0.4313817024230957, "sampling/sampling_logp_difference/mean": 0.013464387506246567, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1099.0, "completions/max_terminated_length": 1099.0, "completions/mean_length": 544.25, "completions/mean_terminated_length": 544.25, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "entropy": 0.5267872214317322, "epoch": 0.6715686274509803, "frac_reward_zero_std": 0.5, "grad_norm": 0.6759539936711337, "kl": 0.030149240046739578, "learning_rate": 9.547217904297409e-07, "loss": -0.0055, "num_tokens": 23520123.0, "reward": 0.34375, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.4010050296783447, "sampling/importance_sampling_ratio/mean": 0.9999127984046936, "sampling/importance_sampling_ratio/min": 0.6857926249504089, "sampling/sampling_logp_difference/max": 0.37717998027801514, "sampling/sampling_logp_difference/mean": 0.01345591340214014, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1480.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 557.90625, "completions/mean_terminated_length": 557.90625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.48522427678108215, "epoch": 0.6727941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.6925426103311577, "kl": 0.02468687668442726, "learning_rate": 9.544251022209216e-07, "loss": 0.0472, "num_tokens": 23579365.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.653448224067688, "sampling/importance_sampling_ratio/mean": 0.9999819397926331, "sampling/importance_sampling_ratio/min": 0.42172759771347046, "sampling/sampling_logp_difference/max": 0.8633956909179688, "sampling/sampling_logp_difference/mean": 0.013139636255800724, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 330.171875, "completions/mean_terminated_length": 330.171875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3912215828895569, "epoch": 0.6740196078431373, "frac_reward_zero_std": 0.75, "grad_norm": 0.6167295125488061, "kl": 0.031140143051743507, "learning_rate": 9.541274915579334e-07, "loss": -0.0037, "num_tokens": 23615872.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.59721839427948, "sampling/importance_sampling_ratio/mean": 1.0002222061157227, "sampling/importance_sampling_ratio/min": 0.6665425300598145, "sampling/sampling_logp_difference/max": 0.4682636260986328, "sampling/sampling_logp_difference/mean": 0.013033216819167137, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1454.0, "completions/max_terminated_length": 1454.0, "completions/mean_length": 573.21875, "completions/mean_terminated_length": 573.21875, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "entropy": 0.4008888602256775, "epoch": 0.6752450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.4551038365379589, "kl": 0.01710035651922226, "learning_rate": 9.538289590449071e-07, "loss": 0.0055, "num_tokens": 23672014.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.597145676612854, "sampling/importance_sampling_ratio/mean": 1.0000133514404297, "sampling/importance_sampling_ratio/min": 0.615695059299469, "sampling/sampling_logp_difference/max": 0.4850034713745117, "sampling/sampling_logp_difference/mean": 0.01148258987814188, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2096.0, "completions/max_terminated_length": 2096.0, "completions/mean_length": 611.828125, "completions/mean_terminated_length": 611.828125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.47205430269241333, "epoch": 0.6764705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.0179902254195983, "kl": 0.022859439253807068, "learning_rate": 9.535295052878449e-07, "loss": 0.0002, "num_tokens": 23731059.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.933349609375, "sampling/importance_sampling_ratio/mean": 1.0001842975616455, "sampling/importance_sampling_ratio/min": 0.0008360014762729406, "sampling/sampling_logp_difference/max": 7.086880207061768, "sampling/sampling_logp_difference/mean": 0.013203752227127552, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 334.265625, "completions/mean_terminated_length": 334.265625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.3833498954772949, "epoch": 0.6776960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.028107418493224283, "kl": 0.03332946449518204, "learning_rate": 9.53229130894619e-07, "loss": 0.0003, "num_tokens": 23768708.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5072814226150513, "sampling/importance_sampling_ratio/mean": 0.9997947812080383, "sampling/importance_sampling_ratio/min": 0.6106178760528564, "sampling/sampling_logp_difference/max": 0.4932839274406433, "sampling/sampling_logp_difference/mean": 0.012641314417123795, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 775.0, "completions/max_terminated_length": 775.0, "completions/mean_length": 470.34375, "completions/mean_terminated_length": 470.34375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.5440921783447266, "epoch": 0.678921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.024544440623341723, "kl": 0.028057586401700974, "learning_rate": 9.529278364749702e-07, "loss": 0.0003, "num_tokens": 23818858.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6629903316497803, "sampling/importance_sampling_ratio/mean": 1.0001051425933838, "sampling/importance_sampling_ratio/min": 0.6739862561225891, "sampling/sampling_logp_difference/max": 0.5086174011230469, "sampling/sampling_logp_difference/mean": 0.01569173112511635, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 427.25, "completions/mean_terminated_length": 427.25, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "entropy": 0.6031075716018677, "epoch": 0.6801470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.6010424549827459, "kl": 0.026670992374420166, "learning_rate": 9.526256226405073e-07, "loss": -0.011, "num_tokens": 23862714.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.51223623752594, "sampling/importance_sampling_ratio/mean": 0.9996472597122192, "sampling/importance_sampling_ratio/min": 0.504997730255127, "sampling/sampling_logp_difference/max": 0.6832013130187988, "sampling/sampling_logp_difference/mean": 0.015845855697989464, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 494.8125, "completions/mean_terminated_length": 494.8125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.4929550886154175, "epoch": 0.6813725490196079, "frac_reward_zero_std": 0.25, "grad_norm": 0.8632041561283043, "kl": 0.027730241417884827, "learning_rate": 9.523224900047051e-07, "loss": 0.0447, "num_tokens": 23908446.0, "reward": 0.15625, "reward_std": 0.519389271736145, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.6279528141021729, "sampling/importance_sampling_ratio/mean": 1.000073790550232, "sampling/importance_sampling_ratio/min": 0.6149647831916809, "sampling/sampling_logp_difference/max": 0.4873232841491699, "sampling/sampling_logp_difference/mean": 0.014279766008257866, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 472.65625, "completions/mean_terminated_length": 472.65625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.5506010055541992, "epoch": 0.6825980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.5095789944394084, "kl": 0.026844309642910957, "learning_rate": 9.520184391829036e-07, "loss": 0.0244, "num_tokens": 23959704.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.6001272201538086, "sampling/importance_sampling_ratio/mean": 0.999930739402771, "sampling/importance_sampling_ratio/min": 0.6790419816970825, "sampling/sampling_logp_difference/max": 0.4700831174850464, "sampling/sampling_logp_difference/mean": 0.015897564589977264, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1351.0, "completions/max_terminated_length": 1351.0, "completions/mean_length": 521.859375, "completions/mean_terminated_length": 521.859375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.6065237522125244, "epoch": 0.6838235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01795110552840217, "kl": 0.02287733368575573, "learning_rate": 9.517134707923069e-07, "loss": 0.0002, "num_tokens": 24013231.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4595006704330444, "sampling/importance_sampling_ratio/mean": 0.9999644756317139, "sampling/importance_sampling_ratio/min": 0.6155210733413696, "sampling/sampling_logp_difference/max": 0.4852861166000366, "sampling/sampling_logp_difference/mean": 0.01618819870054722, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 414.46875, "completions/mean_terminated_length": 414.46875, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "entropy": 0.40668171644210815, "epoch": 0.6850490196078431, "frac_reward_zero_std": 1.0, "grad_norm": 0.022679018452386557, "kl": 0.022866852581501007, "learning_rate": 9.514075854519813e-07, "loss": 0.0002, "num_tokens": 24056221.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6018248796463013, "sampling/importance_sampling_ratio/mean": 0.999752938747406, "sampling/importance_sampling_ratio/min": 0.5038568377494812, "sampling/sampling_logp_difference/max": 0.6854631900787354, "sampling/sampling_logp_difference/mean": 0.012811537832021713, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1353.0, "completions/max_terminated_length": 1353.0, "completions/mean_length": 467.78125, "completions/mean_terminated_length": 467.78125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.6265972852706909, "epoch": 0.6862745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 0.516930661700663, "kl": 0.029223240911960602, "learning_rate": 9.511007837828548e-07, "loss": -0.0015, "num_tokens": 24108287.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000124454498291, "sampling/importance_sampling_ratio/min": 0.6272248029708862, "sampling/sampling_logp_difference/max": 0.7253082990646362, "sampling/sampling_logp_difference/mean": 0.016647446900606155, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1744.0, "completions/max_terminated_length": 1744.0, "completions/mean_length": 603.0, "completions/mean_terminated_length": 603.0, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "entropy": 0.5804711580276489, "epoch": 0.6875, "frac_reward_zero_std": 1.0, "grad_norm": 0.01987701489021556, "kl": 0.02135775052011013, "learning_rate": 9.507930664077153e-07, "loss": 0.0002, "num_tokens": 24169839.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4414596557617188, "sampling/importance_sampling_ratio/mean": 0.9996535181999207, "sampling/importance_sampling_ratio/min": 0.44329196214675903, "sampling/sampling_logp_difference/max": 0.8135266304016113, "sampling/sampling_logp_difference/mean": 0.015939921140670776, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1135.0, "completions/max_terminated_length": 1135.0, "completions/mean_length": 552.96875, "completions/mean_terminated_length": 552.96875, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "entropy": 0.3961723744869232, "epoch": 0.6887254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 0.42038979894105083, "kl": 0.017000645399093628, "learning_rate": 9.504844339512094e-07, "loss": -0.0332, "num_tokens": 24224765.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000801086425781, "sampling/importance_sampling_ratio/min": 0.38583576679229736, "sampling/sampling_logp_difference/max": 0.9523434638977051, "sampling/sampling_logp_difference/mean": 0.010993542149662971, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1534.0, "completions/max_terminated_length": 1534.0, "completions/mean_length": 486.1875, "completions/mean_terminated_length": 486.1875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.6445464491844177, "epoch": 0.6899509803921569, "frac_reward_zero_std": 0.5, "grad_norm": 0.8746844787916703, "kl": 0.03163773566484451, "learning_rate": 9.501748870398419e-07, "loss": 0.0653, "num_tokens": 24273241.0, "reward": 0.59375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.4038053750991821, "sampling/importance_sampling_ratio/mean": 0.9998147487640381, "sampling/importance_sampling_ratio/min": 0.6158912181854248, "sampling/sampling_logp_difference/max": 0.48468494415283203, "sampling/sampling_logp_difference/mean": 0.016614338383078575, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 283.4375, "completions/mean_terminated_length": 283.4375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.36341845989227295, "epoch": 0.6911764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.022490602694597375, "kl": 0.02640126645565033, "learning_rate": 9.498644263019731e-07, "loss": 0.0003, "num_tokens": 24311173.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5619163513183594, "sampling/importance_sampling_ratio/mean": 1.0005894899368286, "sampling/importance_sampling_ratio/min": 0.4612080454826355, "sampling/sampling_logp_difference/max": 0.7739059925079346, "sampling/sampling_logp_difference/mean": 0.012057418003678322, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1051.0, "completions/max_terminated_length": 1051.0, "completions/mean_length": 544.640625, "completions/mean_terminated_length": 544.640625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.633879542350769, "epoch": 0.6924019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 0.6432431288217303, "kl": 0.023445455357432365, "learning_rate": 9.495530523678186e-07, "loss": -0.0227, "num_tokens": 24365630.0, "reward": -0.6875, "reward_std": 0.4787135720252991, "rewards/decision_reward_func/mean": -0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.4929879903793335, "sampling/importance_sampling_ratio/mean": 1.0000312328338623, "sampling/importance_sampling_ratio/min": 0.6734943985939026, "sampling/sampling_logp_difference/max": 0.40077948570251465, "sampling/sampling_logp_difference/mean": 0.015802565962076187, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 369.578125, "completions/mean_terminated_length": 369.578125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.634357213973999, "epoch": 0.6936274509803921, "frac_reward_zero_std": 0.5, "grad_norm": 0.8856868270573508, "kl": 0.03149256855249405, "learning_rate": 9.492407658694477e-07, "loss": -0.0273, "num_tokens": 24405171.0, "reward": 0.15625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.533385992050171, "sampling/importance_sampling_ratio/mean": 0.9999200105667114, "sampling/importance_sampling_ratio/min": 0.6590128540992737, "sampling/sampling_logp_difference/max": 0.4274783134460449, "sampling/sampling_logp_difference/mean": 0.017217829823493958, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2001.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 460.625, "completions/mean_terminated_length": 460.625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.5628646016120911, "epoch": 0.6948529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.41252476548861583, "kl": 0.02829194813966751, "learning_rate": 9.489275674407825e-07, "loss": 0.0069, "num_tokens": 24451435.0, "reward": -0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": -0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.60416841506958, "sampling/importance_sampling_ratio/mean": 1.0000630617141724, "sampling/importance_sampling_ratio/min": 0.5490093231201172, "sampling/sampling_logp_difference/max": 0.599639892578125, "sampling/sampling_logp_difference/mean": 0.015585632063448429, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 336.796875, "completions/mean_terminated_length": 336.796875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.4597318172454834, "epoch": 0.696078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.02637728301286931, "kl": 0.03490458428859711, "learning_rate": 9.486134577175957e-07, "loss": 0.0003, "num_tokens": 24487374.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.451920509338379, "sampling/importance_sampling_ratio/mean": 0.9999220371246338, "sampling/importance_sampling_ratio/min": 0.2373952865600586, "sampling/sampling_logp_difference/max": 1.4380286931991577, "sampling/sampling_logp_difference/mean": 0.014213833957910538, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1913.0, "completions/max_terminated_length": 1913.0, "completions/mean_length": 366.765625, "completions/mean_terminated_length": 366.765625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.5529665946960449, "epoch": 0.6973039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 0.6416460191710676, "kl": 0.03051997907459736, "learning_rate": 9.482984373375104e-07, "loss": -0.038, "num_tokens": 24530383.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3347262144088745, "sampling/importance_sampling_ratio/mean": 0.9998863339424133, "sampling/importance_sampling_ratio/min": 0.6386717557907104, "sampling/sampling_logp_difference/max": 0.4483647346496582, "sampling/sampling_logp_difference/mean": 0.01647963747382164, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 468.953125, "completions/mean_terminated_length": 468.953125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.642076849937439, "epoch": 0.6985294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.6544096027838541, "kl": 0.03085540607571602, "learning_rate": 9.479825069399977e-07, "loss": -0.0088, "num_tokens": 24578124.0, "reward": 0.1875, "reward_std": 0.42898139357566833, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.3816713094711304, "sampling/importance_sampling_ratio/mean": 0.9999761581420898, "sampling/importance_sampling_ratio/min": 0.618955135345459, "sampling/sampling_logp_difference/max": 0.4797224998474121, "sampling/sampling_logp_difference/mean": 0.016696695238351822, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1073.0, "completions/max_terminated_length": 1073.0, "completions/mean_length": 483.328125, "completions/mean_terminated_length": 483.328125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.4904835820198059, "epoch": 0.6997549019607843, "frac_reward_zero_std": 0.25, "grad_norm": 0.9141154258452941, "kl": 0.023998580873012543, "learning_rate": 9.476656671663766e-07, "loss": 0.0619, "num_tokens": 24631121.0, "reward": 0.21875, "reward_std": 0.565913200378418, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.64661705493927, "sampling/importance_sampling_ratio/mean": 0.9998743534088135, "sampling/importance_sampling_ratio/min": 0.3802279829978943, "sampling/sampling_logp_difference/max": 0.9669842720031738, "sampling/sampling_logp_difference/mean": 0.01298012025654316, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 839.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 387.375, "completions/mean_terminated_length": 387.375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.4660078287124634, "epoch": 0.7009803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 0.49611328624862844, "kl": 0.024422775954008102, "learning_rate": 9.473479186598114e-07, "loss": 0.0031, "num_tokens": 24672265.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.507544755935669, "sampling/importance_sampling_ratio/mean": 1.0001877546310425, "sampling/importance_sampling_ratio/min": 0.5382927656173706, "sampling/sampling_logp_difference/max": 0.6193526983261108, "sampling/sampling_logp_difference/mean": 0.014834181405603886, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 329.40625, "completions/mean_terminated_length": 329.40625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.39807915687561035, "epoch": 0.7022058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.01892665449606205, "kl": 0.023528946563601494, "learning_rate": 9.470292620653119e-07, "loss": 0.0002, "num_tokens": 24710339.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999773502349854, "sampling/importance_sampling_ratio/min": 0.48685958981513977, "sampling/sampling_logp_difference/max": 1.061455488204956, "sampling/sampling_logp_difference/mean": 0.012952408753335476, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 449.65625, "completions/mean_terminated_length": 449.65625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "entropy": 0.6724563241004944, "epoch": 0.7034313725490197, "frac_reward_zero_std": 0.0, "grad_norm": 0.9210283149522736, "kl": 0.02513079345226288, "learning_rate": 9.467096980297304e-07, "loss": -0.0211, "num_tokens": 24757181.0, "reward": 0.40625, "reward_std": 0.7517197132110596, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.487856149673462, "sampling/importance_sampling_ratio/mean": 1.0000216960906982, "sampling/importance_sampling_ratio/min": 0.6247434020042419, "sampling/sampling_logp_difference/max": 0.47041428089141846, "sampling/sampling_logp_difference/mean": 0.017210060730576515, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 451.96875, "completions/mean_terminated_length": 451.96875, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "entropy": 0.5142788290977478, "epoch": 0.7046568627450981, "frac_reward_zero_std": 0.25, "grad_norm": 0.8017810882767102, "kl": 0.021005287766456604, "learning_rate": 9.463892272017618e-07, "loss": -0.0263, "num_tokens": 24806395.0, "reward": 0.3125, "reward_std": 0.6116957664489746, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.3664398193359375, "sampling/importance_sampling_ratio/mean": 0.9999828934669495, "sampling/importance_sampling_ratio/min": 0.5309714078903198, "sampling/sampling_logp_difference/max": 0.6330471038818359, "sampling/sampling_logp_difference/mean": 0.014141961932182312, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1113.0, "completions/max_terminated_length": 1113.0, "completions/mean_length": 411.671875, "completions/mean_terminated_length": 411.671875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.6071540117263794, "epoch": 0.7058823529411765, "frac_reward_zero_std": 0.75, "grad_norm": 0.6841351406735178, "kl": 0.023821663111448288, "learning_rate": 9.460678502319416e-07, "loss": -0.045, "num_tokens": 24848598.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.5018765926361084, "sampling/importance_sampling_ratio/mean": 1.000018835067749, "sampling/importance_sampling_ratio/min": 0.6266156435012817, "sampling/sampling_logp_difference/max": 0.4674220085144043, "sampling/sampling_logp_difference/mean": 0.016420794650912285, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1298.0, "completions/max_terminated_length": 1298.0, "completions/mean_length": 522.890625, "completions/mean_terminated_length": 522.890625, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "entropy": 0.5901511311531067, "epoch": 0.7071078431372549, "frac_reward_zero_std": 0.25, "grad_norm": 1.0217864134295083, "kl": 0.01907465234398842, "learning_rate": 9.457455677726447e-07, "loss": -0.0538, "num_tokens": 24903215.0, "reward": 0.0625, "reward_std": 0.6267197132110596, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.8959153890609741, "sampling/importance_sampling_ratio/mean": 1.000115990638733, "sampling/importance_sampling_ratio/min": 0.5477979779243469, "sampling/sampling_logp_difference/max": 0.6397018432617188, "sampling/sampling_logp_difference/mean": 0.015500318259000778, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 411.0, "completions/mean_terminated_length": 411.0, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.6116553544998169, "epoch": 0.7083333333333334, "frac_reward_zero_std": 0.5, "grad_norm": 0.8137660101917906, "kl": 0.02387225814163685, "learning_rate": 9.454223804780841e-07, "loss": -0.0727, "num_tokens": 24947647.0, "reward": 0.40625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.3854886293411255, "sampling/importance_sampling_ratio/mean": 1.0002297163009644, "sampling/importance_sampling_ratio/min": 0.2894440293312073, "sampling/sampling_logp_difference/max": 1.2397934198379517, "sampling/sampling_logp_difference/mean": 0.016612499952316284, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1506.0, "completions/max_terminated_length": 1506.0, "completions/mean_length": 457.59375, "completions/mean_terminated_length": 457.59375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.578660249710083, "epoch": 0.7095588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.8434028152914462, "kl": 0.01985703594982624, "learning_rate": 9.450982890043094e-07, "loss": 0.0331, "num_tokens": 24997365.0, "reward": -0.125, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003247261047363, "sampling/importance_sampling_ratio/min": 0.6771700382232666, "sampling/sampling_logp_difference/max": 0.7049181461334229, "sampling/sampling_logp_difference/mean": 0.015717139467597008, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 431.796875, "completions/mean_terminated_length": 431.796875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.6208999752998352, "epoch": 0.7107843137254902, "frac_reward_zero_std": 0.5, "grad_norm": 0.7996784428851639, "kl": 0.028589271008968353, "learning_rate": 9.447732940092059e-07, "loss": 0.0271, "num_tokens": 25045704.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4356142282485962, "sampling/importance_sampling_ratio/mean": 0.9995872378349304, "sampling/importance_sampling_ratio/min": 0.538601279258728, "sampling/sampling_logp_difference/max": 0.618779718875885, "sampling/sampling_logp_difference/mean": 0.017343435436487198, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1301.0, "completions/max_terminated_length": 1301.0, "completions/mean_length": 479.96875, "completions/mean_terminated_length": 479.96875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.6466704607009888, "epoch": 0.7120098039215687, "frac_reward_zero_std": 0.25, "grad_norm": 0.969159115230111, "kl": 0.024524793028831482, "learning_rate": 9.444473961524927e-07, "loss": -0.0287, "num_tokens": 25105526.0, "reward": 0.28125, "reward_std": 0.5539814233779907, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.8561475276947021, "sampling/importance_sampling_ratio/mean": 1.0001704692840576, "sampling/importance_sampling_ratio/min": 0.4141624867916107, "sampling/sampling_logp_difference/max": 0.8814969062805176, "sampling/sampling_logp_difference/mean": 0.016886036843061447, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 388.953125, "completions/mean_terminated_length": 388.953125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.5574477910995483, "epoch": 0.7132352941176471, "frac_reward_zero_std": 0.25, "grad_norm": 0.9869796811924687, "kl": 0.027546720579266548, "learning_rate": 9.441205960957219e-07, "loss": 0.0205, "num_tokens": 25150291.0, "reward": 0.09375, "reward_std": 0.6487700343132019, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.4610545635223389, "sampling/importance_sampling_ratio/mean": 1.0000156164169312, "sampling/importance_sampling_ratio/min": 0.6501104235649109, "sampling/sampling_logp_difference/max": 0.43061304092407227, "sampling/sampling_logp_difference/mean": 0.015648266300559044, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 350.1875, "completions/mean_terminated_length": 350.1875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.4796811640262604, "epoch": 0.7144607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 0.6579907994685571, "kl": 0.023739352822303772, "learning_rate": 9.43792894502277e-07, "loss": 0.0451, "num_tokens": 25189151.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4481943845748901, "sampling/importance_sampling_ratio/mean": 1.0002245903015137, "sampling/importance_sampling_ratio/min": 0.4881986379623413, "sampling/sampling_logp_difference/max": 0.7170329093933105, "sampling/sampling_logp_difference/mean": 0.014905143529176712, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 874.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 496.3125, "completions/mean_terminated_length": 496.3125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.6207811236381531, "epoch": 0.7156862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.015684340604324706, "kl": 0.023293059319257736, "learning_rate": 9.434642920373713e-07, "loss": 0.0002, "num_tokens": 25242835.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5744351148605347, "sampling/importance_sampling_ratio/mean": 0.9996973872184753, "sampling/importance_sampling_ratio/min": 0.4993846118450165, "sampling/sampling_logp_difference/max": 0.6943787336349487, "sampling/sampling_logp_difference/mean": 0.01586994156241417, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1166.0, "completions/max_terminated_length": 1166.0, "completions/mean_length": 396.1875, "completions/mean_terminated_length": 396.1875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.6089230179786682, "epoch": 0.7169117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.8041732199749558, "kl": 0.03364190459251404, "learning_rate": 9.431347893680472e-07, "loss": -0.1401, "num_tokens": 25282975.0, "reward": 0.15625, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.7543818950653076, "sampling/importance_sampling_ratio/mean": 1.0000500679016113, "sampling/importance_sampling_ratio/min": 0.6764377355575562, "sampling/sampling_logp_difference/max": 0.5621166229248047, "sampling/sampling_logp_difference/mean": 0.017036225646734238, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 897.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 404.84375, "completions/mean_terminated_length": 404.84375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.3558861315250397, "epoch": 0.7181372549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.015856220376080726, "kl": 0.021082744002342224, "learning_rate": 9.428043871631739e-07, "loss": 0.0002, "num_tokens": 25324613.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4261430501937866, "sampling/importance_sampling_ratio/mean": 1.000077247619629, "sampling/importance_sampling_ratio/min": 0.614182710647583, "sampling/sampling_logp_difference/max": 0.48746275901794434, "sampling/sampling_logp_difference/mean": 0.011464684270322323, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1945.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 542.5, "completions/mean_terminated_length": 542.5, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.6306429505348206, "epoch": 0.7193627450980392, "frac_reward_zero_std": 0.25, "grad_norm": 0.9168255624315574, "kl": 0.025177374482154846, "learning_rate": 9.424730860934472e-07, "loss": 0.145, "num_tokens": 25382565.0, "reward": 0.375, "reward_std": 0.5765564441680908, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.4264792203903198, "sampling/importance_sampling_ratio/mean": 0.9998502135276794, "sampling/importance_sampling_ratio/min": 0.6253674030303955, "sampling/sampling_logp_difference/max": 0.4694160223007202, "sampling/sampling_logp_difference/mean": 0.017153173685073853, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 786.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 290.8125, "completions/mean_terminated_length": 290.8125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.6511598825454712, "epoch": 0.7205882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9272792972539258, "kl": 0.04600092023611069, "learning_rate": 9.421408868313873e-07, "loss": -0.0329, "num_tokens": 25415241.0, "reward": 0.3125, "reward_std": 0.42898139357566833, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.8095085620880127, "sampling/importance_sampling_ratio/mean": 0.9999561309814453, "sampling/importance_sampling_ratio/min": 0.7134637832641602, "sampling/sampling_logp_difference/max": 0.593055248260498, "sampling/sampling_logp_difference/mean": 0.01787680946290493, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1164.0, "completions/max_terminated_length": 1164.0, "completions/mean_length": 385.515625, "completions/mean_terminated_length": 385.515625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.5183377265930176, "epoch": 0.7218137254901961, "frac_reward_zero_std": 0.5, "grad_norm": 0.776349847534258, "kl": 0.0330752432346344, "learning_rate": 9.418077900513376e-07, "loss": 0.0162, "num_tokens": 25456698.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6078790426254272, "sampling/importance_sampling_ratio/mean": 1.0001682043075562, "sampling/importance_sampling_ratio/min": 0.6117579340934753, "sampling/sampling_logp_difference/max": 0.49141860008239746, "sampling/sampling_logp_difference/mean": 0.014833377674221992, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 412.59375, "completions/mean_terminated_length": 412.59375, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "entropy": 0.6833481192588806, "epoch": 0.7230392156862745, "frac_reward_zero_std": 0.0, "grad_norm": 1.1062392894559008, "kl": 0.02897200733423233, "learning_rate": 9.414737964294634e-07, "loss": 0.0209, "num_tokens": 25503904.0, "reward": 0.71875, "reward_std": 0.6901718378067017, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.5990822315216064, "sampling/importance_sampling_ratio/mean": 1.0000629425048828, "sampling/importance_sampling_ratio/min": 0.7300212979316711, "sampling/sampling_logp_difference/max": 0.4694298505783081, "sampling/sampling_logp_difference/mean": 0.018649209290742874, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1756.0, "completions/max_terminated_length": 1756.0, "completions/mean_length": 593.421875, "completions/mean_terminated_length": 593.421875, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "entropy": 0.5459839701652527, "epoch": 0.7242647058823529, "frac_reward_zero_std": 0.25, "grad_norm": 0.8573520298481547, "kl": 0.01998244598507881, "learning_rate": 9.411389066437507e-07, "loss": 0.0421, "num_tokens": 25563115.0, "reward": 0.3125, "reward_std": 0.7191373109817505, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.7949928045272827, "sampling/importance_sampling_ratio/mean": 1.0000786781311035, "sampling/importance_sampling_ratio/min": 0.23178201913833618, "sampling/sampling_logp_difference/max": 1.4619579315185547, "sampling/sampling_logp_difference/mean": 0.01466161198914051, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1045.0, "completions/max_terminated_length": 1045.0, "completions/mean_length": 427.265625, "completions/mean_terminated_length": 427.265625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.49822482466697693, "epoch": 0.7254901960784313, "frac_reward_zero_std": 0.5, "grad_norm": 0.7069616022069362, "kl": 0.024529246613383293, "learning_rate": 9.408031213740044e-07, "loss": 0.0172, "num_tokens": 25607372.0, "reward": 0.375, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6939125061035156, "sampling/importance_sampling_ratio/mean": 1.000091552734375, "sampling/importance_sampling_ratio/min": 0.6368913650512695, "sampling/sampling_logp_difference/max": 0.527040958404541, "sampling/sampling_logp_difference/mean": 0.014103810302913189, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1095.0, "completions/max_terminated_length": 1095.0, "completions/mean_length": 479.1875, "completions/mean_terminated_length": 479.1875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.6334912776947021, "epoch": 0.7267156862745098, "frac_reward_zero_std": 0.0, "grad_norm": 1.0126768928674257, "kl": 0.02536267414689064, "learning_rate": 9.404664413018476e-07, "loss": 0.0435, "num_tokens": 25659960.0, "reward": 0.09375, "reward_std": 0.6223389506340027, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.5295600891113281, "sampling/importance_sampling_ratio/mean": 0.9996810555458069, "sampling/importance_sampling_ratio/min": 0.5236582159996033, "sampling/sampling_logp_difference/max": 0.6469160318374634, "sampling/sampling_logp_difference/mean": 0.016735170036554337, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1730.0, "completions/max_terminated_length": 1730.0, "completions/mean_length": 541.90625, "completions/mean_terminated_length": 541.90625, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "entropy": 0.7290093898773193, "epoch": 0.7279411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 0.9555224175077176, "kl": 0.0285848006606102, "learning_rate": 9.401288671107193e-07, "loss": 0.019, "num_tokens": 25715010.0, "reward": -0.09375, "reward_std": 0.815913200378418, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.4083831310272217, "sampling/importance_sampling_ratio/mean": 0.9998269081115723, "sampling/importance_sampling_ratio/min": 0.5391208529472351, "sampling/sampling_logp_difference/max": 0.6178154945373535, "sampling/sampling_logp_difference/mean": 0.0185331292450428, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 394.59375, "completions/mean_terminated_length": 394.59375, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "entropy": 0.6354795694351196, "epoch": 0.7291666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.020503537412069107, "kl": 0.02672279067337513, "learning_rate": 9.397903994858735e-07, "loss": 0.0003, "num_tokens": 25760056.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3384490013122559, "sampling/importance_sampling_ratio/mean": 1.0001461505889893, "sampling/importance_sampling_ratio/min": 0.6401627063751221, "sampling/sampling_logp_difference/max": 0.4460330009460449, "sampling/sampling_logp_difference/mean": 0.01577531173825264, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1423.0, "completions/max_terminated_length": 1423.0, "completions/mean_length": 439.703125, "completions/mean_terminated_length": 439.703125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.569887101650238, "epoch": 0.7303921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 0.7346523221731704, "kl": 0.02642681635916233, "learning_rate": 9.394510391143786e-07, "loss": 0.0428, "num_tokens": 25803653.0, "reward": 0.75, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.460864782333374, "sampling/importance_sampling_ratio/mean": 1.0000001192092896, "sampling/importance_sampling_ratio/min": 0.7662999033927917, "sampling/sampling_logp_difference/max": 0.3790286183357239, "sampling/sampling_logp_difference/mean": 0.015113351866602898, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 837.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 530.375, "completions/mean_terminated_length": 530.375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.43117833137512207, "epoch": 0.7316176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.014802575627485735, "kl": 0.01772424578666687, "learning_rate": 9.391107866851142e-07, "loss": 0.0002, "num_tokens": 25870573.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4780479669570923, "sampling/importance_sampling_ratio/mean": 0.9999887347221375, "sampling/importance_sampling_ratio/min": 0.5096858739852905, "sampling/sampling_logp_difference/max": 0.6739606857299805, "sampling/sampling_logp_difference/mean": 0.012592419981956482, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1031.0, "completions/max_terminated_length": 1031.0, "completions/mean_length": 465.625, "completions/mean_terminated_length": 465.625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.6221768260002136, "epoch": 0.7328431372549019, "frac_reward_zero_std": 0.25, "grad_norm": 2.5994312858856756, "kl": 0.027061566710472107, "learning_rate": 9.387696428887715e-07, "loss": -0.0666, "num_tokens": 25914773.0, "reward": 0.0, "reward_std": 0.7054125070571899, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3827100992202759, "sampling/importance_sampling_ratio/mean": 0.9997532367706299, "sampling/importance_sampling_ratio/min": 0.6483331322669983, "sampling/sampling_logp_difference/max": 0.4333505630493164, "sampling/sampling_logp_difference/mean": 0.01656033843755722, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 390.984375, "completions/mean_terminated_length": 390.984375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.5310781002044678, "epoch": 0.7340686274509803, "frac_reward_zero_std": 0.5, "grad_norm": 0.8741711782790136, "kl": 0.03226190060377121, "learning_rate": 9.384276084178504e-07, "loss": 0.0051, "num_tokens": 25953988.0, "reward": 0.5625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000702142715454, "sampling/importance_sampling_ratio/min": 0.6893627047538757, "sampling/sampling_logp_difference/max": 0.7500560283660889, "sampling/sampling_logp_difference/mean": 0.016109298914670944, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 467.09375, "completions/mean_terminated_length": 467.09375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.6982120275497437, "epoch": 0.7352941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.7193298360900716, "kl": 0.02614929899573326, "learning_rate": 9.380846839666595e-07, "loss": 0.0071, "num_tokens": 26016714.0, "reward": 0.5, "reward_std": 0.5163977742195129, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5120320320129395, "sampling/importance_sampling_ratio/mean": 1.000007152557373, "sampling/importance_sampling_ratio/min": 0.6203175783157349, "sampling/sampling_logp_difference/max": 0.4775238037109375, "sampling/sampling_logp_difference/mean": 0.017695313319563866, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 435.46875, "completions/mean_terminated_length": 435.46875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.439250648021698, "epoch": 0.7365196078431373, "frac_reward_zero_std": 0.75, "grad_norm": 0.5069702145542417, "kl": 0.024352459236979485, "learning_rate": 9.377408702313136e-07, "loss": 0.0079, "num_tokens": 26060600.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.3940834999084473, "sampling/importance_sampling_ratio/mean": 1.0001263618469238, "sampling/importance_sampling_ratio/min": 0.637303352355957, "sampling/sampling_logp_difference/max": 0.45050954818725586, "sampling/sampling_logp_difference/mean": 0.013274111784994602, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 361.703125, "completions/mean_terminated_length": 361.703125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.49526888132095337, "epoch": 0.7377450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.023698859011630387, "kl": 0.032214947044849396, "learning_rate": 9.37396167909733e-07, "loss": 0.0003, "num_tokens": 26102501.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6007341146469116, "sampling/importance_sampling_ratio/mean": 0.9999775290489197, "sampling/importance_sampling_ratio/min": 0.6228374242782593, "sampling/sampling_logp_difference/max": 0.47346973419189453, "sampling/sampling_logp_difference/mean": 0.014259539544582367, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1064.0, "completions/max_terminated_length": 1064.0, "completions/mean_length": 423.265625, "completions/mean_terminated_length": 423.265625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.5377273559570312, "epoch": 0.7389705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.7725104454282505, "kl": 0.03061792068183422, "learning_rate": 9.370505777016413e-07, "loss": 0.0464, "num_tokens": 26145910.0, "reward": 0.5625, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.8683100938796997, "sampling/importance_sampling_ratio/mean": 0.9999960660934448, "sampling/importance_sampling_ratio/min": 0.6265191435813904, "sampling/sampling_logp_difference/max": 0.6250343322753906, "sampling/sampling_logp_difference/mean": 0.015744226053357124, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1643.0, "completions/max_terminated_length": 1643.0, "completions/mean_length": 434.828125, "completions/mean_terminated_length": 434.828125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.5037078857421875, "epoch": 0.7401960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.01721122105427505, "kl": 0.022013401612639427, "learning_rate": 9.367041003085648e-07, "loss": 0.0002, "num_tokens": 26192731.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5971465110778809, "sampling/importance_sampling_ratio/mean": 1.0002762079238892, "sampling/importance_sampling_ratio/min": 0.19096790254116058, "sampling/sampling_logp_difference/max": 1.6556499004364014, "sampling/sampling_logp_difference/mean": 0.015324097126722336, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 469.15625, "completions/mean_terminated_length": 469.15625, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "entropy": 0.5835174322128296, "epoch": 0.741421568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.5252351352629119, "kl": 0.021737253293395042, "learning_rate": 9.363567364338307e-07, "loss": -0.0053, "num_tokens": 26244581.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5873950719833374, "sampling/importance_sampling_ratio/mean": 1.0002954006195068, "sampling/importance_sampling_ratio/min": 0.6058905720710754, "sampling/sampling_logp_difference/max": 0.5010559558868408, "sampling/sampling_logp_difference/mean": 0.015284528955817223, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1049.0, "completions/max_terminated_length": 1049.0, "completions/mean_length": 403.578125, "completions/mean_terminated_length": 403.578125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.43514421582221985, "epoch": 0.7426470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.6144520737253122, "kl": 0.024056516587734222, "learning_rate": 9.360084867825658e-07, "loss": -0.0312, "num_tokens": 26290250.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000759363174438, "sampling/importance_sampling_ratio/min": 0.6254570484161377, "sampling/sampling_logp_difference/max": 0.7048544883728027, "sampling/sampling_logp_difference/mean": 0.014662097208201885, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 956.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 448.046875, "completions/mean_terminated_length": 448.046875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.46617454290390015, "epoch": 0.7438725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.01719503709020969, "kl": 0.021540995687246323, "learning_rate": 9.356593520616946e-07, "loss": 0.0002, "num_tokens": 26348301.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5587425231933594, "sampling/importance_sampling_ratio/mean": 1.0001598596572876, "sampling/importance_sampling_ratio/min": 0.6313877701759338, "sampling/sampling_logp_difference/max": 0.4598350524902344, "sampling/sampling_logp_difference/mean": 0.013453586958348751, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 905.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 390.71875, "completions/mean_terminated_length": 390.71875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.49433019757270813, "epoch": 0.7450980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.6207965454045192, "kl": 0.023923953995108604, "learning_rate": 9.353093329799386e-07, "loss": -0.0055, "num_tokens": 26387915.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4266630411148071, "sampling/importance_sampling_ratio/mean": 0.9997216463088989, "sampling/importance_sampling_ratio/min": 0.6482435464859009, "sampling/sampling_logp_difference/max": 0.4334888458251953, "sampling/sampling_logp_difference/mean": 0.01417953334748745, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 383.078125, "completions/mean_terminated_length": 383.078125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.4231610894203186, "epoch": 0.7463235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01707443859079594, "kl": 0.02121073752641678, "learning_rate": 9.349584302478144e-07, "loss": 0.0002, "num_tokens": 26432208.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6556415557861328, "sampling/importance_sampling_ratio/mean": 0.9999006986618042, "sampling/importance_sampling_ratio/min": 0.7017588019371033, "sampling/sampling_logp_difference/max": 0.5041885375976562, "sampling/sampling_logp_difference/mean": 0.012092387303709984, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 423.546875, "completions/mean_terminated_length": 423.546875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.4184129238128662, "epoch": 0.7475490196078431, "frac_reward_zero_std": 0.75, "grad_norm": 0.5472046595116093, "kl": 0.024645529687404633, "learning_rate": 9.346066445776321e-07, "loss": 0.0466, "num_tokens": 26477459.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.4728178977966309, "sampling/importance_sampling_ratio/mean": 0.9998929500579834, "sampling/importance_sampling_ratio/min": 0.6972441077232361, "sampling/sampling_logp_difference/max": 0.3871774673461914, "sampling/sampling_logp_difference/mean": 0.01188274659216404, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 874.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 471.421875, "completions/mean_terminated_length": 471.421875, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.47616058588027954, "epoch": 0.7487745098039216, "frac_reward_zero_std": 0.5, "grad_norm": 0.61595329450844, "kl": 0.023676306009292603, "learning_rate": 9.342539766834945e-07, "loss": -0.0013, "num_tokens": 26524510.0, "reward": 0.71875, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.3632348775863647, "sampling/importance_sampling_ratio/mean": 0.9998892545700073, "sampling/importance_sampling_ratio/min": 0.695529580116272, "sampling/sampling_logp_difference/max": 0.363081693649292, "sampling/sampling_logp_difference/mean": 0.013951239176094532, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1615.0, "completions/max_terminated_length": 1615.0, "completions/mean_length": 542.53125, "completions/mean_terminated_length": 542.53125, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "entropy": 0.4706161618232727, "epoch": 0.75, "frac_reward_zero_std": 0.75, "grad_norm": 0.34963376965970133, "kl": 0.026893895119428635, "learning_rate": 9.339004272812949e-07, "loss": 0.0067, "num_tokens": 26578880.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.3021268844604492, "sampling/importance_sampling_ratio/mean": 0.9998231530189514, "sampling/importance_sampling_ratio/min": 0.6922945380210876, "sampling/sampling_logp_difference/max": 0.36774373054504395, "sampling/sampling_logp_difference/mean": 0.013505726121366024, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 421.453125, "completions/mean_terminated_length": 421.453125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.4359239339828491, "epoch": 0.7512254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.02079437304722995, "kl": 0.021470699459314346, "learning_rate": 9.335459970887165e-07, "loss": 0.0002, "num_tokens": 26622717.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5975730419158936, "sampling/importance_sampling_ratio/mean": 1.0000839233398438, "sampling/importance_sampling_ratio/min": 0.6260515451431274, "sampling/sampling_logp_difference/max": 0.46848559379577637, "sampling/sampling_logp_difference/mean": 0.013946772553026676, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 403.203125, "completions/mean_terminated_length": 403.203125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.4341870844364166, "epoch": 0.7524509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.016551580926637454, "kl": 0.020923104137182236, "learning_rate": 9.331906868252299e-07, "loss": 0.0002, "num_tokens": 26669754.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.292677640914917, "sampling/importance_sampling_ratio/mean": 0.9998643398284912, "sampling/importance_sampling_ratio/min": 0.6992118954658508, "sampling/sampling_logp_difference/max": 0.3578014373779297, "sampling/sampling_logp_difference/mean": 0.013617651537060738, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1182.0, "completions/max_terminated_length": 1182.0, "completions/mean_length": 520.40625, "completions/mean_terminated_length": 520.40625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.5055110454559326, "epoch": 0.7536764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.4692936361885431, "kl": 0.02156968042254448, "learning_rate": 9.328344972120925e-07, "loss": 0.0144, "num_tokens": 26723460.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.546732783317566, "sampling/importance_sampling_ratio/mean": 1.000054121017456, "sampling/importance_sampling_ratio/min": 0.613470196723938, "sampling/sampling_logp_difference/max": 0.48862361907958984, "sampling/sampling_logp_difference/mean": 0.014120728708803654, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1688.0, "completions/max_terminated_length": 1688.0, "completions/mean_length": 497.734375, "completions/mean_terminated_length": 497.734375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.5403317213058472, "epoch": 0.7549019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 0.7535284049968056, "kl": 0.022725850343704224, "learning_rate": 9.324774289723467e-07, "loss": 0.0616, "num_tokens": 26776019.0, "reward": 0.5625, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999675989151001, "sampling/importance_sampling_ratio/min": 0.4156835377216339, "sampling/sampling_logp_difference/max": 0.91853928565979, "sampling/sampling_logp_difference/mean": 0.015114616602659225, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 404.671875, "completions/mean_terminated_length": 404.671875, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.40489697456359863, "epoch": 0.7561274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 0.4698882424098627, "kl": 0.019222069531679153, "learning_rate": 9.321194828308183e-07, "loss": -0.0193, "num_tokens": 26816574.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.45786452293396, "sampling/importance_sampling_ratio/mean": 0.999880313873291, "sampling/importance_sampling_ratio/min": 0.7135676145553589, "sampling/sampling_logp_difference/max": 0.37697267532348633, "sampling/sampling_logp_difference/mean": 0.012104982510209084, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 867.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 472.296875, "completions/mean_terminated_length": 472.296875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.5050849318504333, "epoch": 0.7573529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.639553959912802, "kl": 0.024261418730020523, "learning_rate": 9.317606595141155e-07, "loss": 0.0041, "num_tokens": 26864033.0, "reward": 0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.609838843345642, "sampling/importance_sampling_ratio/mean": 1.0003278255462646, "sampling/importance_sampling_ratio/min": 0.7049079537391663, "sampling/sampling_logp_difference/max": 0.4761340618133545, "sampling/sampling_logp_difference/mean": 0.015163315460085869, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1181.0, "completions/max_terminated_length": 1181.0, "completions/mean_length": 426.84375, "completions/mean_terminated_length": 426.84375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.5320031642913818, "epoch": 0.758578431372549, "frac_reward_zero_std": 0.5, "grad_norm": 0.8684802481925105, "kl": 0.026291808113455772, "learning_rate": 9.314009597506265e-07, "loss": -0.0417, "num_tokens": 26905303.0, "reward": 0.65625, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.9582428932189941, "sampling/importance_sampling_ratio/mean": 0.9999774098396301, "sampling/importance_sampling_ratio/min": 0.6254591941833496, "sampling/sampling_logp_difference/max": 0.6720476150512695, "sampling/sampling_logp_difference/mean": 0.015814159065485, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 424.515625, "completions/mean_terminated_length": 424.515625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.5607396364212036, "epoch": 0.7598039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 0.4298087818561554, "kl": 0.02431371435523033, "learning_rate": 9.310403842705194e-07, "loss": 0.0022, "num_tokens": 26948712.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5942869186401367, "sampling/importance_sampling_ratio/mean": 0.9998846054077148, "sampling/importance_sampling_ratio/min": 0.6074296236038208, "sampling/sampling_logp_difference/max": 0.4985189437866211, "sampling/sampling_logp_difference/mean": 0.015451867133378983, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3111.0, "completions/max_terminated_length": 3111.0, "completions/mean_length": 642.953125, "completions/mean_terminated_length": 642.953125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "entropy": 0.5139337182044983, "epoch": 0.7610294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.6506636949456971, "kl": 0.019846903160214424, "learning_rate": 9.306789338057393e-07, "loss": -0.1295, "num_tokens": 27010757.0, "reward": 0.46875, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.468320608139038, "sampling/importance_sampling_ratio/mean": 0.999975323677063, "sampling/importance_sampling_ratio/min": 0.6680107712745667, "sampling/sampling_logp_difference/max": 0.40345096588134766, "sampling/sampling_logp_difference/mean": 0.014169791713356972, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2522.0, "completions/max_terminated_length": 2522.0, "completions/mean_length": 520.78125, "completions/mean_terminated_length": 520.78125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.5305465459823608, "epoch": 0.7622549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.01655852471204441, "kl": 0.023109328001737595, "learning_rate": 9.303166090900081e-07, "loss": 0.0002, "num_tokens": 27058615.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.684673547744751, "sampling/importance_sampling_ratio/mean": 0.9999440312385559, "sampling/importance_sampling_ratio/min": 0.5676781535148621, "sampling/sampling_logp_difference/max": 0.5662006139755249, "sampling/sampling_logp_difference/mean": 0.015981923788785934, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1333.0, "completions/max_terminated_length": 1333.0, "completions/mean_length": 434.46875, "completions/mean_terminated_length": 434.46875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.47774145007133484, "epoch": 0.7634803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 0.5847553970542352, "kl": 0.021677838638424873, "learning_rate": 9.299534108588217e-07, "loss": 0.0036, "num_tokens": 27103941.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.42008376121521, "sampling/importance_sampling_ratio/mean": 0.9997971057891846, "sampling/importance_sampling_ratio/min": 0.7065232396125793, "sampling/sampling_logp_difference/max": 0.35071587562561035, "sampling/sampling_logp_difference/mean": 0.012862499803304672, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 475.25, "completions/mean_terminated_length": 475.25, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.41291379928588867, "epoch": 0.7647058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.01681552939425342, "kl": 0.02174215018749237, "learning_rate": 9.295893398494497e-07, "loss": 0.0002, "num_tokens": 27152981.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5053166151046753, "sampling/importance_sampling_ratio/mean": 1.0002691745758057, "sampling/importance_sampling_ratio/min": 0.6059513092041016, "sampling/sampling_logp_difference/max": 0.5009555816650391, "sampling/sampling_logp_difference/mean": 0.01228900533169508, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 415.625, "completions/mean_terminated_length": 415.625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.4880215525627136, "epoch": 0.7659313725490197, "frac_reward_zero_std": 1.0, "grad_norm": 0.0167892372312355, "kl": 0.023854173719882965, "learning_rate": 9.29224396800933e-07, "loss": 0.0002, "num_tokens": 27195709.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4539663791656494, "sampling/importance_sampling_ratio/mean": 0.9997464418411255, "sampling/importance_sampling_ratio/min": 0.44975149631500244, "sampling/sampling_logp_difference/max": 0.7990601062774658, "sampling/sampling_logp_difference/mean": 0.014614695683121681, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 349.140625, "completions/mean_terminated_length": 349.140625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.543928861618042, "epoch": 0.7671568627450981, "frac_reward_zero_std": 0.5, "grad_norm": 0.954722688878693, "kl": 0.029604492709040642, "learning_rate": 9.288585824540832e-07, "loss": -0.0664, "num_tokens": 27237318.0, "reward": 0.15625, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.7708712816238403, "sampling/importance_sampling_ratio/mean": 1.00002920627594, "sampling/importance_sampling_ratio/min": 0.5195263624191284, "sampling/sampling_logp_difference/max": 0.6548377275466919, "sampling/sampling_logp_difference/mean": 0.016109716147184372, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 336.265625, "completions/mean_terminated_length": 336.265625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.4065493941307068, "epoch": 0.7683823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.019128160342439828, "kl": 0.0276415403932333, "learning_rate": 9.284918975514797e-07, "loss": 0.0003, "num_tokens": 27273335.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5517747402191162, "sampling/importance_sampling_ratio/mean": 1.000075340270996, "sampling/importance_sampling_ratio/min": 0.6986286640167236, "sampling/sampling_logp_difference/max": 0.43939924240112305, "sampling/sampling_logp_difference/mean": 0.013617411255836487, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1272.0, "completions/max_terminated_length": 1272.0, "completions/mean_length": 557.34375, "completions/mean_terminated_length": 557.34375, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.5765098333358765, "epoch": 0.7696078431372549, "frac_reward_zero_std": 0.5, "grad_norm": 0.5426713339147488, "kl": 0.02759575843811035, "learning_rate": 9.281243428374701e-07, "loss": -0.0104, "num_tokens": 27322781.0, "reward": -0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": -0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.636060118675232, "sampling/importance_sampling_ratio/mean": 0.999933123588562, "sampling/importance_sampling_ratio/min": 0.16337381303310394, "sampling/sampling_logp_difference/max": 1.8117144107818604, "sampling/sampling_logp_difference/mean": 0.015431111678481102, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1545.0, "completions/max_terminated_length": 1545.0, "completions/mean_length": 558.484375, "completions/mean_terminated_length": 558.484375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.5055005550384521, "epoch": 0.7708333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0175859024016253, "kl": 0.023129941895604134, "learning_rate": 9.277559190581669e-07, "loss": 0.0002, "num_tokens": 27382940.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.656096339225769, "sampling/importance_sampling_ratio/mean": 0.9997800588607788, "sampling/importance_sampling_ratio/min": 0.474325031042099, "sampling/sampling_logp_difference/max": 0.7458624839782715, "sampling/sampling_logp_difference/mean": 0.014675969257950783, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 908.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 347.03125, "completions/mean_terminated_length": 347.03125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.6446160078048706, "epoch": 0.7720588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.0256200653640704, "kl": 0.037383630871772766, "learning_rate": 9.273866269614473e-07, "loss": -0.0205, "num_tokens": 27419966.0, "reward": 0.4375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5772995948791504, "sampling/importance_sampling_ratio/mean": 0.9996761083602905, "sampling/importance_sampling_ratio/min": 0.6709465980529785, "sampling/sampling_logp_difference/max": 0.45571422576904297, "sampling/sampling_logp_difference/mean": 0.01765836402773857, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1467.0, "completions/max_terminated_length": 1467.0, "completions/mean_length": 540.046875, "completions/mean_terminated_length": 540.046875, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "entropy": 0.6185691356658936, "epoch": 0.7732843137254902, "frac_reward_zero_std": 0.25, "grad_norm": 0.7634327395071042, "kl": 0.0294437687844038, "learning_rate": 9.270164672969507e-07, "loss": 0.0244, "num_tokens": 27467505.0, "reward": -0.09375, "reward_std": 0.5431214570999146, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002198219299316, "sampling/importance_sampling_ratio/min": 0.6056970953941345, "sampling/sampling_logp_difference/max": 0.7729051113128662, "sampling/sampling_logp_difference/mean": 0.016987737268209457, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 475.078125, "completions/mean_terminated_length": 475.078125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.5089071393013, "epoch": 0.7745098039215687, "frac_reward_zero_std": 1.0, "grad_norm": 0.020759376703139532, "kl": 0.02374081313610077, "learning_rate": 9.266454408160777e-07, "loss": 0.0002, "num_tokens": 27514534.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.361603021621704, "sampling/importance_sampling_ratio/mean": 1.0001943111419678, "sampling/importance_sampling_ratio/min": 0.2413155436515808, "sampling/sampling_logp_difference/max": 1.4216499328613281, "sampling/sampling_logp_difference/mean": 0.014605330303311348, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 280.609375, "completions/mean_terminated_length": 280.609375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.4312605559825897, "epoch": 0.7757352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.02624533955677278, "kl": 0.03609738498926163, "learning_rate": 9.262735482719887e-07, "loss": 0.0003, "num_tokens": 27546077.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5893055200576782, "sampling/importance_sampling_ratio/mean": 0.9997495412826538, "sampling/importance_sampling_ratio/min": 0.6623602509498596, "sampling/sampling_logp_difference/max": 0.46329712867736816, "sampling/sampling_logp_difference/mean": 0.0152990547940135, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 470.578125, "completions/mean_terminated_length": 470.578125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.43052971363067627, "epoch": 0.7769607843137255, "frac_reward_zero_std": 1.0, "grad_norm": 0.016600778852966087, "kl": 0.024250593036413193, "learning_rate": 9.259007904196021e-07, "loss": 0.0002, "num_tokens": 27594738.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000197410583496, "sampling/importance_sampling_ratio/min": 0.6483204364776611, "sampling/sampling_logp_difference/max": 1.1109819412231445, "sampling/sampling_logp_difference/mean": 0.013745343312621117, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 850.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 477.015625, "completions/mean_terminated_length": 477.015625, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "entropy": 0.6299978494644165, "epoch": 0.7781862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.5018350050472019, "kl": 0.02788667008280754, "learning_rate": 9.255271680155923e-07, "loss": 0.0027, "num_tokens": 27644803.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.398545742034912, "sampling/importance_sampling_ratio/mean": 1.0000615119934082, "sampling/importance_sampling_ratio/min": 0.26042139530181885, "sampling/sampling_logp_difference/max": 1.345454216003418, "sampling/sampling_logp_difference/mean": 0.016797080636024475, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 368.90625, "completions/mean_terminated_length": 368.90625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.5094464421272278, "epoch": 0.7794117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.02222921391650698, "kl": 0.02750411629676819, "learning_rate": 9.251526818183896e-07, "loss": 0.0003, "num_tokens": 27689629.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4359092712402344, "sampling/importance_sampling_ratio/mean": 1.0000011920928955, "sampling/importance_sampling_ratio/min": 0.7075722217559814, "sampling/sampling_logp_difference/max": 0.3617982864379883, "sampling/sampling_logp_difference/mean": 0.015413761138916016, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 363.1875, "completions/mean_terminated_length": 363.1875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.3835618495941162, "epoch": 0.7806372549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.017573581716050824, "kl": 0.021298378705978394, "learning_rate": 9.247773325881769e-07, "loss": 0.0002, "num_tokens": 27728409.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997230768203735, "sampling/importance_sampling_ratio/min": 0.6392403841018677, "sampling/sampling_logp_difference/max": 0.7237452268600464, "sampling/sampling_logp_difference/mean": 0.011871840804815292, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 403.53125, "completions/mean_terminated_length": 403.53125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.5360077023506165, "epoch": 0.7818627450980392, "frac_reward_zero_std": 0.75, "grad_norm": 0.5416777694475564, "kl": 0.029953764751553535, "learning_rate": 9.244011210868895e-07, "loss": 0.0311, "num_tokens": 27772235.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.4581115245819092, "sampling/importance_sampling_ratio/mean": 0.9997045993804932, "sampling/importance_sampling_ratio/min": 0.672667384147644, "sampling/sampling_logp_difference/max": 0.396504282951355, "sampling/sampling_logp_difference/mean": 0.016007032245397568, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 399.609375, "completions/mean_terminated_length": 399.609375, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.39221125841140747, "epoch": 0.7830882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.016188411532412482, "kl": 0.021412506699562073, "learning_rate": 9.240240480782129e-07, "loss": 0.0002, "num_tokens": 27814290.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.47077214717865, "sampling/importance_sampling_ratio/mean": 0.9997571706771851, "sampling/importance_sampling_ratio/min": 0.6298390030860901, "sampling/sampling_logp_difference/max": 0.46229100227355957, "sampling/sampling_logp_difference/mean": 0.012807202525436878, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1204.0, "completions/max_terminated_length": 1204.0, "completions/mean_length": 413.84375, "completions/mean_terminated_length": 413.84375, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "entropy": 0.5946134328842163, "epoch": 0.7843137254901961, "frac_reward_zero_std": 0.5, "grad_norm": 0.7653876186002535, "kl": 0.02765747904777527, "learning_rate": 9.236461143275815e-07, "loss": -0.0066, "num_tokens": 27860744.0, "reward": 0.65625, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.3632352352142334, "sampling/importance_sampling_ratio/mean": 1.0001475811004639, "sampling/importance_sampling_ratio/min": 0.7046020030975342, "sampling/sampling_logp_difference/max": 0.35012221336364746, "sampling/sampling_logp_difference/mean": 0.015973515808582306, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1112.0, "completions/max_terminated_length": 1112.0, "completions/mean_length": 429.84375, "completions/mean_terminated_length": 429.84375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.5665589570999146, "epoch": 0.7855392156862745, "frac_reward_zero_std": 0.75, "grad_norm": 0.5530157840536909, "kl": 0.029303185641765594, "learning_rate": 9.232673206021767e-07, "loss": -0.0028, "num_tokens": 27902638.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.5479487180709839, "sampling/importance_sampling_ratio/mean": 0.9999139904975891, "sampling/importance_sampling_ratio/min": 0.6193941831588745, "sampling/sampling_logp_difference/max": 0.47901344299316406, "sampling/sampling_logp_difference/mean": 0.016818411648273468, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 318.515625, "completions/mean_terminated_length": 318.515625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.41463521122932434, "epoch": 0.7867647058823529, "frac_reward_zero_std": 0.75, "grad_norm": 0.6569167646224405, "kl": 0.027141470462083817, "learning_rate": 9.228876676709259e-07, "loss": 0.0167, "num_tokens": 27938655.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.5017032623291016, "sampling/importance_sampling_ratio/mean": 1.0002378225326538, "sampling/importance_sampling_ratio/min": 0.6147668361663818, "sampling/sampling_logp_difference/max": 0.4865122437477112, "sampling/sampling_logp_difference/mean": 0.013212975114583969, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1174.0, "completions/max_terminated_length": 1174.0, "completions/mean_length": 454.421875, "completions/mean_terminated_length": 454.421875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.485168993473053, "epoch": 0.7879901960784313, "frac_reward_zero_std": 0.5, "grad_norm": 0.8253351839095092, "kl": 0.024212254211306572, "learning_rate": 9.225071563045006e-07, "loss": -0.0359, "num_tokens": 27982906.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001986026763916, "sampling/importance_sampling_ratio/min": 0.662236213684082, "sampling/sampling_logp_difference/max": 0.721818208694458, "sampling/sampling_logp_difference/mean": 0.014464938081800938, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 310.75, "completions/mean_terminated_length": 310.75, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.5021410584449768, "epoch": 0.7892156862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.02372217312753795, "kl": 0.030539492145180702, "learning_rate": 9.221257872753144e-07, "loss": 0.0003, "num_tokens": 28019674.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6963704824447632, "sampling/importance_sampling_ratio/mean": 0.9997937083244324, "sampling/importance_sampling_ratio/min": 0.6263241767883301, "sampling/sampling_logp_difference/max": 0.5284909009933472, "sampling/sampling_logp_difference/mean": 0.016404200345277786, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 310.453125, "completions/mean_terminated_length": 310.453125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.513056755065918, "epoch": 0.7904411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.025264392693258957, "kl": 0.03592563420534134, "learning_rate": 9.217435613575226e-07, "loss": 0.0003, "num_tokens": 28054775.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4591017961502075, "sampling/importance_sampling_ratio/mean": 1.0006626844406128, "sampling/importance_sampling_ratio/min": 0.5304657816886902, "sampling/sampling_logp_difference/max": 0.6339998245239258, "sampling/sampling_logp_difference/mean": 0.016930852085351944, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 338.140625, "completions/mean_terminated_length": 338.140625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3598152995109558, "epoch": 0.7916666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0182972294865589, "kl": 0.020780488848686218, "learning_rate": 9.213604793270196e-07, "loss": 0.0002, "num_tokens": 28091072.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6478437185287476, "sampling/importance_sampling_ratio/mean": 0.9997435808181763, "sampling/importance_sampling_ratio/min": 0.6209517121315002, "sampling/sampling_logp_difference/max": 0.4994676113128662, "sampling/sampling_logp_difference/mean": 0.012984920293092728, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 376.9375, "completions/mean_terminated_length": 376.9375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.5292526483535767, "epoch": 0.7928921568627451, "frac_reward_zero_std": 0.0, "grad_norm": 1.081993997521435, "kl": 0.031115485355257988, "learning_rate": 9.209765419614373e-07, "loss": -0.0478, "num_tokens": 28130108.0, "reward": 0.34375, "reward_std": 0.7366957664489746, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.4684175252914429, "sampling/importance_sampling_ratio/mean": 1.0000196695327759, "sampling/importance_sampling_ratio/min": 0.697420060634613, "sampling/sampling_logp_difference/max": 0.3841853141784668, "sampling/sampling_logp_difference/mean": 0.01550869457423687, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2735.0, "completions/max_terminated_length": 2735.0, "completions/mean_length": 606.453125, "completions/mean_terminated_length": 606.453125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.42076659202575684, "epoch": 0.7941176470588235, "frac_reward_zero_std": 0.75, "grad_norm": 0.3142339202072955, "kl": 0.017058253288269043, "learning_rate": 9.205917500401447e-07, "loss": 0.0153, "num_tokens": 28187529.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.929645299911499, "sampling/importance_sampling_ratio/mean": 0.9996740818023682, "sampling/importance_sampling_ratio/min": 0.30905482172966003, "sampling/sampling_logp_difference/max": 1.174236536026001, "sampling/sampling_logp_difference/mean": 0.012659020721912384, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 773.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 418.40625, "completions/mean_terminated_length": 418.40625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.4715825915336609, "epoch": 0.7953431372549019, "frac_reward_zero_std": 1.0, "grad_norm": 0.018048489389450393, "kl": 0.025506971403956413, "learning_rate": 9.202061043442447e-07, "loss": 0.0002, "num_tokens": 28229683.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5864918231964111, "sampling/importance_sampling_ratio/mean": 0.9997325539588928, "sampling/importance_sampling_ratio/min": 0.5106446146965027, "sampling/sampling_logp_difference/max": 0.672081470489502, "sampling/sampling_logp_difference/mean": 0.014682807959616184, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 355.890625, "completions/mean_terminated_length": 355.890625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.53687983751297, "epoch": 0.7965686274509803, "frac_reward_zero_std": 0.75, "grad_norm": 0.801439406025226, "kl": 0.02337462268769741, "learning_rate": 9.198196056565738e-07, "loss": -0.0316, "num_tokens": 28271372.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4061212539672852, "sampling/importance_sampling_ratio/mean": 1.0002796649932861, "sampling/importance_sampling_ratio/min": 0.49405932426452637, "sampling/sampling_logp_difference/max": 0.7050997018814087, "sampling/sampling_logp_difference/mean": 0.017136849462985992, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1196.0, "completions/max_terminated_length": 1196.0, "completions/mean_length": 422.359375, "completions/mean_terminated_length": 422.359375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.5068777799606323, "epoch": 0.7977941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.18715074456006353, "kl": 0.023373842239379883, "learning_rate": 9.194322547616997e-07, "loss": 0.0002, "num_tokens": 28316371.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000038504600525, "sampling/importance_sampling_ratio/min": 0.043391499668359756, "sampling/sampling_logp_difference/max": 3.1374917030334473, "sampling/sampling_logp_difference/mean": 0.014584937132894993, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 492.8125, "completions/mean_terminated_length": 492.8125, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "entropy": 0.5148261785507202, "epoch": 0.7990196078431373, "frac_reward_zero_std": 0.5, "grad_norm": 0.6382801734378155, "kl": 0.02428807131946087, "learning_rate": 9.190440524459202e-07, "loss": -0.042, "num_tokens": 28368679.0, "reward": 0.53125, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6173124313354492, "sampling/importance_sampling_ratio/mean": 1.000253677368164, "sampling/importance_sampling_ratio/min": 0.5521020889282227, "sampling/sampling_logp_difference/max": 0.594022274017334, "sampling/sampling_logp_difference/mean": 0.015186277218163013, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1614.0, "completions/max_terminated_length": 1614.0, "completions/mean_length": 516.15625, "completions/mean_terminated_length": 516.15625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "entropy": 0.37252548336982727, "epoch": 0.8002450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.013250116194190694, "kl": 0.0161866694688797, "learning_rate": 9.186549994972616e-07, "loss": 0.0002, "num_tokens": 28420945.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.573258399963379, "sampling/importance_sampling_ratio/mean": 1.0000747442245483, "sampling/importance_sampling_ratio/min": 0.4597857892513275, "sampling/sampling_logp_difference/max": 0.7769945859909058, "sampling/sampling_logp_difference/mean": 0.011466283351182938, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 433.5, "completions/mean_terminated_length": 433.5, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.5408629775047302, "epoch": 0.8014705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.7219374649651352, "kl": 0.0276799313724041, "learning_rate": 9.182650967054766e-07, "loss": 0.0162, "num_tokens": 28468321.0, "reward": 0.28125, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004653930664062, "sampling/importance_sampling_ratio/min": 0.6828696727752686, "sampling/sampling_logp_difference/max": 1.7619588375091553, "sampling/sampling_logp_difference/mean": 0.014974364079535007, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 880.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 412.46875, "completions/mean_terminated_length": 412.46875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.6268619894981384, "epoch": 0.8026960784313726, "frac_reward_zero_std": 0.0, "grad_norm": 1.068340965527355, "kl": 0.033283282071352005, "learning_rate": 9.178743448620431e-07, "loss": 0.0357, "num_tokens": 28512207.0, "reward": -0.03125, "reward_std": 0.7129635810852051, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6356161832809448, "sampling/importance_sampling_ratio/mean": 1.000330924987793, "sampling/importance_sampling_ratio/min": 0.40683212876319885, "sampling/sampling_logp_difference/max": 0.8993546962738037, "sampling/sampling_logp_difference/mean": 0.01794968545436859, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 896.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 376.59375, "completions/mean_terminated_length": 376.59375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.4603009819984436, "epoch": 0.803921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.5884100055530439, "kl": 0.022422660142183304, "learning_rate": 9.174827447601627e-07, "loss": 0.0049, "num_tokens": 28552309.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5920987129211426, "sampling/importance_sampling_ratio/mean": 1.0000126361846924, "sampling/importance_sampling_ratio/min": 0.5905491709709167, "sampling/sampling_logp_difference/max": 0.5267022848129272, "sampling/sampling_logp_difference/mean": 0.013919908553361893, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1486.0, "completions/max_terminated_length": 1486.0, "completions/mean_length": 473.5625, "completions/mean_terminated_length": 473.5625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.5914380550384521, "epoch": 0.8051470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.7823135780690114, "kl": 0.03117978200316429, "learning_rate": 9.170902971947588e-07, "loss": 0.066, "num_tokens": 28600537.0, "reward": 0.5, "reward_std": 0.4787135720252991, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6235054731369019, "sampling/importance_sampling_ratio/mean": 1.000133752822876, "sampling/importance_sampling_ratio/min": 0.691336452960968, "sampling/sampling_logp_difference/max": 0.4845876693725586, "sampling/sampling_logp_difference/mean": 0.017927473410964012, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 288.6875, "completions/mean_terminated_length": 288.6875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.4323112964630127, "epoch": 0.8063725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.02055169612713594, "kl": 0.021140070632100105, "learning_rate": 9.166970029624749e-07, "loss": 0.0002, "num_tokens": 28632965.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5971448421478271, "sampling/importance_sampling_ratio/mean": 1.000010371208191, "sampling/importance_sampling_ratio/min": 0.6017833352088928, "sampling/sampling_logp_difference/max": 0.5078577995300293, "sampling/sampling_logp_difference/mean": 0.014734002761542797, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 301.15625, "completions/mean_terminated_length": 301.15625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.3768772482872009, "epoch": 0.8075980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.018397651088844256, "kl": 0.02106613852083683, "learning_rate": 9.163028628616738e-07, "loss": 0.0002, "num_tokens": 28670415.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5003224611282349, "sampling/importance_sampling_ratio/mean": 0.9998708367347717, "sampling/importance_sampling_ratio/min": 0.6254510879516602, "sampling/sampling_logp_difference/max": 0.4692821502685547, "sampling/sampling_logp_difference/mean": 0.012717245146632195, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 430.34375, "completions/mean_terminated_length": 430.34375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.583654522895813, "epoch": 0.8088235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8278954790995049, "kl": 0.022226419299840927, "learning_rate": 9.159078776924345e-07, "loss": 0.0271, "num_tokens": 28715765.0, "reward": 0.53125, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6235759258270264, "sampling/importance_sampling_ratio/mean": 1.000198483467102, "sampling/importance_sampling_ratio/min": 0.6191027164459229, "sampling/sampling_logp_difference/max": 0.4846310615539551, "sampling/sampling_logp_difference/mean": 0.01733073964715004, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 370.4375, "completions/mean_terminated_length": 370.4375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.4277341961860657, "epoch": 0.8100490196078431, "frac_reward_zero_std": 0.75, "grad_norm": 0.6346711802611644, "kl": 0.021520165726542473, "learning_rate": 9.155120482565519e-07, "loss": -0.0308, "num_tokens": 28756865.0, "reward": -0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": -0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.3952908515930176, "sampling/importance_sampling_ratio/mean": 1.0001299381256104, "sampling/importance_sampling_ratio/min": 0.4438421130180359, "sampling/sampling_logp_difference/max": 0.812286376953125, "sampling/sampling_logp_difference/mean": 0.013074219226837158, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 296.046875, "completions/mean_terminated_length": 296.046875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.43035760521888733, "epoch": 0.8112745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 0.7404442164424665, "kl": 0.02301495149731636, "learning_rate": 9.15115375357535e-07, "loss": -0.0258, "num_tokens": 28790628.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4671577215194702, "sampling/importance_sampling_ratio/mean": 1.0001304149627686, "sampling/importance_sampling_ratio/min": 0.7685949206352234, "sampling/sampling_logp_difference/max": 0.38332700729370117, "sampling/sampling_logp_difference/mean": 0.014184681698679924, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 316.90625, "completions/mean_terminated_length": 316.90625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.4658503532409668, "epoch": 0.8125, "frac_reward_zero_std": 0.75, "grad_norm": 0.5731043884228366, "kl": 0.032778300344944, "learning_rate": 9.147178598006044e-07, "loss": -0.0137, "num_tokens": 28826910.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.3734588623046875, "sampling/importance_sampling_ratio/mean": 1.0000901222229004, "sampling/importance_sampling_ratio/min": 0.747981071472168, "sampling/sampling_logp_difference/max": 0.31733226776123047, "sampling/sampling_logp_difference/mean": 0.014142541214823723, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 279.765625, "completions/mean_terminated_length": 279.765625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.603342592716217, "epoch": 0.8137254901960784, "frac_reward_zero_std": 0.25, "grad_norm": 1.1595936453442754, "kl": 0.03173351660370827, "learning_rate": 9.143195023926917e-07, "loss": 0.0359, "num_tokens": 28859183.0, "reward": 0.3125, "reward_std": 0.5351393222808838, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.5570191144943237, "sampling/importance_sampling_ratio/mean": 1.0000560283660889, "sampling/importance_sampling_ratio/min": 0.5666036605834961, "sampling/sampling_logp_difference/max": 0.5680952072143555, "sampling/sampling_logp_difference/mean": 0.018185969442129135, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1061.0, "completions/max_terminated_length": 1061.0, "completions/mean_length": 396.6875, "completions/mean_terminated_length": 396.6875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.5798118710517883, "epoch": 0.8149509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 0.660880432083815, "kl": 0.019471071660518646, "learning_rate": 9.139203039424368e-07, "loss": -0.0092, "num_tokens": 28901851.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.5767513513565063, "sampling/importance_sampling_ratio/mean": 1.000427007675171, "sampling/importance_sampling_ratio/min": 0.664268970489502, "sampling/sampling_logp_difference/max": 0.4553666114807129, "sampling/sampling_logp_difference/mean": 0.01650768518447876, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 283.234375, "completions/mean_terminated_length": 283.234375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.6035599708557129, "epoch": 0.8161764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.095057744720858, "kl": 0.029612168669700623, "learning_rate": 9.135202652601876e-07, "loss": -0.0748, "num_tokens": 28935754.0, "reward": 0.28125, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000035047531128, "sampling/importance_sampling_ratio/min": 0.6551541686058044, "sampling/sampling_logp_difference/max": 0.6998968124389648, "sampling/sampling_logp_difference/mean": 0.018596667796373367, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 331.75, "completions/mean_terminated_length": 331.75, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.4460306167602539, "epoch": 0.8174019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 0.925557031667489, "kl": 0.02463204599916935, "learning_rate": 9.131193871579974e-07, "loss": -0.0384, "num_tokens": 28984826.0, "reward": 0.5, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8649508953094482, "sampling/importance_sampling_ratio/mean": 0.9999654293060303, "sampling/importance_sampling_ratio/min": 0.41387939453125, "sampling/sampling_logp_difference/max": 0.8821806907653809, "sampling/sampling_logp_difference/mean": 0.015177693217992783, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 296.34375, "completions/mean_terminated_length": 296.34375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.47225019335746765, "epoch": 0.8186274509803921, "frac_reward_zero_std": 0.5, "grad_norm": 0.9314315572832845, "kl": 0.027171503752470016, "learning_rate": 9.127176704496231e-07, "loss": 0.0151, "num_tokens": 29025024.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.799231767654419, "sampling/importance_sampling_ratio/mean": 1.0000457763671875, "sampling/importance_sampling_ratio/min": 0.649482786655426, "sampling/sampling_logp_difference/max": 0.5873597860336304, "sampling/sampling_logp_difference/mean": 0.015436802059412003, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 335.5, "completions/mean_terminated_length": 335.5, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.4785599112510681, "epoch": 0.8198529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8839923811877707, "kl": 0.03189854323863983, "learning_rate": 9.123151159505241e-07, "loss": -0.0285, "num_tokens": 29060208.0, "reward": -0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.364910364151001, "sampling/importance_sampling_ratio/mean": 1.00001060962677, "sampling/importance_sampling_ratio/min": 0.6416495442390442, "sampling/sampling_logp_difference/max": 0.4437129497528076, "sampling/sampling_logp_difference/mean": 0.01423981599509716, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 336.59375, "completions/mean_terminated_length": 336.59375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.47182199358940125, "epoch": 0.821078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.6375899902259867, "kl": 0.022385332733392715, "learning_rate": 9.119117244778607e-07, "loss": 0.0095, "num_tokens": 29101574.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6309034824371338, "sampling/importance_sampling_ratio/mean": 1.0002764463424683, "sampling/importance_sampling_ratio/min": 0.6167231798171997, "sampling/sampling_logp_difference/max": 0.48913419246673584, "sampling/sampling_logp_difference/mean": 0.014701426029205322, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 283.484375, "completions/mean_terminated_length": 283.484375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.47811123728752136, "epoch": 0.8223039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 0.7569794575929324, "kl": 0.021481478586792946, "learning_rate": 9.115074968504921e-07, "loss": 0.0274, "num_tokens": 29141349.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.8783122301101685, "sampling/importance_sampling_ratio/mean": 0.9996462464332581, "sampling/importance_sampling_ratio/min": 0.6267091631889343, "sampling/sampling_logp_difference/max": 0.6303735971450806, "sampling/sampling_logp_difference/mean": 0.015143398195505142, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 307.6875, "completions/mean_terminated_length": 307.6875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.43028807640075684, "epoch": 0.8235294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.019012903381597276, "kl": 0.02027730643749237, "learning_rate": 9.111024338889746e-07, "loss": 0.0002, "num_tokens": 29175185.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.524336576461792, "sampling/importance_sampling_ratio/mean": 0.9998748302459717, "sampling/importance_sampling_ratio/min": 0.6182277798652649, "sampling/sampling_logp_difference/max": 0.480898380279541, "sampling/sampling_logp_difference/mean": 0.014346718788146973, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 336.71875, "completions/mean_terminated_length": 336.71875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.4774381220340729, "epoch": 0.8247549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 0.8987339803036836, "kl": 0.019648464396595955, "learning_rate": 9.106965364155605e-07, "loss": -0.0367, "num_tokens": 29216047.0, "reward": 0.125, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.626283884048462, "sampling/importance_sampling_ratio/mean": 0.9999353885650635, "sampling/importance_sampling_ratio/min": 0.6622886657714844, "sampling/sampling_logp_difference/max": 0.486297607421875, "sampling/sampling_logp_difference/mean": 0.015515013597905636, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1262.0, "completions/max_terminated_length": 1262.0, "completions/mean_length": 322.625, "completions/mean_terminated_length": 322.625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.4955322742462158, "epoch": 0.8259803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 0.7249342862155759, "kl": 0.022652728483080864, "learning_rate": 9.102898052541957e-07, "loss": -0.0005, "num_tokens": 29257383.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.4399265050888062, "sampling/importance_sampling_ratio/mean": 1.0003396272659302, "sampling/importance_sampling_ratio/min": 0.6256870031356812, "sampling/sampling_logp_difference/max": 0.468904972076416, "sampling/sampling_logp_difference/mean": 0.01538359746336937, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 248.890625, "completions/mean_terminated_length": 248.890625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.37374693155288696, "epoch": 0.8272058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.022835147463738194, "kl": 0.026311898604035378, "learning_rate": 9.09882241230519e-07, "loss": 0.0002, "num_tokens": 29288672.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3667004108428955, "sampling/importance_sampling_ratio/mean": 1.0000147819519043, "sampling/importance_sampling_ratio/min": 0.47555723786354065, "sampling/sampling_logp_difference/max": 0.7432680130004883, "sampling/sampling_logp_difference/mean": 0.013161120936274529, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 405.03125, "completions/mean_terminated_length": 405.03125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.35625386238098145, "epoch": 0.8284313725490197, "frac_reward_zero_std": 1.0, "grad_norm": 0.015354626263078226, "kl": 0.015123963356018066, "learning_rate": 9.094738451718593e-07, "loss": 0.0001, "num_tokens": 29330658.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4129716157913208, "sampling/importance_sampling_ratio/mean": 0.9998320937156677, "sampling/importance_sampling_ratio/min": 0.6669929027557373, "sampling/sampling_logp_difference/max": 0.40497589111328125, "sampling/sampling_logp_difference/mean": 0.011878539808094501, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 280.96875, "completions/mean_terminated_length": 280.96875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.46380460262298584, "epoch": 0.8296568627450981, "frac_reward_zero_std": 0.75, "grad_norm": 0.7036670847199894, "kl": 0.026105191558599472, "learning_rate": 9.09064617907235e-07, "loss": -0.0199, "num_tokens": 29362768.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.295134425163269, "sampling/importance_sampling_ratio/mean": 1.0003639459609985, "sampling/importance_sampling_ratio/min": 0.6602379679679871, "sampling/sampling_logp_difference/max": 0.41515493392944336, "sampling/sampling_logp_difference/mean": 0.014787452295422554, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 283.3125, "completions/mean_terminated_length": 283.3125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.42619043588638306, "epoch": 0.8308823529411765, "frac_reward_zero_std": 0.75, "grad_norm": 0.8193224825009561, "kl": 0.02275916561484337, "learning_rate": 9.086545602673513e-07, "loss": -0.0121, "num_tokens": 29395684.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4821025133132935, "sampling/importance_sampling_ratio/mean": 1.0002894401550293, "sampling/importance_sampling_ratio/min": 0.6545161008834839, "sampling/sampling_logp_difference/max": 0.4238591194152832, "sampling/sampling_logp_difference/mean": 0.01420265156775713, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 389.0, "completions/mean_terminated_length": 389.0, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.4470488727092743, "epoch": 0.8321078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.47578380432605405, "kl": 0.018051322549581528, "learning_rate": 9.082436730845993e-07, "loss": 0.0047, "num_tokens": 29438356.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.3001091480255127, "sampling/importance_sampling_ratio/mean": 1.0004690885543823, "sampling/importance_sampling_ratio/min": 0.5788013935089111, "sampling/sampling_logp_difference/max": 0.5467958450317383, "sampling/sampling_logp_difference/mean": 0.013391317799687386, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 883.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 374.6875, "completions/mean_terminated_length": 374.6875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.4741774797439575, "epoch": 0.8333333333333334, "frac_reward_zero_std": 0.5, "grad_norm": 0.8768173413817039, "kl": 0.02072528935968876, "learning_rate": 9.07831957193054e-07, "loss": -0.0202, "num_tokens": 29484048.0, "reward": -0.3125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": -0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.437049150466919, "sampling/importance_sampling_ratio/mean": 1.000245451927185, "sampling/importance_sampling_ratio/min": 0.5516537427902222, "sampling/sampling_logp_difference/max": 0.5948346853256226, "sampling/sampling_logp_difference/mean": 0.013711447827517986, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 279.734375, "completions/mean_terminated_length": 279.734375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.5056754946708679, "epoch": 0.8345588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 0.9403074091847332, "kl": 0.02644583210349083, "learning_rate": 9.074194134284725e-07, "loss": -0.0035, "num_tokens": 29520447.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5747820138931274, "sampling/importance_sampling_ratio/mean": 1.0002191066741943, "sampling/importance_sampling_ratio/min": 0.6644045114517212, "sampling/sampling_logp_difference/max": 0.4541168212890625, "sampling/sampling_logp_difference/mean": 0.016629818826913834, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 332.71875, "completions/mean_terminated_length": 332.71875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.5383155345916748, "epoch": 0.8357843137254902, "frac_reward_zero_std": 0.5, "grad_norm": 0.9499255561928616, "kl": 0.02250475250184536, "learning_rate": 9.070060426282924e-07, "loss": -0.0283, "num_tokens": 29560829.0, "reward": 0.65625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.6540143489837646, "sampling/importance_sampling_ratio/mean": 1.0002341270446777, "sampling/importance_sampling_ratio/min": 0.6521726250648499, "sampling/sampling_logp_difference/max": 0.5032052993774414, "sampling/sampling_logp_difference/mean": 0.01608927734196186, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 358.125, "completions/mean_terminated_length": 358.125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.5889772772789001, "epoch": 0.8370098039215687, "frac_reward_zero_std": 0.5, "grad_norm": 0.8589078634702219, "kl": 0.019764846190810204, "learning_rate": 9.065918456316303e-07, "loss": -0.0031, "num_tokens": 29598805.0, "reward": 0.28125, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.5901957750320435, "sampling/importance_sampling_ratio/mean": 1.0004217624664307, "sampling/importance_sampling_ratio/min": 0.6515827178955078, "sampling/sampling_logp_difference/max": 0.46385717391967773, "sampling/sampling_logp_difference/mean": 0.01740207150578499, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 927.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 496.984375, "completions/mean_terminated_length": 496.984375, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.48557600378990173, "epoch": 0.8382352941176471, "frac_reward_zero_std": 0.25, "grad_norm": 0.6800959461204533, "kl": 0.01984775811433792, "learning_rate": 9.061768232792802e-07, "loss": 0.0472, "num_tokens": 29655876.0, "reward": -0.4375, "reward_std": 0.6393726468086243, "rewards/decision_reward_func/mean": -0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.826867938041687, "sampling/importance_sampling_ratio/mean": 1.0000208616256714, "sampling/importance_sampling_ratio/min": 0.5358911752700806, "sampling/sampling_logp_difference/max": 0.6238241195678711, "sampling/sampling_logp_difference/mean": 0.013095609843730927, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1216.0, "completions/max_terminated_length": 1216.0, "completions/mean_length": 439.34375, "completions/mean_terminated_length": 439.34375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.506422221660614, "epoch": 0.8394607843137255, "frac_reward_zero_std": 0.5, "grad_norm": 0.6404555863942724, "kl": 0.021289872005581856, "learning_rate": 9.057609764137109e-07, "loss": 0.0487, "num_tokens": 29703914.0, "reward": 0.09375, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.6602140665054321, "sampling/importance_sampling_ratio/mean": 1.0000650882720947, "sampling/importance_sampling_ratio/min": 0.6484804153442383, "sampling/sampling_logp_difference/max": 0.5069465637207031, "sampling/sampling_logp_difference/mean": 0.014684144407510757, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 954.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 436.296875, "completions/mean_terminated_length": 436.296875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.597631573677063, "epoch": 0.8406862745098039, "frac_reward_zero_std": 0.5, "grad_norm": 0.697022331531046, "kl": 0.024693887680768967, "learning_rate": 9.053443058790651e-07, "loss": -0.015, "num_tokens": 29751133.0, "reward": 0.0, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4346925020217896, "sampling/importance_sampling_ratio/mean": 1.0002212524414062, "sampling/importance_sampling_ratio/min": 0.3532833755016327, "sampling/sampling_logp_difference/max": 1.0404847860336304, "sampling/sampling_logp_difference/mean": 0.0165217537432909, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 364.203125, "completions/mean_terminated_length": 364.203125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.5378164649009705, "epoch": 0.8419117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 0.6605426859353407, "kl": 0.02419346570968628, "learning_rate": 9.049268125211575e-07, "loss": -0.0048, "num_tokens": 29791722.0, "reward": 0.4375, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6049288511276245, "sampling/importance_sampling_ratio/mean": 1.0004515647888184, "sampling/importance_sampling_ratio/min": 0.5943827033042908, "sampling/sampling_logp_difference/max": 0.5202319622039795, "sampling/sampling_logp_difference/mean": 0.014875588938593864, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 334.40625, "completions/mean_terminated_length": 334.40625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.2714303135871887, "epoch": 0.8431372549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.01604506953570414, "kl": 0.022675104439258575, "learning_rate": 9.045084971874737e-07, "loss": 0.0002, "num_tokens": 29830452.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9376097917556763, "sampling/importance_sampling_ratio/mean": 0.9998912215232849, "sampling/importance_sampling_ratio/min": 0.6466982364654541, "sampling/sampling_logp_difference/max": 0.6614551544189453, "sampling/sampling_logp_difference/mean": 0.011525522917509079, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1577.0, "completions/max_terminated_length": 1577.0, "completions/mean_length": 542.21875, "completions/mean_terminated_length": 542.21875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.44198617339134216, "epoch": 0.8443627450980392, "frac_reward_zero_std": 0.5, "grad_norm": 0.628688558260542, "kl": 0.019932780414819717, "learning_rate": 9.040893607271668e-07, "loss": 0.0055, "num_tokens": 29890050.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.3620685338974, "sampling/importance_sampling_ratio/mean": 0.9997965693473816, "sampling/importance_sampling_ratio/min": 0.48103955388069153, "sampling/sampling_logp_difference/max": 0.7318058013916016, "sampling/sampling_logp_difference/mean": 0.01346408762037754, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1504.0, "completions/max_terminated_length": 1504.0, "completions/mean_length": 368.859375, "completions/mean_terminated_length": 368.859375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.4946761727333069, "epoch": 0.8455882352941176, "frac_reward_zero_std": 0.75, "grad_norm": 0.6177159924749556, "kl": 0.022021617740392685, "learning_rate": 9.036694039910576e-07, "loss": -0.0313, "num_tokens": 29929641.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.77708101272583, "sampling/importance_sampling_ratio/mean": 1.0001999139785767, "sampling/importance_sampling_ratio/min": 0.608414351940155, "sampling/sampling_logp_difference/max": 0.5749721527099609, "sampling/sampling_logp_difference/mean": 0.01626063510775566, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1060.0, "completions/max_terminated_length": 1060.0, "completions/mean_length": 352.453125, "completions/mean_terminated_length": 352.453125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.5227857232093811, "epoch": 0.8468137254901961, "frac_reward_zero_std": 0.5, "grad_norm": 0.9370930102246604, "kl": 0.027432329952716827, "learning_rate": 9.032486278316313e-07, "loss": 0.0622, "num_tokens": 29969910.0, "reward": -0.1875, "reward_std": 0.42898139357566833, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.5680373907089233, "sampling/importance_sampling_ratio/mean": 1.0002317428588867, "sampling/importance_sampling_ratio/min": 0.6631356477737427, "sampling/sampling_logp_difference/max": 0.44982481002807617, "sampling/sampling_logp_difference/mean": 0.015638258308172226, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 347.359375, "completions/mean_terminated_length": 347.359375, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.49333685636520386, "epoch": 0.8480392156862745, "frac_reward_zero_std": 0.75, "grad_norm": 0.6039575585933364, "kl": 0.022086523473262787, "learning_rate": 9.028270331030372e-07, "loss": 0.0139, "num_tokens": 30010237.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6288621425628662, "sampling/importance_sampling_ratio/mean": 1.000130534172058, "sampling/importance_sampling_ratio/min": 0.6147577166557312, "sampling/sampling_logp_difference/max": 0.4878816604614258, "sampling/sampling_logp_difference/mean": 0.015523107722401619, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 332.265625, "completions/mean_terminated_length": 332.265625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.5326172113418579, "epoch": 0.8492647058823529, "frac_reward_zero_std": 0.75, "grad_norm": 0.654505222023555, "kl": 0.027355201542377472, "learning_rate": 9.024046206610857e-07, "loss": -0.0083, "num_tokens": 30050414.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5234898328781128, "sampling/importance_sampling_ratio/mean": 0.9999322891235352, "sampling/importance_sampling_ratio/min": 0.6160730123519897, "sampling/sampling_logp_difference/max": 0.4843897819519043, "sampling/sampling_logp_difference/mean": 0.01627400517463684, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 364.203125, "completions/mean_terminated_length": 364.203125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.42574116587638855, "epoch": 0.8504901960784313, "frac_reward_zero_std": 0.75, "grad_norm": 0.5541295707224719, "kl": 0.019531676545739174, "learning_rate": 9.019813913632475e-07, "loss": 0.0127, "num_tokens": 30091611.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4520457983016968, "sampling/importance_sampling_ratio/mean": 0.999841570854187, "sampling/importance_sampling_ratio/min": 0.6497071385383606, "sampling/sampling_logp_difference/max": 0.43123364448547363, "sampling/sampling_logp_difference/mean": 0.013972226530313492, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 335.21875, "completions/mean_terminated_length": 335.21875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.48542913794517517, "epoch": 0.8517156862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.016641284653607497, "kl": 0.02430626004934311, "learning_rate": 9.015573460686509e-07, "loss": 0.0002, "num_tokens": 30129609.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6310129165649414, "sampling/importance_sampling_ratio/mean": 1.0003182888031006, "sampling/importance_sampling_ratio/min": 0.6675378084182739, "sampling/sampling_logp_difference/max": 0.48920130729675293, "sampling/sampling_logp_difference/mean": 0.016119666397571564, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 315.484375, "completions/mean_terminated_length": 315.484375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.5227433443069458, "epoch": 0.8529411764705882, "frac_reward_zero_std": 0.75, "grad_norm": 0.8459355049355585, "kl": 0.028807630762457848, "learning_rate": 9.011324856380813e-07, "loss": -0.0444, "num_tokens": 30166744.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.5670193433761597, "sampling/importance_sampling_ratio/mean": 0.9999303817749023, "sampling/importance_sampling_ratio/min": 0.7226687669754028, "sampling/sampling_logp_difference/max": 0.4491753578186035, "sampling/sampling_logp_difference/mean": 0.016579223796725273, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 525.125, "completions/mean_terminated_length": 525.125, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "entropy": 0.47968608140945435, "epoch": 0.8541666666666666, "frac_reward_zero_std": 0.75, "grad_norm": 0.42001730658680797, "kl": 0.014978294260799885, "learning_rate": 9.007068109339783e-07, "loss": -0.0306, "num_tokens": 30219344.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5629929304122925, "sampling/importance_sampling_ratio/mean": 1.0000066757202148, "sampling/importance_sampling_ratio/min": 0.5694761872291565, "sampling/sampling_logp_difference/max": 0.5630382299423218, "sampling/sampling_logp_difference/mean": 0.013584841974079609, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 821.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 376.4375, "completions/mean_terminated_length": 376.4375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.4292042553424835, "epoch": 0.8553921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.015785184935149944, "kl": 0.02235559932887554, "learning_rate": 9.002803228204348e-07, "loss": 0.0002, "num_tokens": 30262876.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4184337854385376, "sampling/importance_sampling_ratio/mean": 1.0002660751342773, "sampling/importance_sampling_ratio/min": 0.6771603226661682, "sampling/sampling_logp_difference/max": 0.3898472785949707, "sampling/sampling_logp_difference/mean": 0.012922734022140503, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 360.78125, "completions/mean_terminated_length": 360.78125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.4596289396286011, "epoch": 0.8566176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.01993135889655386, "kl": 0.022292353212833405, "learning_rate": 8.998530221631941e-07, "loss": 0.0002, "num_tokens": 30306014.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6595524549484253, "sampling/importance_sampling_ratio/mean": 1.0004026889801025, "sampling/importance_sampling_ratio/min": 0.6185416579246521, "sampling/sampling_logp_difference/max": 0.5065479278564453, "sampling/sampling_logp_difference/mean": 0.014771121554076672, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1213.0, "completions/max_terminated_length": 1213.0, "completions/mean_length": 425.765625, "completions/mean_terminated_length": 425.765625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.4893128275871277, "epoch": 0.8578431372549019, "frac_reward_zero_std": 0.5, "grad_norm": 0.7898373956626724, "kl": 0.02412707544863224, "learning_rate": 8.994249098296502e-07, "loss": 0.0152, "num_tokens": 30350959.0, "reward": 0.0625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.475465178489685, "sampling/importance_sampling_ratio/mean": 0.9999678134918213, "sampling/importance_sampling_ratio/min": 0.640938937664032, "sampling/sampling_logp_difference/max": 0.4448211193084717, "sampling/sampling_logp_difference/mean": 0.01452726125717163, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 474.828125, "completions/mean_terminated_length": 474.828125, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.4295782148838043, "epoch": 0.8590686274509803, "frac_reward_zero_std": 0.75, "grad_norm": 0.5112253522604674, "kl": 0.018711410462856293, "learning_rate": 8.989959866888437e-07, "loss": 0.0262, "num_tokens": 30401012.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4633970260620117, "sampling/importance_sampling_ratio/mean": 0.9999186992645264, "sampling/importance_sampling_ratio/min": 0.7287550568580627, "sampling/sampling_logp_difference/max": 0.38076043128967285, "sampling/sampling_logp_difference/mean": 0.013084682635962963, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 310.328125, "completions/mean_terminated_length": 310.328125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.39822226762771606, "epoch": 0.8602941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.017408720154562747, "kl": 0.023206371814012527, "learning_rate": 8.985662536114612e-07, "loss": 0.0002, "num_tokens": 30440393.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.404929757118225, "sampling/importance_sampling_ratio/mean": 0.9996552467346191, "sampling/importance_sampling_ratio/min": 0.6310294270515442, "sampling/sampling_logp_difference/max": 0.4604027271270752, "sampling/sampling_logp_difference/mean": 0.014070119708776474, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1197.0, "completions/max_terminated_length": 1197.0, "completions/mean_length": 394.5625, "completions/mean_terminated_length": 394.5625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.37109243869781494, "epoch": 0.8615196078431373, "frac_reward_zero_std": 0.75, "grad_norm": 0.4790825359017458, "kl": 0.020038090646266937, "learning_rate": 8.981357114698338e-07, "loss": 0.0136, "num_tokens": 30487533.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0007641315460205, "sampling/importance_sampling_ratio/min": 0.609726071357727, "sampling/sampling_logp_difference/max": 0.9975347518920898, "sampling/sampling_logp_difference/mean": 0.012069333344697952, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 905.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 365.09375, "completions/mean_terminated_length": 365.09375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.43533891439437866, "epoch": 0.8627450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.028945125303351173, "kl": 0.01868702843785286, "learning_rate": 8.977043611379349e-07, "loss": 0.0002, "num_tokens": 30528083.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4892568588256836, "sampling/importance_sampling_ratio/mean": 1.0000648498535156, "sampling/importance_sampling_ratio/min": 0.5531142950057983, "sampling/sampling_logp_difference/max": 0.592190682888031, "sampling/sampling_logp_difference/mean": 0.013963067904114723, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 408.21875, "completions/mean_terminated_length": 408.21875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.43073317408561707, "epoch": 0.8639705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.013166446499770931, "kl": 0.01549526397138834, "learning_rate": 8.972722034913781e-07, "loss": 0.0002, "num_tokens": 30577841.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5288033485412598, "sampling/importance_sampling_ratio/mean": 1.0001356601715088, "sampling/importance_sampling_ratio/min": 0.6794670820236206, "sampling/sampling_logp_difference/max": 0.42448532581329346, "sampling/sampling_logp_difference/mean": 0.013769428245723248, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 340.09375, "completions/mean_terminated_length": 340.09375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.5550956726074219, "epoch": 0.8651960784313726, "frac_reward_zero_std": 0.5, "grad_norm": 0.7431898001991829, "kl": 0.032094866037368774, "learning_rate": 8.968392394074163e-07, "loss": -0.0209, "num_tokens": 30615671.0, "reward": 0.125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.6112583875656128, "sampling/importance_sampling_ratio/mean": 0.9997324347496033, "sampling/importance_sampling_ratio/min": 0.6310673356056213, "sampling/sampling_logp_difference/max": 0.47701549530029297, "sampling/sampling_logp_difference/mean": 0.01641049236059189, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 316.0625, "completions/mean_terminated_length": 316.0625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.4902362823486328, "epoch": 0.866421568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.7748865240721106, "kl": 0.022882036864757538, "learning_rate": 8.964054697649388e-07, "loss": -0.0235, "num_tokens": 30654987.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.4385961294174194, "sampling/importance_sampling_ratio/mean": 0.9999081492424011, "sampling/importance_sampling_ratio/min": 0.5480189323425293, "sampling/sampling_logp_difference/max": 0.6014454364776611, "sampling/sampling_logp_difference/mean": 0.015397557057440281, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 336.59375, "completions/mean_terminated_length": 336.59375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.37051692605018616, "epoch": 0.8676470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.014311632427486069, "kl": 0.01798148639500141, "learning_rate": 8.959708954444708e-07, "loss": 0.0002, "num_tokens": 30689969.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2983293533325195, "sampling/importance_sampling_ratio/mean": 1.0000379085540771, "sampling/importance_sampling_ratio/min": 0.6997737884521484, "sampling/sampling_logp_difference/max": 0.3569982051849365, "sampling/sampling_logp_difference/mean": 0.012473361566662788, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 385.125, "completions/mean_terminated_length": 385.125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.4827260375022888, "epoch": 0.8688725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.015024087064209617, "kl": 0.01851494237780571, "learning_rate": 8.955355173281707e-07, "loss": 0.0002, "num_tokens": 30729865.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3339203596115112, "sampling/importance_sampling_ratio/mean": 0.9998549222946167, "sampling/importance_sampling_ratio/min": 0.6453644037246704, "sampling/sampling_logp_difference/max": 0.4379401206970215, "sampling/sampling_logp_difference/mean": 0.0145803764462471, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 359.90625, "completions/mean_terminated_length": 359.90625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.4095238149166107, "epoch": 0.8700980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.01728535135110995, "kl": 0.019466537982225418, "learning_rate": 8.95099336299828e-07, "loss": 0.0002, "num_tokens": 30771715.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000145435333252, "sampling/importance_sampling_ratio/min": 0.47583886981010437, "sampling/sampling_logp_difference/max": 1.0642147064208984, "sampling/sampling_logp_difference/mean": 0.014873587526381016, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 414.734375, "completions/mean_terminated_length": 414.734375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.42103826999664307, "epoch": 0.8713235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01364624616781679, "kl": 0.0170944482088089, "learning_rate": 8.946623532448631e-07, "loss": 0.0001, "num_tokens": 30816706.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4964550733566284, "sampling/importance_sampling_ratio/mean": 1.0001229047775269, "sampling/importance_sampling_ratio/min": 0.5532041788101196, "sampling/sampling_logp_difference/max": 0.5920281410217285, "sampling/sampling_logp_difference/mean": 0.014076121151447296, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 287.359375, "completions/mean_terminated_length": 287.359375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.39210981130599976, "epoch": 0.8725490196078431, "frac_reward_zero_std": 0.75, "grad_norm": 0.7124865286891022, "kl": 0.023384779691696167, "learning_rate": 8.942245690503238e-07, "loss": 0.0002, "num_tokens": 30850889.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5487998723983765, "sampling/importance_sampling_ratio/mean": 0.9999485611915588, "sampling/importance_sampling_ratio/min": 0.671283483505249, "sampling/sampling_logp_difference/max": 0.4374803304672241, "sampling/sampling_logp_difference/mean": 0.013755500316619873, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 345.234375, "completions/mean_terminated_length": 345.234375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.32174766063690186, "epoch": 0.8737745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.014805164441209965, "kl": 0.01743542216718197, "learning_rate": 8.937859846048842e-07, "loss": 0.0002, "num_tokens": 30889640.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4739784002304077, "sampling/importance_sampling_ratio/mean": 0.999937117099762, "sampling/importance_sampling_ratio/min": 0.6974900960922241, "sampling/sampling_logp_difference/max": 0.3879650831222534, "sampling/sampling_logp_difference/mean": 0.010732530616223812, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 396.21875, "completions/mean_terminated_length": 396.21875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.41016513109207153, "epoch": 0.875, "frac_reward_zero_std": 1.0, "grad_norm": 0.01397275242382541, "kl": 0.02357039600610733, "learning_rate": 8.933466007988429e-07, "loss": 0.0002, "num_tokens": 30931750.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999474287033081, "sampling/importance_sampling_ratio/min": 0.7148044109344482, "sampling/sampling_logp_difference/max": 0.9235944747924805, "sampling/sampling_logp_difference/mean": 0.012909088283777237, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 316.640625, "completions/mean_terminated_length": 316.640625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.34647172689437866, "epoch": 0.8762254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.016525355607888398, "kl": 0.01966424286365509, "learning_rate": 8.929064185241212e-07, "loss": 0.0002, "num_tokens": 30964143.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5019327402114868, "sampling/importance_sampling_ratio/mean": 1.0001810789108276, "sampling/importance_sampling_ratio/min": 0.6117680072784424, "sampling/sampling_logp_difference/max": 0.49140214920043945, "sampling/sampling_logp_difference/mean": 0.012148333713412285, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 286.09375, "completions/mean_terminated_length": 286.09375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.39167314767837524, "epoch": 0.8774509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 0.777644471175359, "kl": 0.024472245946526527, "learning_rate": 8.924654386742611e-07, "loss": 0.013, "num_tokens": 30999173.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999997019767761, "sampling/importance_sampling_ratio/min": 0.6855020523071289, "sampling/sampling_logp_difference/max": 1.0046401023864746, "sampling/sampling_logp_difference/mean": 0.013427122496068478, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 297.296875, "completions/mean_terminated_length": 297.296875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.38246965408325195, "epoch": 0.8786764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9911292255037413, "kl": 0.022942353039979935, "learning_rate": 8.920236621444242e-07, "loss": 0.0258, "num_tokens": 31035864.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4327000379562378, "sampling/importance_sampling_ratio/mean": 1.0004740953445435, "sampling/importance_sampling_ratio/min": 0.7431004047393799, "sampling/sampling_logp_difference/max": 0.3595607280731201, "sampling/sampling_logp_difference/mean": 0.013057740405201912, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 276.890625, "completions/mean_terminated_length": 276.890625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.44520217180252075, "epoch": 0.8799019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.6449667376639696, "kl": 0.023685146123170853, "learning_rate": 8.915810898313884e-07, "loss": 0.0034, "num_tokens": 31073393.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.8556653261184692, "sampling/importance_sampling_ratio/mean": 0.9999388456344604, "sampling/importance_sampling_ratio/min": 0.3159169554710388, "sampling/sampling_logp_difference/max": 1.1522759199142456, "sampling/sampling_logp_difference/mean": 0.015152106992900372, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 369.546875, "completions/mean_terminated_length": 369.546875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.4615427851676941, "epoch": 0.8811274509803921, "frac_reward_zero_std": 0.5, "grad_norm": 0.7763612799665791, "kl": 0.021126801148056984, "learning_rate": 8.911377226335478e-07, "loss": 0.0182, "num_tokens": 31119268.0, "reward": -0.125, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.3641375303268433, "sampling/importance_sampling_ratio/mean": 0.9999488592147827, "sampling/importance_sampling_ratio/min": 0.6423540115356445, "sampling/sampling_logp_difference/max": 0.4426157474517822, "sampling/sampling_logp_difference/mean": 0.013408897444605827, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 338.125, "completions/mean_terminated_length": 338.125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.501103401184082, "epoch": 0.8823529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9710555571314374, "kl": 0.026367144659161568, "learning_rate": 8.906935614509095e-07, "loss": -0.0332, "num_tokens": 31156620.0, "reward": 0.0625, "reward_std": 0.5081988573074341, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4963816404342651, "sampling/importance_sampling_ratio/mean": 0.9999760389328003, "sampling/importance_sampling_ratio/min": 0.5035948753356934, "sampling/sampling_logp_difference/max": 0.6859831809997559, "sampling/sampling_logp_difference/mean": 0.01602405123412609, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 342.109375, "completions/mean_terminated_length": 342.109375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.6586544513702393, "epoch": 0.883578431372549, "frac_reward_zero_std": 0.5, "grad_norm": 0.7944001474671742, "kl": 0.02925422042608261, "learning_rate": 8.902486071850926e-07, "loss": 0.0087, "num_tokens": 31201427.0, "reward": 0.21875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.295644760131836, "sampling/importance_sampling_ratio/mean": 0.9996585845947266, "sampling/importance_sampling_ratio/min": 0.6885751485824585, "sampling/sampling_logp_difference/max": 0.37313079833984375, "sampling/sampling_logp_difference/mean": 0.018404096364974976, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 264.875, "completions/mean_terminated_length": 264.875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.5536998510360718, "epoch": 0.8848039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 0.9256244995302348, "kl": 0.03384938836097717, "learning_rate": 8.89802860739326e-07, "loss": -0.0557, "num_tokens": 31239083.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4681706428527832, "sampling/importance_sampling_ratio/mean": 0.9991198778152466, "sampling/importance_sampling_ratio/min": 0.7249538898468018, "sampling/sampling_logp_difference/max": 0.38401710987091064, "sampling/sampling_logp_difference/mean": 0.018321342766284943, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 376.625, "completions/mean_terminated_length": 376.625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.4550607204437256, "epoch": 0.8860294117647058, "frac_reward_zero_std": 0.75, "grad_norm": 0.5380417138216471, "kl": 0.02007366344332695, "learning_rate": 8.89356323018447e-07, "loss": 0.0135, "num_tokens": 31282387.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.445811152458191, "sampling/importance_sampling_ratio/mean": 1.0000183582305908, "sampling/importance_sampling_ratio/min": 0.6075330376625061, "sampling/sampling_logp_difference/max": 0.4983487129211426, "sampling/sampling_logp_difference/mean": 0.014206192456185818, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 297.84375, "completions/mean_terminated_length": 297.84375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.5446144938468933, "epoch": 0.8872549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 0.7897190183541916, "kl": 0.033268094062805176, "learning_rate": 8.889089949288986e-07, "loss": 0.0082, "num_tokens": 31315561.0, "reward": 0.5625, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5146278142929077, "sampling/importance_sampling_ratio/mean": 0.9997566938400269, "sampling/importance_sampling_ratio/min": 0.6263217926025391, "sampling/sampling_logp_difference/max": 0.46789097785949707, "sampling/sampling_logp_difference/mean": 0.017867328599095345, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 870.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 329.1875, "completions/mean_terminated_length": 329.1875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.3259159326553345, "epoch": 0.8884803921568627, "frac_reward_zero_std": 1.0, "grad_norm": 0.01275379738446722, "kl": 0.020558089017868042, "learning_rate": 8.884608773787288e-07, "loss": 0.0002, "num_tokens": 31350069.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5929149389266968, "sampling/importance_sampling_ratio/mean": 1.0002954006195068, "sampling/importance_sampling_ratio/min": 0.6208478212356567, "sampling/sampling_logp_difference/max": 0.4766693115234375, "sampling/sampling_logp_difference/mean": 0.012566398829221725, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 416.5, "completions/mean_terminated_length": 416.5, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.45964986085891724, "epoch": 0.8897058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.817804693990055, "kl": 0.0235702283680439, "learning_rate": 8.880119712775875e-07, "loss": 0.0554, "num_tokens": 31395157.0, "reward": 0.34375, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997406601905823, "sampling/importance_sampling_ratio/min": 0.6297717094421387, "sampling/sampling_logp_difference/max": 0.7287595272064209, "sampling/sampling_logp_difference/mean": 0.014322982169687748, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1033.0, "completions/max_terminated_length": 1033.0, "completions/mean_length": 437.078125, "completions/mean_terminated_length": 437.078125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.4758882522583008, "epoch": 0.8909313725490197, "frac_reward_zero_std": 0.75, "grad_norm": 0.5672353181245967, "kl": 0.017963998019695282, "learning_rate": 8.875622775367259e-07, "loss": 0.0646, "num_tokens": 31439050.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5383410453796387, "sampling/importance_sampling_ratio/mean": 1.0000653266906738, "sampling/importance_sampling_ratio/min": 0.6087182760238647, "sampling/sampling_logp_difference/max": 0.4963996410369873, "sampling/sampling_logp_difference/mean": 0.01347433216869831, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 909.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 428.140625, "completions/mean_terminated_length": 428.140625, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "entropy": 0.532041072845459, "epoch": 0.8921568627450981, "frac_reward_zero_std": 1.0, "grad_norm": 0.013913378796712022, "kl": 0.01613861694931984, "learning_rate": 8.871117970689937e-07, "loss": 0.0002, "num_tokens": 31485475.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.43590247631073, "sampling/importance_sampling_ratio/mean": 0.9998980164527893, "sampling/importance_sampling_ratio/min": 0.36422720551490784, "sampling/sampling_logp_difference/max": 1.0099774599075317, "sampling/sampling_logp_difference/mean": 0.015434736385941505, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 845.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 328.796875, "completions/mean_terminated_length": 328.796875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.35722512006759644, "epoch": 0.8933823529411765, "frac_reward_zero_std": 0.75, "grad_norm": 0.6404767852366403, "kl": 0.021408095955848694, "learning_rate": 8.866605307888376e-07, "loss": 0.0089, "num_tokens": 31523510.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6221520900726318, "sampling/importance_sampling_ratio/mean": 1.0003913640975952, "sampling/importance_sampling_ratio/min": 0.6338248252868652, "sampling/sampling_logp_difference/max": 0.48375368118286133, "sampling/sampling_logp_difference/mean": 0.01221582479774952, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 368.765625, "completions/mean_terminated_length": 368.765625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.39628538489341736, "epoch": 0.8946078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.014629832693195472, "kl": 0.019953759387135506, "learning_rate": 8.862084796122997e-07, "loss": 0.0002, "num_tokens": 31566327.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4131771326065063, "sampling/importance_sampling_ratio/mean": 1.0001435279846191, "sampling/importance_sampling_ratio/min": 0.6047029495239258, "sampling/sampling_logp_difference/max": 0.5030179023742676, "sampling/sampling_logp_difference/mean": 0.013723521493375301, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1031.0, "completions/max_terminated_length": 1031.0, "completions/mean_length": 362.203125, "completions/mean_terminated_length": 362.203125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.5532274842262268, "epoch": 0.8958333333333334, "frac_reward_zero_std": 0.75, "grad_norm": 0.7040495775551174, "kl": 0.022968608886003494, "learning_rate": 8.857556444570153e-07, "loss": -0.0088, "num_tokens": 31607284.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.4358900785446167, "sampling/importance_sampling_ratio/mean": 1.000300645828247, "sampling/importance_sampling_ratio/min": 0.6004891395568848, "sampling/sampling_logp_difference/max": 0.5100107192993164, "sampling/sampling_logp_difference/mean": 0.015881342813372612, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 359.484375, "completions/mean_terminated_length": 359.484375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.43632203340530396, "epoch": 0.8970588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.01891636204159928, "kl": 0.023500952869653702, "learning_rate": 8.853020262422109e-07, "loss": 0.0002, "num_tokens": 31644467.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6544971466064453, "sampling/importance_sampling_ratio/mean": 1.0004334449768066, "sampling/importance_sampling_ratio/min": 0.6148766875267029, "sampling/sampling_logp_difference/max": 0.5034971237182617, "sampling/sampling_logp_difference/mean": 0.015029322355985641, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 342.65625, "completions/mean_terminated_length": 342.65625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.35945552587509155, "epoch": 0.8982843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.7041923020437988, "kl": 0.019180793315172195, "learning_rate": 8.84847625888703e-07, "loss": -0.008, "num_tokens": 31687789.0, "reward": -0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.561460018157959, "sampling/importance_sampling_ratio/mean": 1.00018310546875, "sampling/importance_sampling_ratio/min": 0.6182036995887756, "sampling/sampling_logp_difference/max": 0.48093724250793457, "sampling/sampling_logp_difference/mean": 0.011956815607845783, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 691.0, "completions/max_terminated_length": 691.0, "completions/mean_length": 284.453125, "completions/mean_terminated_length": 284.453125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.49316346645355225, "epoch": 0.8995098039215687, "frac_reward_zero_std": 0.75, "grad_norm": 0.7480565449839165, "kl": 0.027410639449954033, "learning_rate": 8.843924443188953e-07, "loss": 0.0253, "num_tokens": 31725402.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.8893797397613525, "sampling/importance_sampling_ratio/mean": 0.9998879432678223, "sampling/importance_sampling_ratio/min": 0.5631163120269775, "sampling/sampling_logp_difference/max": 0.6362485885620117, "sampling/sampling_logp_difference/mean": 0.016136718913912773, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 319.875, "completions/mean_terminated_length": 319.875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.556473970413208, "epoch": 0.9007352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 0.8336523946851211, "kl": 0.040443070232868195, "learning_rate": 8.839364824567775e-07, "loss": 0.0229, "num_tokens": 31762546.0, "reward": 0.125, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.6837835311889648, "sampling/importance_sampling_ratio/mean": 0.9999013543128967, "sampling/importance_sampling_ratio/min": 0.43724527955055237, "sampling/sampling_logp_difference/max": 0.8272610306739807, "sampling/sampling_logp_difference/mean": 0.017163347452878952, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1241.0, "completions/max_terminated_length": 1241.0, "completions/mean_length": 509.4375, "completions/mean_terminated_length": 509.4375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.44953274726867676, "epoch": 0.9019607843137255, "frac_reward_zero_std": 0.5, "grad_norm": 0.6400098889389364, "kl": 0.020778927952051163, "learning_rate": 8.834797412279235e-07, "loss": -0.0634, "num_tokens": 31815934.0, "reward": 0.75, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6256163120269775, "sampling/importance_sampling_ratio/mean": 1.0003368854522705, "sampling/importance_sampling_ratio/min": 0.6810584664344788, "sampling/sampling_logp_difference/max": 0.4858870506286621, "sampling/sampling_logp_difference/mean": 0.013490157201886177, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 256.125, "completions/mean_terminated_length": 256.125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.44903990626335144, "epoch": 0.9031862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.6931252285179034, "kl": 0.042038414627313614, "learning_rate": 8.83022221559489e-07, "loss": 0.0028, "num_tokens": 31846646.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.5467334985733032, "sampling/importance_sampling_ratio/mean": 1.0000722408294678, "sampling/importance_sampling_ratio/min": 0.6189770102500916, "sampling/sampling_logp_difference/max": 0.4796872138977051, "sampling/sampling_logp_difference/mean": 0.014955378137528896, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 307.5, "completions/mean_terminated_length": 307.5, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.49277564883232117, "epoch": 0.9044117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.019329843289040385, "kl": 0.025892823934555054, "learning_rate": 8.825639243802098e-07, "loss": 0.0002, "num_tokens": 31888582.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4402774572372437, "sampling/importance_sampling_ratio/mean": 0.9999434947967529, "sampling/importance_sampling_ratio/min": 0.6719549894332886, "sampling/sampling_logp_difference/max": 0.3975639343261719, "sampling/sampling_logp_difference/mean": 0.015849536284804344, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 935.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 372.859375, "completions/mean_terminated_length": 372.859375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.38229697942733765, "epoch": 0.9056372549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.013627822610147092, "kl": 0.022295242175459862, "learning_rate": 8.821048506204005e-07, "loss": 0.0002, "num_tokens": 31926541.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.657578468322754, "sampling/importance_sampling_ratio/mean": 1.0000666379928589, "sampling/importance_sampling_ratio/min": 0.698795735836029, "sampling/sampling_logp_difference/max": 0.5053577423095703, "sampling/sampling_logp_difference/mean": 0.01312587782740593, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 395.25, "completions/mean_terminated_length": 395.25, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.5226728916168213, "epoch": 0.9068627450980392, "frac_reward_zero_std": 0.75, "grad_norm": 0.6163787590902893, "kl": 0.0198008194565773, "learning_rate": 8.816450012119513e-07, "loss": 0.001, "num_tokens": 31975405.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.6274622678756714, "sampling/importance_sampling_ratio/mean": 1.0003511905670166, "sampling/importance_sampling_ratio/min": 0.6868337392807007, "sampling/sampling_logp_difference/max": 0.48702192306518555, "sampling/sampling_logp_difference/mean": 0.015741616487503052, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 284.15625, "completions/mean_terminated_length": 284.15625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.4839494228363037, "epoch": 0.9080882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.017210449425352974, "kl": 0.025250347331166267, "learning_rate": 8.811843770883276e-07, "loss": 0.0002, "num_tokens": 32011463.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4211852550506592, "sampling/importance_sampling_ratio/mean": 1.0002727508544922, "sampling/importance_sampling_ratio/min": 0.7050158381462097, "sampling/sampling_logp_difference/max": 0.35149121284484863, "sampling/sampling_logp_difference/mean": 0.01510937325656414, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 392.46875, "completions/mean_terminated_length": 392.46875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.5024335980415344, "epoch": 0.9093137254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.6019128812448213, "kl": 0.02074708789587021, "learning_rate": 8.807229791845671e-07, "loss": -0.0115, "num_tokens": 32056309.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.5075002908706665, "sampling/importance_sampling_ratio/mean": 0.9999599456787109, "sampling/importance_sampling_ratio/min": 0.5254800319671631, "sampling/sampling_logp_difference/max": 0.6434431076049805, "sampling/sampling_logp_difference/mean": 0.014140726998448372, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 800.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 398.328125, "completions/mean_terminated_length": 398.328125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3965330719947815, "epoch": 0.9105392156862745, "frac_reward_zero_std": 0.75, "grad_norm": 0.5838747751048728, "kl": 0.020657097920775414, "learning_rate": 8.802608084372785e-07, "loss": -0.0096, "num_tokens": 32103082.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.491632342338562, "sampling/importance_sampling_ratio/mean": 1.0002247095108032, "sampling/importance_sampling_ratio/min": 0.48887839913368225, "sampling/sampling_logp_difference/max": 0.7156414985656738, "sampling/sampling_logp_difference/mean": 0.0129943136125803, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1426.0, "completions/max_terminated_length": 1426.0, "completions/mean_length": 414.0625, "completions/mean_terminated_length": 414.0625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.3834943175315857, "epoch": 0.9117647058823529, "frac_reward_zero_std": 0.75, "grad_norm": 0.8609348435845935, "kl": 0.018454642966389656, "learning_rate": 8.79797865784639e-07, "loss": -0.138, "num_tokens": 32146622.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5326391458511353, "sampling/importance_sampling_ratio/mean": 0.9998395442962646, "sampling/importance_sampling_ratio/min": 0.2869154214859009, "sampling/sampling_logp_difference/max": 1.248567819595337, "sampling/sampling_logp_difference/mean": 0.011907264590263367, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1274.0, "completions/max_terminated_length": 1274.0, "completions/mean_length": 375.1875, "completions/mean_terminated_length": 375.1875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.4463527798652649, "epoch": 0.9129901960784313, "frac_reward_zero_std": 1.0, "grad_norm": 0.01876392984039807, "kl": 0.020286226645112038, "learning_rate": 8.793341521663928e-07, "loss": 0.0002, "num_tokens": 32189338.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3221745491027832, "sampling/importance_sampling_ratio/mean": 0.9999237656593323, "sampling/importance_sampling_ratio/min": 0.7147690057754517, "sampling/sampling_logp_difference/max": 0.33579587936401367, "sampling/sampling_logp_difference/mean": 0.013206128031015396, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 417.1875, "completions/mean_terminated_length": 417.1875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.4341197609901428, "epoch": 0.9142156862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.013530111671297279, "kl": 0.022070419043302536, "learning_rate": 8.788696685238494e-07, "loss": 0.0002, "num_tokens": 32234374.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000077486038208, "sampling/importance_sampling_ratio/min": 0.6133654713630676, "sampling/sampling_logp_difference/max": 0.7048373222351074, "sampling/sampling_logp_difference/mean": 0.013925082981586456, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 310.921875, "completions/mean_terminated_length": 310.921875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.37180620431900024, "epoch": 0.9154411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.027842341199526055, "kl": 0.025955578312277794, "learning_rate": 8.784044157998809e-07, "loss": 0.0003, "num_tokens": 32268817.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.528430700302124, "sampling/importance_sampling_ratio/mean": 1.0001084804534912, "sampling/importance_sampling_ratio/min": 0.5604355931282043, "sampling/sampling_logp_difference/max": 0.5790410041809082, "sampling/sampling_logp_difference/mean": 0.012442877516150475, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1177.0, "completions/max_terminated_length": 1177.0, "completions/mean_length": 431.5, "completions/mean_terminated_length": 431.5, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.5724613666534424, "epoch": 0.9166666666666666, "frac_reward_zero_std": 0.5, "grad_norm": 0.7515077372673653, "kl": 0.035352710634469986, "learning_rate": 8.779383949389208e-07, "loss": 0.0253, "num_tokens": 32315585.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5651153326034546, "sampling/importance_sampling_ratio/mean": 1.0002895593643188, "sampling/importance_sampling_ratio/min": 0.6223650574684143, "sampling/sampling_logp_difference/max": 0.4742283821105957, "sampling/sampling_logp_difference/mean": 0.01628047786653042, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 283.625, "completions/mean_terminated_length": 283.625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.37133079767227173, "epoch": 0.9178921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.017530524748961513, "kl": 0.02519168332219124, "learning_rate": 8.774716068869623e-07, "loss": 0.0002, "num_tokens": 32350057.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4340932369232178, "sampling/importance_sampling_ratio/mean": 1.00054132938385, "sampling/importance_sampling_ratio/min": 0.6725956201553345, "sampling/sampling_logp_difference/max": 0.3966110348701477, "sampling/sampling_logp_difference/mean": 0.012611262500286102, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1471.0, "completions/max_terminated_length": 1471.0, "completions/mean_length": 581.359375, "completions/mean_terminated_length": 581.359375, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "entropy": 0.48914778232574463, "epoch": 0.9191176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.00954940993292785, "kl": 0.013180272653698921, "learning_rate": 8.770040525915553e-07, "loss": 0.0001, "num_tokens": 32417568.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8179789781570435, "sampling/importance_sampling_ratio/mean": 1.0000007152557373, "sampling/importance_sampling_ratio/min": 0.4797821640968323, "sampling/sampling_logp_difference/max": 0.7344231605529785, "sampling/sampling_logp_difference/mean": 0.014325776137411594, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1308.0, "completions/max_terminated_length": 1308.0, "completions/mean_length": 370.921875, "completions/mean_terminated_length": 370.921875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.4034190773963928, "epoch": 0.9203431372549019, "frac_reward_zero_std": 0.75, "grad_norm": 0.5112086227041801, "kl": 0.021904047578573227, "learning_rate": 8.765357330018055e-07, "loss": 0.0048, "num_tokens": 32458091.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5517206192016602, "sampling/importance_sampling_ratio/mean": 0.9999547004699707, "sampling/importance_sampling_ratio/min": 0.6151249408721924, "sampling/sampling_logp_difference/max": 0.48592984676361084, "sampling/sampling_logp_difference/mean": 0.013530261814594269, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 842.0, "completions/max_terminated_length": 842.0, "completions/mean_length": 420.6875, "completions/mean_terminated_length": 420.6875, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "entropy": 0.4275951087474823, "epoch": 0.9215686274509803, "frac_reward_zero_std": 1.0, "grad_norm": 0.020189907886555617, "kl": 0.02280484139919281, "learning_rate": 8.760666490683719e-07, "loss": 0.0002, "num_tokens": 32501671.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4025825262069702, "sampling/importance_sampling_ratio/mean": 0.9999439716339111, "sampling/importance_sampling_ratio/min": 0.7065174579620361, "sampling/sampling_logp_difference/max": 0.34740734100341797, "sampling/sampling_logp_difference/mean": 0.013040612451732159, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 892.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 384.046875, "completions/mean_terminated_length": 384.046875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.5414404273033142, "epoch": 0.9227941176470589, "frac_reward_zero_std": 0.75, "grad_norm": 0.5478586401732926, "kl": 0.02901121973991394, "learning_rate": 8.755968017434651e-07, "loss": 0.0236, "num_tokens": 32543114.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.458870768547058, "sampling/importance_sampling_ratio/mean": 1.000278115272522, "sampling/importance_sampling_ratio/min": 0.6762279272079468, "sampling/sampling_logp_difference/max": 0.39122509956359863, "sampling/sampling_logp_difference/mean": 0.016880135983228683, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 911.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 426.171875, "completions/mean_terminated_length": 426.171875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.4162246584892273, "epoch": 0.9240196078431373, "frac_reward_zero_std": 1.0, "grad_norm": 0.011701159312105405, "kl": 0.018623705953359604, "learning_rate": 8.751261919808457e-07, "loss": 0.0002, "num_tokens": 32590965.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6827442646026611, "sampling/importance_sampling_ratio/mean": 0.9998997449874878, "sampling/importance_sampling_ratio/min": 0.0006267652497626841, "sampling/sampling_logp_difference/max": 7.374938488006592, "sampling/sampling_logp_difference/mean": 0.013455774635076523, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 381.578125, "completions/mean_terminated_length": 381.578125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.36337247490882874, "epoch": 0.9252450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.014293116723183525, "kl": 0.018057968467473984, "learning_rate": 8.746548207358215e-07, "loss": 0.0002, "num_tokens": 32638746.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4427088499069214, "sampling/importance_sampling_ratio/mean": 1.0000278949737549, "sampling/importance_sampling_ratio/min": 0.6498275399208069, "sampling/sampling_logp_difference/max": 0.43104827404022217, "sampling/sampling_logp_difference/mean": 0.01198431197553873, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1252.0, "completions/max_terminated_length": 1252.0, "completions/mean_length": 472.46875, "completions/mean_terminated_length": 472.46875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.5639067888259888, "epoch": 0.9264705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.011787061912905851, "kl": 0.01830962300300598, "learning_rate": 8.741826889652463e-07, "loss": 0.0002, "num_tokens": 32692088.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4069997072219849, "sampling/importance_sampling_ratio/mean": 0.9997031688690186, "sampling/importance_sampling_ratio/min": 0.4462621510028839, "sampling/sampling_logp_difference/max": 0.8068487644195557, "sampling/sampling_logp_difference/mean": 0.016279976814985275, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 419.8125, "completions/mean_terminated_length": 419.8125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.3742711544036865, "epoch": 0.9276960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.011722956826782444, "kl": 0.01765141263604164, "learning_rate": 8.737097976275176e-07, "loss": 0.0002, "num_tokens": 32734668.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.365173578262329, "sampling/importance_sampling_ratio/mean": 0.9999525547027588, "sampling/importance_sampling_ratio/min": 0.6793873906135559, "sampling/sampling_logp_difference/max": 0.386563777923584, "sampling/sampling_logp_difference/mean": 0.011362585239112377, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 371.5625, "completions/mean_terminated_length": 371.5625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.4998852014541626, "epoch": 0.928921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.6791716349517898, "kl": 0.021393822506070137, "learning_rate": 8.73236147682575e-07, "loss": 0.0075, "num_tokens": 32783744.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.4753928184509277, "sampling/importance_sampling_ratio/mean": 0.9998161196708679, "sampling/importance_sampling_ratio/min": 0.23717880249023438, "sampling/sampling_logp_difference/max": 1.4389410018920898, "sampling/sampling_logp_difference/mean": 0.014953481033444405, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 354.015625, "completions/mean_terminated_length": 354.015625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.4339810013771057, "epoch": 0.9301470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.019549296899575012, "kl": 0.024197572842240334, "learning_rate": 8.727617400918978e-07, "loss": 0.0002, "num_tokens": 32826913.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8100659847259521, "sampling/importance_sampling_ratio/mean": 1.0001351833343506, "sampling/importance_sampling_ratio/min": 0.6980865001678467, "sampling/sampling_logp_difference/max": 0.5933632850646973, "sampling/sampling_logp_difference/mean": 0.012737960554659367, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1071.0, "completions/max_terminated_length": 1071.0, "completions/mean_length": 487.59375, "completions/mean_terminated_length": 487.59375, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.5857847332954407, "epoch": 0.9313725490196079, "frac_reward_zero_std": 0.75, "grad_norm": 0.5140119348516102, "kl": 0.021159294992685318, "learning_rate": 8.722865758185035e-07, "loss": -0.0328, "num_tokens": 32875927.0, "reward": -0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": -0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.4891201257705688, "sampling/importance_sampling_ratio/mean": 0.999872624874115, "sampling/importance_sampling_ratio/min": 0.532627284526825, "sampling/sampling_logp_difference/max": 0.6299333572387695, "sampling/sampling_logp_difference/mean": 0.015791919082403183, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1161.0, "completions/max_terminated_length": 1161.0, "completions/mean_length": 460.046875, "completions/mean_terminated_length": 460.046875, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "entropy": 0.4879535734653473, "epoch": 0.9325980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.4364535824617912, "kl": 0.019509416073560715, "learning_rate": 8.718106558269452e-07, "loss": 0.0091, "num_tokens": 32924330.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4774715900421143, "sampling/importance_sampling_ratio/mean": 0.9999589920043945, "sampling/importance_sampling_ratio/min": 0.6147370934486389, "sampling/sampling_logp_difference/max": 0.486560583114624, "sampling/sampling_logp_difference/mean": 0.014054717496037483, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 307.765625, "completions/mean_terminated_length": 307.765625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.4420529901981354, "epoch": 0.9338235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.017857338178772084, "kl": 0.026087211444973946, "learning_rate": 8.713339810833105e-07, "loss": 0.0002, "num_tokens": 32957451.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.544487476348877, "sampling/importance_sampling_ratio/mean": 0.9999828338623047, "sampling/importance_sampling_ratio/min": 0.6100528240203857, "sampling/sampling_logp_difference/max": 0.49420976638793945, "sampling/sampling_logp_difference/mean": 0.016059590503573418, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 853.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 390.390625, "completions/mean_terminated_length": 390.390625, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.4430007338523865, "epoch": 0.9350490196078431, "frac_reward_zero_std": 0.75, "grad_norm": 0.7479477498266563, "kl": 0.023601781576871872, "learning_rate": 8.708565525552189e-07, "loss": 0.0455, "num_tokens": 33000916.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5776598453521729, "sampling/importance_sampling_ratio/mean": 1.0001369714736938, "sampling/importance_sampling_ratio/min": 0.6990218758583069, "sampling/sampling_logp_difference/max": 0.45594263076782227, "sampling/sampling_logp_difference/mean": 0.01377125270664692, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 902.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 466.90625, "completions/mean_terminated_length": 466.90625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.5332295894622803, "epoch": 0.9362745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 0.5332477556084468, "kl": 0.02071942761540413, "learning_rate": 8.703783712118202e-07, "loss": 0.0153, "num_tokens": 33054206.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.498400092124939, "sampling/importance_sampling_ratio/mean": 0.9999392032623291, "sampling/importance_sampling_ratio/min": 0.5586374998092651, "sampling/sampling_logp_difference/max": 0.5822544097900391, "sampling/sampling_logp_difference/mean": 0.0155078349635005, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 339.359375, "completions/mean_terminated_length": 339.359375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.4022904932498932, "epoch": 0.9375, "frac_reward_zero_std": 1.0, "grad_norm": 0.014200839523558093, "kl": 0.02079068124294281, "learning_rate": 8.69899438023792e-07, "loss": 0.0002, "num_tokens": 33092085.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4555063247680664, "sampling/importance_sampling_ratio/mean": 0.9998932480812073, "sampling/importance_sampling_ratio/min": 0.602207601070404, "sampling/sampling_logp_difference/max": 0.5071530342102051, "sampling/sampling_logp_difference/mean": 0.013255606405436993, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1097.0, "completions/max_terminated_length": 1097.0, "completions/mean_length": 551.328125, "completions/mean_terminated_length": 551.328125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.3926410675048828, "epoch": 0.9387254901960784, "frac_reward_zero_std": 0.5, "grad_norm": 0.5825876400504038, "kl": 0.014321057125926018, "learning_rate": 8.694197539633385e-07, "loss": 0.0353, "num_tokens": 33147354.0, "reward": -0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.884231686592102, "sampling/importance_sampling_ratio/mean": 1.000187873840332, "sampling/importance_sampling_ratio/min": 0.7368322610855103, "sampling/sampling_logp_difference/max": 0.6335201263427734, "sampling/sampling_logp_difference/mean": 0.011570739559829235, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1115.0, "completions/max_terminated_length": 1115.0, "completions/mean_length": 317.328125, "completions/mean_terminated_length": 317.328125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.5542346239089966, "epoch": 0.9399509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 0.6842125500674675, "kl": 0.023935165256261826, "learning_rate": 8.689393200041878e-07, "loss": 0.0287, "num_tokens": 33187471.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6238120794296265, "sampling/importance_sampling_ratio/mean": 1.0001353025436401, "sampling/importance_sampling_ratio/min": 0.5973368287086487, "sampling/sampling_logp_difference/max": 0.5152740478515625, "sampling/sampling_logp_difference/mean": 0.017830558121204376, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 931.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 384.40625, "completions/mean_terminated_length": 384.40625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.5426250100135803, "epoch": 0.9411764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.837349939289904, "kl": 0.029200609773397446, "learning_rate": 8.684581371215904e-07, "loss": -0.053, "num_tokens": 33236025.0, "reward": 0.65625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998906850814819, "sampling/importance_sampling_ratio/min": 0.3890766203403473, "sampling/sampling_logp_difference/max": 0.943979024887085, "sampling/sampling_logp_difference/mean": 0.01616964116692543, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 308.5625, "completions/mean_terminated_length": 308.5625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.5316200256347656, "epoch": 0.9424019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 0.9760659659855797, "kl": 0.036749474704265594, "learning_rate": 8.679762062923175e-07, "loss": -0.0466, "num_tokens": 33272045.0, "reward": 0.40625, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.492100477218628, "sampling/importance_sampling_ratio/mean": 1.0002315044403076, "sampling/importance_sampling_ratio/min": 0.6574996113777161, "sampling/sampling_logp_difference/max": 0.41931116580963135, "sampling/sampling_logp_difference/mean": 0.01727050170302391, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 346.234375, "completions/mean_terminated_length": 346.234375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.37126997113227844, "epoch": 0.9436274509803921, "frac_reward_zero_std": 1.0, "grad_norm": 0.018978138810309488, "kl": 0.02302279695868492, "learning_rate": 8.674935284946576e-07, "loss": 0.0002, "num_tokens": 33307404.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4042887687683105, "sampling/importance_sampling_ratio/mean": 0.9995453357696533, "sampling/importance_sampling_ratio/min": 0.7006571888923645, "sampling/sampling_logp_difference/max": 0.35573649406433105, "sampling/sampling_logp_difference/mean": 0.012772996909916401, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 309.890625, "completions/mean_terminated_length": 309.890625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.34347817301750183, "epoch": 0.9448529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.013628329759017689, "kl": 0.017877968028187752, "learning_rate": 8.670101047084162e-07, "loss": 0.0002, "num_tokens": 33342549.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2777674198150635, "sampling/importance_sampling_ratio/mean": 0.9996640682220459, "sampling/importance_sampling_ratio/min": 0.6257290840148926, "sampling/sampling_logp_difference/max": 0.4688377380371094, "sampling/sampling_logp_difference/mean": 0.01090667862445116, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1275.0, "completions/max_terminated_length": 1275.0, "completions/mean_length": 414.234375, "completions/mean_terminated_length": 414.234375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.5014985799789429, "epoch": 0.946078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.5301035220460291, "kl": 0.022226277738809586, "learning_rate": 8.66525935914913e-07, "loss": -0.0281, "num_tokens": 33383796.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.3828449249267578, "sampling/importance_sampling_ratio/mean": 0.9997643828392029, "sampling/importance_sampling_ratio/min": 0.6352885365486145, "sampling/sampling_logp_difference/max": 0.4536759853363037, "sampling/sampling_logp_difference/mean": 0.016124963760375977, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1066.0, "completions/max_terminated_length": 1066.0, "completions/mean_length": 351.390625, "completions/mean_terminated_length": 351.390625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.5142149925231934, "epoch": 0.9473039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 0.7478030127854184, "kl": 0.023353274911642075, "learning_rate": 8.660410230969804e-07, "loss": 0.0386, "num_tokens": 33422349.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4789059162139893, "sampling/importance_sampling_ratio/mean": 0.9996322393417358, "sampling/importance_sampling_ratio/min": 0.6452317833900452, "sampling/sampling_logp_difference/max": 0.4381457567214966, "sampling/sampling_logp_difference/mean": 0.0156090147793293, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 304.71875, "completions/mean_terminated_length": 304.71875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.42543014883995056, "epoch": 0.9485294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.017432519861553416, "kl": 0.023180879652500153, "learning_rate": 8.655553672389599e-07, "loss": 0.0002, "num_tokens": 33457995.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5036791563034058, "sampling/importance_sampling_ratio/mean": 1.0003209114074707, "sampling/importance_sampling_ratio/min": 0.7028471231460571, "sampling/sampling_logp_difference/max": 0.4079148769378662, "sampling/sampling_logp_difference/mean": 0.014168776571750641, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1025.0, "completions/max_terminated_length": 1025.0, "completions/mean_length": 390.1875, "completions/mean_terminated_length": 390.1875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.45940449833869934, "epoch": 0.9497549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.013102404721414534, "kl": 0.01975955069065094, "learning_rate": 8.650689693267026e-07, "loss": 0.0002, "num_tokens": 33505703.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000032663345337, "sampling/importance_sampling_ratio/min": 0.26294881105422974, "sampling/sampling_logp_difference/max": 1.5432929992675781, "sampling/sampling_logp_difference/mean": 0.014355039224028587, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 325.90625, "completions/mean_terminated_length": 325.90625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.5109738111495972, "epoch": 0.9509803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 0.6150933041939846, "kl": 0.029240088537335396, "learning_rate": 8.645818303475654e-07, "loss": 0.0069, "num_tokens": 33544017.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.3374592065811157, "sampling/importance_sampling_ratio/mean": 1.0002509355545044, "sampling/importance_sampling_ratio/min": 0.6825330257415771, "sampling/sampling_logp_difference/max": 0.3819444179534912, "sampling/sampling_logp_difference/mean": 0.015017151832580566, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1457.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 404.328125, "completions/mean_terminated_length": 404.328125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.41790202260017395, "epoch": 0.9522058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.015184516407552785, "kl": 0.023636966943740845, "learning_rate": 8.640939512904095e-07, "loss": 0.0002, "num_tokens": 33589718.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5971765518188477, "sampling/importance_sampling_ratio/mean": 1.0001696348190308, "sampling/importance_sampling_ratio/min": 0.6354519724845886, "sampling/sampling_logp_difference/max": 0.46823740005493164, "sampling/sampling_logp_difference/mean": 0.0133011220023036, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 358.890625, "completions/mean_terminated_length": 358.890625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.4736621081829071, "epoch": 0.9534313725490197, "frac_reward_zero_std": 1.0, "grad_norm": 0.013535720024682464, "kl": 0.017960095778107643, "learning_rate": 8.636053331455986e-07, "loss": 0.0002, "num_tokens": 33630591.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6090198755264282, "sampling/importance_sampling_ratio/mean": 0.9995553493499756, "sampling/importance_sampling_ratio/min": 0.7386454343795776, "sampling/sampling_logp_difference/max": 0.47562527656555176, "sampling/sampling_logp_difference/mean": 0.014501764439046383, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1111.0, "completions/max_terminated_length": 1111.0, "completions/mean_length": 473.34375, "completions/mean_terminated_length": 473.34375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.4213956296443939, "epoch": 0.9546568627450981, "frac_reward_zero_std": 1.0, "grad_norm": 0.01224713679391843, "kl": 0.018748031929135323, "learning_rate": 8.631159769049964e-07, "loss": 0.0002, "num_tokens": 33682021.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.489452600479126, "sampling/importance_sampling_ratio/mean": 0.9996762275695801, "sampling/importance_sampling_ratio/min": 0.6198264360427856, "sampling/sampling_logp_difference/max": 0.4783158302307129, "sampling/sampling_logp_difference/mean": 0.01329081505537033, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 943.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 376.109375, "completions/mean_terminated_length": 376.109375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.5324567556381226, "epoch": 0.9558823529411765, "frac_reward_zero_std": 0.75, "grad_norm": 0.59564462410675, "kl": 0.027574826031923294, "learning_rate": 8.626258835619653e-07, "loss": -0.0439, "num_tokens": 33722300.0, "reward": -0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.435339093208313, "sampling/importance_sampling_ratio/mean": 0.9996751546859741, "sampling/importance_sampling_ratio/min": 0.5892667770385742, "sampling/sampling_logp_difference/max": 0.5288763046264648, "sampling/sampling_logp_difference/mean": 0.015323950909078121, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1073.0, "completions/max_terminated_length": 1073.0, "completions/mean_length": 493.0625, "completions/mean_terminated_length": 493.0625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.47125011682510376, "epoch": 0.9571078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.010258146287448432, "kl": 0.01772441528737545, "learning_rate": 8.621350541113636e-07, "loss": 0.0002, "num_tokens": 33771968.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3770774602890015, "sampling/importance_sampling_ratio/mean": 0.999889612197876, "sampling/importance_sampling_ratio/min": 0.39073821902275085, "sampling/sampling_logp_difference/max": 0.9397174715995789, "sampling/sampling_logp_difference/mean": 0.013897532597184181, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 309.546875, "completions/mean_terminated_length": 309.546875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.46683269739151, "epoch": 0.9583333333333334, "frac_reward_zero_std": 0.75, "grad_norm": 0.7653388830883125, "kl": 0.02702484093606472, "learning_rate": 8.616434895495439e-07, "loss": 0.0303, "num_tokens": 33805059.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.323447585105896, "sampling/importance_sampling_ratio/mean": 1.0003453493118286, "sampling/importance_sampling_ratio/min": 0.621431291103363, "sampling/sampling_logp_difference/max": 0.47572994232177734, "sampling/sampling_logp_difference/mean": 0.015248659998178482, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 412.6875, "completions/mean_terminated_length": 412.6875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.5164862275123596, "epoch": 0.9595588235294118, "frac_reward_zero_std": 0.75, "grad_norm": 0.5578046752934228, "kl": 0.023028280586004257, "learning_rate": 8.611511908743514e-07, "loss": -0.0248, "num_tokens": 33846127.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5708072185516357, "sampling/importance_sampling_ratio/mean": 0.9998116493225098, "sampling/importance_sampling_ratio/min": 0.5551521182060242, "sampling/sampling_logp_difference/max": 0.5885131359100342, "sampling/sampling_logp_difference/mean": 0.014850430190563202, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 319.96875, "completions/mean_terminated_length": 319.96875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.46724486351013184, "epoch": 0.9607843137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.016332395355383937, "kl": 0.029078485444188118, "learning_rate": 8.606581590851208e-07, "loss": 0.0002, "num_tokens": 33881645.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.474380612373352, "sampling/importance_sampling_ratio/mean": 0.9995901584625244, "sampling/importance_sampling_ratio/min": 0.6537459492683411, "sampling/sampling_logp_difference/max": 0.4250364303588867, "sampling/sampling_logp_difference/mean": 0.015130121260881424, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 420.0625, "completions/mean_terminated_length": 420.0625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "entropy": 0.5946223735809326, "epoch": 0.9620098039215687, "frac_reward_zero_std": 0.75, "grad_norm": 0.4689450161838002, "kl": 0.024493422359228134, "learning_rate": 8.601643951826758e-07, "loss": 0.004, "num_tokens": 33928257.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6643285751342773, "sampling/importance_sampling_ratio/mean": 1.0002706050872803, "sampling/importance_sampling_ratio/min": 0.6716874241828918, "sampling/sampling_logp_difference/max": 0.5094218254089355, "sampling/sampling_logp_difference/mean": 0.016302669420838356, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1352.0, "completions/max_terminated_length": 1352.0, "completions/mean_length": 398.34375, "completions/mean_terminated_length": 398.34375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.46796315908432007, "epoch": 0.9632352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.015132054188836952, "kl": 0.022569676861166954, "learning_rate": 8.596699001693255e-07, "loss": 0.0002, "num_tokens": 33970007.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.630755066871643, "sampling/importance_sampling_ratio/mean": 1.0002809762954712, "sampling/importance_sampling_ratio/min": 0.3221859931945801, "sampling/sampling_logp_difference/max": 1.1326262950897217, "sampling/sampling_logp_difference/mean": 0.013694694265723228, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1033.0, "completions/max_terminated_length": 1033.0, "completions/mean_length": 459.34375, "completions/mean_terminated_length": 459.34375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.5031492114067078, "epoch": 0.9644607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 0.36243460852046816, "kl": 0.02985949069261551, "learning_rate": 8.591746750488637e-07, "loss": -0.0012, "num_tokens": 34019117.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.4361599683761597, "sampling/importance_sampling_ratio/mean": 0.9999244809150696, "sampling/importance_sampling_ratio/min": 0.6517505049705505, "sampling/sampling_logp_difference/max": 0.42809343338012695, "sampling/sampling_logp_difference/mean": 0.013905894942581654, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 308.375, "completions/mean_terminated_length": 308.375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.5398274064064026, "epoch": 0.9656862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.018345620518145523, "kl": 0.028484433889389038, "learning_rate": 8.58678720826566e-07, "loss": 0.0003, "num_tokens": 34055973.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9376873970031738, "sampling/importance_sampling_ratio/mean": 1.0006375312805176, "sampling/importance_sampling_ratio/min": 0.6091328263282776, "sampling/sampling_logp_difference/max": 0.6614952087402344, "sampling/sampling_logp_difference/mean": 0.01680939644575119, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 939.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 402.609375, "completions/mean_terminated_length": 402.609375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.339847594499588, "epoch": 0.9669117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.011100218330461736, "kl": 0.016965359449386597, "learning_rate": 8.58182038509188e-07, "loss": 0.0002, "num_tokens": 34099516.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5940072536468506, "sampling/importance_sampling_ratio/mean": 0.9999998211860657, "sampling/importance_sampling_ratio/min": 0.6368657350540161, "sampling/sampling_logp_difference/max": 0.4662511348724365, "sampling/sampling_logp_difference/mean": 0.010887120850384235, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 281.59375, "completions/mean_terminated_length": 281.59375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.5514305233955383, "epoch": 0.9681372549019608, "frac_reward_zero_std": 0.75, "grad_norm": 0.5826172176742026, "kl": 0.03653998300433159, "learning_rate": 8.576846291049633e-07, "loss": 0.0104, "num_tokens": 34137538.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.481687068939209, "sampling/importance_sampling_ratio/mean": 0.9999902844429016, "sampling/importance_sampling_ratio/min": 0.6113826632499695, "sampling/sampling_logp_difference/max": 0.4920322895050049, "sampling/sampling_logp_difference/mean": 0.016111690551042557, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 855.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 355.765625, "completions/mean_terminated_length": 355.765625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.69139164686203, "epoch": 0.9693627450980392, "frac_reward_zero_std": 0.5, "grad_norm": 0.7231817872554617, "kl": 0.04800046607851982, "learning_rate": 8.571864936236015e-07, "loss": 0.0578, "num_tokens": 34174723.0, "reward": -0.71875, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": -0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.5678832530975342, "sampling/importance_sampling_ratio/mean": 0.9999889731407166, "sampling/importance_sampling_ratio/min": 0.6368663311004639, "sampling/sampling_logp_difference/max": 0.45119547843933105, "sampling/sampling_logp_difference/mean": 0.018912168219685555, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 898.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 356.09375, "completions/mean_terminated_length": 356.09375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.47070953249931335, "epoch": 0.9705882352941176, "frac_reward_zero_std": 0.75, "grad_norm": 0.5694541596659811, "kl": 0.02229326218366623, "learning_rate": 8.56687633076286e-07, "loss": -0.0088, "num_tokens": 34213865.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.698731780052185, "sampling/importance_sampling_ratio/mean": 0.9992849826812744, "sampling/importance_sampling_ratio/min": 0.6045060157775879, "sampling/sampling_logp_difference/max": 0.5298819541931152, "sampling/sampling_logp_difference/mean": 0.014636294916272163, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 336.515625, "completions/mean_terminated_length": 336.515625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.488922655582428, "epoch": 0.9718137254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.01534817558399206, "kl": 0.029156874865293503, "learning_rate": 8.561880484756724e-07, "loss": 0.0002, "num_tokens": 34255738.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4338340759277344, "sampling/importance_sampling_ratio/mean": 0.9999443292617798, "sampling/importance_sampling_ratio/min": 0.613062858581543, "sampling/sampling_logp_difference/max": 0.4892878532409668, "sampling/sampling_logp_difference/mean": 0.015585701912641525, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1148.0, "completions/max_terminated_length": 1148.0, "completions/mean_length": 336.890625, "completions/mean_terminated_length": 336.890625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.4514719843864441, "epoch": 0.9730392156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.024932817422293054, "kl": 0.022997815161943436, "learning_rate": 8.556877408358854e-07, "loss": 0.0002, "num_tokens": 34293379.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5284610986709595, "sampling/importance_sampling_ratio/mean": 1.000150203704834, "sampling/importance_sampling_ratio/min": 0.6439381241798401, "sampling/sampling_logp_difference/max": 0.440152645111084, "sampling/sampling_logp_difference/mean": 0.014653358608484268, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1037.0, "completions/max_terminated_length": 1037.0, "completions/mean_length": 494.609375, "completions/mean_terminated_length": 494.609375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.5638453960418701, "epoch": 0.9742647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.7124011084232115, "kl": 0.018564216792583466, "learning_rate": 8.551867111725182e-07, "loss": 0.0061, "num_tokens": 34341482.0, "reward": 0.59375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5154064893722534, "sampling/importance_sampling_ratio/mean": 1.0000834465026855, "sampling/importance_sampling_ratio/min": 0.6936931610107422, "sampling/sampling_logp_difference/max": 0.4156837463378906, "sampling/sampling_logp_difference/mean": 0.014492059126496315, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 316.15625, "completions/mean_terminated_length": 316.15625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.6049585938453674, "epoch": 0.9754901960784313, "frac_reward_zero_std": 1.0, "grad_norm": 0.019904880834604566, "kl": 0.03187735751271248, "learning_rate": 8.546849605026288e-07, "loss": 0.0003, "num_tokens": 34382836.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5600908994674683, "sampling/importance_sampling_ratio/mean": 1.0004396438598633, "sampling/importance_sampling_ratio/min": 0.6938838958740234, "sampling/sampling_logp_difference/max": 0.4447441101074219, "sampling/sampling_logp_difference/mean": 0.01796342432498932, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1455.0, "completions/max_terminated_length": 1455.0, "completions/mean_length": 449.515625, "completions/mean_terminated_length": 449.515625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.5281847715377808, "epoch": 0.9767156862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.012225324498660236, "kl": 0.01726653426885605, "learning_rate": 8.541824898447397e-07, "loss": 0.0002, "num_tokens": 34433381.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6142302751541138, "sampling/importance_sampling_ratio/mean": 0.9999728202819824, "sampling/importance_sampling_ratio/min": 0.5237794518470764, "sampling/sampling_logp_difference/max": 0.6466846466064453, "sampling/sampling_logp_difference/mean": 0.014855876564979553, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 344.984375, "completions/mean_terminated_length": 344.984375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.4672160744667053, "epoch": 0.9779411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.013293461640055508, "kl": 0.0207839272916317, "learning_rate": 8.536793002188343e-07, "loss": 0.0002, "num_tokens": 34473284.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.497794508934021, "sampling/importance_sampling_ratio/mean": 1.0000165700912476, "sampling/importance_sampling_ratio/min": 0.6170880794525146, "sampling/sampling_logp_difference/max": 0.482743501663208, "sampling/sampling_logp_difference/mean": 0.014504889957606792, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 408.46875, "completions/mean_terminated_length": 408.46875, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.5349903106689453, "epoch": 0.9791666666666666, "frac_reward_zero_std": 0.75, "grad_norm": 0.5110278669772659, "kl": 0.018368054181337357, "learning_rate": 8.531753926463556e-07, "loss": -0.0122, "num_tokens": 34524434.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.761712908744812, "sampling/importance_sampling_ratio/mean": 0.9999021291732788, "sampling/importance_sampling_ratio/min": 0.6274725794792175, "sampling/sampling_logp_difference/max": 0.566286563873291, "sampling/sampling_logp_difference/mean": 0.014641386456787586, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1472.0, "completions/max_terminated_length": 1472.0, "completions/mean_length": 538.140625, "completions/mean_terminated_length": 538.140625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.5497861504554749, "epoch": 0.9803921568627451, "frac_reward_zero_std": 0.25, "grad_norm": 1.2511907526828006, "kl": 0.01944294571876526, "learning_rate": 8.526707681502043e-07, "loss": 0.008, "num_tokens": 34587579.0, "reward": 0.875, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5442301034927368, "sampling/importance_sampling_ratio/mean": 1.0001311302185059, "sampling/importance_sampling_ratio/min": 0.4596956968307495, "sampling/sampling_logp_difference/max": 0.7771905660629272, "sampling/sampling_logp_difference/mean": 0.015164727345108986, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 419.140625, "completions/mean_terminated_length": 419.140625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.4435018301010132, "epoch": 0.9816176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.01219271840457557, "kl": 0.020672716200351715, "learning_rate": 8.521654277547361e-07, "loss": 0.0002, "num_tokens": 34634724.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3978151082992554, "sampling/importance_sampling_ratio/mean": 0.9997361898422241, "sampling/importance_sampling_ratio/min": 0.606693685054779, "sampling/sampling_logp_difference/max": 0.49973130226135254, "sampling/sampling_logp_difference/mean": 0.013806047849357128, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 716.0, "completions/max_terminated_length": 716.0, "completions/mean_length": 372.4375, "completions/mean_terminated_length": 372.4375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.4806109666824341, "epoch": 0.9828431372549019, "frac_reward_zero_std": 1.0, "grad_norm": 0.020982879876408796, "kl": 0.021084006875753403, "learning_rate": 8.516593724857597e-07, "loss": 0.0002, "num_tokens": 34676720.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4411864280700684, "sampling/importance_sampling_ratio/mean": 1.0000569820404053, "sampling/importance_sampling_ratio/min": 0.5295711159706116, "sampling/sampling_logp_difference/max": 0.6356878280639648, "sampling/sampling_logp_difference/mean": 0.015663012862205505, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1765.0, "completions/max_terminated_length": 1765.0, "completions/mean_length": 339.203125, "completions/mean_terminated_length": 339.203125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.5519985556602478, "epoch": 0.9840686274509803, "frac_reward_zero_std": 0.75, "grad_norm": 0.6115762107557831, "kl": 0.03167096897959709, "learning_rate": 8.511526033705356e-07, "loss": -0.0117, "num_tokens": 34716333.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5015382766723633, "sampling/importance_sampling_ratio/mean": 1.0000666379928589, "sampling/importance_sampling_ratio/min": 0.6169269680976868, "sampling/sampling_logp_difference/max": 0.4830045700073242, "sampling/sampling_logp_difference/mean": 0.016751505434513092, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1575.0, "completions/max_terminated_length": 1575.0, "completions/mean_length": 472.21875, "completions/mean_terminated_length": 472.21875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.46064493060112, "epoch": 0.9852941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.012719519195513356, "kl": 0.019233424216508865, "learning_rate": 8.506451214377728e-07, "loss": 0.0002, "num_tokens": 34763227.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5278314352035522, "sampling/importance_sampling_ratio/mean": 0.9998906850814819, "sampling/importance_sampling_ratio/min": 0.6396620273590088, "sampling/sampling_logp_difference/max": 0.4468154311180115, "sampling/sampling_logp_difference/mean": 0.014837637543678284, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 361.1875, "completions/mean_terminated_length": 361.1875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.3964069187641144, "epoch": 0.9865196078431373, "frac_reward_zero_std": 0.75, "grad_norm": 0.532722865599922, "kl": 0.021793823689222336, "learning_rate": 8.501369277176274e-07, "loss": -0.0231, "num_tokens": 34808743.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6554694175720215, "sampling/importance_sampling_ratio/mean": 1.0000495910644531, "sampling/importance_sampling_ratio/min": 0.4863824248313904, "sampling/sampling_logp_difference/max": 0.7207601070404053, "sampling/sampling_logp_difference/mean": 0.014344685710966587, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 313.140625, "completions/mean_terminated_length": 313.140625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.4241112172603607, "epoch": 0.9877450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.023425965047033117, "kl": 0.026911094784736633, "learning_rate": 8.496280232417007e-07, "loss": 0.0002, "num_tokens": 34852976.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5839177370071411, "sampling/importance_sampling_ratio/mean": 0.9994020462036133, "sampling/importance_sampling_ratio/min": 0.6094582080841064, "sampling/sampling_logp_difference/max": 0.49518489837646484, "sampling/sampling_logp_difference/mean": 0.01488051563501358, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 283.4375, "completions/mean_terminated_length": 283.4375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.4660641551017761, "epoch": 0.9889705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.01745895573427588, "kl": 0.0214752946048975, "learning_rate": 8.491184090430363e-07, "loss": 0.0002, "num_tokens": 34887468.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4720323085784912, "sampling/importance_sampling_ratio/mean": 1.0000641345977783, "sampling/importance_sampling_ratio/min": 0.6748349070549011, "sampling/sampling_logp_difference/max": 0.39328718185424805, "sampling/sampling_logp_difference/mean": 0.015778541564941406, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 342.78125, "completions/mean_terminated_length": 342.78125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.4341048002243042, "epoch": 0.9901960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.015258281812130134, "kl": 0.017839530482888222, "learning_rate": 8.48608086156119e-07, "loss": 0.0002, "num_tokens": 34929326.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2739661931991577, "sampling/importance_sampling_ratio/mean": 0.9999217987060547, "sampling/importance_sampling_ratio/min": 0.6058225035667419, "sampling/sampling_logp_difference/max": 0.5011682510375977, "sampling/sampling_logp_difference/mean": 0.014498371630907059, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1114.0, "completions/max_terminated_length": 1114.0, "completions/mean_length": 328.328125, "completions/mean_terminated_length": 328.328125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.4656296968460083, "epoch": 0.991421568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.49167309985225327, "kl": 0.02728867530822754, "learning_rate": 8.480970556168717e-07, "loss": -0.0021, "num_tokens": 34962403.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.587177038192749, "sampling/importance_sampling_ratio/mean": 1.0000154972076416, "sampling/importance_sampling_ratio/min": 0.6867961883544922, "sampling/sampling_logp_difference/max": 0.4619569778442383, "sampling/sampling_logp_difference/mean": 0.014931060373783112, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1836.0, "completions/max_terminated_length": 1836.0, "completions/mean_length": 508.484375, "completions/mean_terminated_length": 508.484375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.47584545612335205, "epoch": 0.9926470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.010183147366729487, "kl": 0.013545319437980652, "learning_rate": 8.47585318462654e-07, "loss": 0.0001, "num_tokens": 35015042.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.546833872795105, "sampling/importance_sampling_ratio/mean": 0.9999799728393555, "sampling/importance_sampling_ratio/min": 0.6771500110626221, "sampling/sampling_logp_difference/max": 0.43621015548706055, "sampling/sampling_logp_difference/mean": 0.013639264740049839, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1187.0, "completions/max_terminated_length": 1187.0, "completions/mean_length": 460.125, "completions/mean_terminated_length": 460.125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.5031360387802124, "epoch": 0.9938725490196079, "frac_reward_zero_std": 0.75, "grad_norm": 0.515717918780484, "kl": 0.016793442890048027, "learning_rate": 8.470728757322603e-07, "loss": -0.0089, "num_tokens": 35066042.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997833371162415, "sampling/importance_sampling_ratio/min": 0.6074450016021729, "sampling/sampling_logp_difference/max": 0.895805835723877, "sampling/sampling_logp_difference/mean": 0.014495819807052612, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 260.828125, "completions/mean_terminated_length": 260.828125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.3313244581222534, "epoch": 0.9950980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.016761641331149987, "kl": 0.029090698808431625, "learning_rate": 8.465597284659163e-07, "loss": 0.0002, "num_tokens": 35095935.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3958734273910522, "sampling/importance_sampling_ratio/mean": 0.9995677471160889, "sampling/importance_sampling_ratio/min": 0.6020660996437073, "sampling/sampling_logp_difference/max": 0.5073879957199097, "sampling/sampling_logp_difference/mean": 0.012608187273144722, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 898.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 363.8125, "completions/mean_terminated_length": 363.8125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.39672601222991943, "epoch": 0.9963235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01175761868632323, "kl": 0.018186356872320175, "learning_rate": 8.460458777052788e-07, "loss": 0.0002, "num_tokens": 35136851.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.799277901649475, "sampling/importance_sampling_ratio/mean": 0.9997471570968628, "sampling/importance_sampling_ratio/min": 0.5093982219696045, "sampling/sampling_logp_difference/max": 0.674525260925293, "sampling/sampling_logp_difference/mean": 0.013747948221862316, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 398.609375, "completions/mean_terminated_length": 398.609375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.5601762533187866, "epoch": 0.9975490196078431, "frac_reward_zero_std": 0.75, "grad_norm": 0.6047732066859605, "kl": 0.022629810497164726, "learning_rate": 8.455313244934324e-07, "loss": 0.0066, "num_tokens": 35182474.0, "reward": -0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": -0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.6018401384353638, "sampling/importance_sampling_ratio/mean": 0.9998610615730286, "sampling/importance_sampling_ratio/min": 0.59054034948349, "sampling/sampling_logp_difference/max": 0.5267172455787659, "sampling/sampling_logp_difference/mean": 0.014847964979708195, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 452.0, "completions/mean_terminated_length": 452.0, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.3934324085712433, "epoch": 0.9987745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.01018262219022322, "kl": 0.016980599611997604, "learning_rate": 8.450160698748871e-07, "loss": 0.0002, "num_tokens": 35227434.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3860242366790771, "sampling/importance_sampling_ratio/mean": 1.0000152587890625, "sampling/importance_sampling_ratio/min": 0.6234198212623596, "sampling/sampling_logp_difference/max": 0.4725351333618164, "sampling/sampling_logp_difference/mean": 0.013679477386176586, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 265.53125, "completions/mean_terminated_length": 265.53125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.48240184783935547, "epoch": 1.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.01675641756020913, "kl": 0.035414524376392365, "learning_rate": 8.445001148955775e-07, "loss": 0.0003, "num_tokens": 35259292.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4287195205688477, "sampling/importance_sampling_ratio/mean": 0.999833881855011, "sampling/importance_sampling_ratio/min": 0.6391898393630981, "sampling/sampling_logp_difference/max": 0.4475538730621338, "sampling/sampling_logp_difference/mean": 0.015231864526867867, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 305.734375, "completions/mean_terminated_length": 305.734375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.4987301230430603, "epoch": 1.0012254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.016368707135383423, "kl": 0.02424335479736328, "learning_rate": 8.439834606028593e-07, "loss": 0.0002, "num_tokens": 35298347.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3703854084014893, "sampling/importance_sampling_ratio/mean": 0.9993170499801636, "sampling/importance_sampling_ratio/min": 0.6029584407806396, "sampling/sampling_logp_difference/max": 0.5059070587158203, "sampling/sampling_logp_difference/mean": 0.015957478433847427, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 898.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 402.5, "completions/mean_terminated_length": 402.5, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.49397552013397217, "epoch": 1.0024509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 0.6030865456016474, "kl": 0.02434811368584633, "learning_rate": 8.434661080455082e-07, "loss": -0.0115, "num_tokens": 35343019.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.280001163482666, "sampling/importance_sampling_ratio/mean": 0.9997414350509644, "sampling/importance_sampling_ratio/min": 0.6971049308776855, "sampling/sampling_logp_difference/max": 0.36081933975219727, "sampling/sampling_logp_difference/mean": 0.01418485026806593, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 300.5625, "completions/mean_terminated_length": 300.5625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.4183977544307709, "epoch": 1.0036764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.6192046650747374, "kl": 0.031775712966918945, "learning_rate": 8.42948058273717e-07, "loss": 0.0352, "num_tokens": 35375247.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.6582053899765015, "sampling/importance_sampling_ratio/mean": 0.9999525547027588, "sampling/importance_sampling_ratio/min": 0.6196064352989197, "sampling/sampling_logp_difference/max": 0.5057358741760254, "sampling/sampling_logp_difference/mean": 0.014253491535782814, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 372.125, "completions/mean_terminated_length": 372.125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.4494644105434418, "epoch": 1.0049019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.036065631089798184, "kl": 0.02571011334657669, "learning_rate": 8.424293123390938e-07, "loss": 0.0002, "num_tokens": 35414695.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5427117347717285, "sampling/importance_sampling_ratio/mean": 0.999953031539917, "sampling/importance_sampling_ratio/min": 0.6279893517494202, "sampling/sampling_logp_difference/max": 0.46523213386535645, "sampling/sampling_logp_difference/mean": 0.01410834863781929, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 412.96875, "completions/mean_terminated_length": 412.96875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.5479364395141602, "epoch": 1.0061274509803921, "frac_reward_zero_std": 1.0, "grad_norm": 0.0209722118209267, "kl": 0.021950162947177887, "learning_rate": 8.4190987129466e-07, "loss": 0.0002, "num_tokens": 35459813.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4757169485092163, "sampling/importance_sampling_ratio/mean": 1.000272512435913, "sampling/importance_sampling_ratio/min": 0.5226291418075562, "sampling/sampling_logp_difference/max": 0.6488831043243408, "sampling/sampling_logp_difference/mean": 0.016717704012989998, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 397.6875, "completions/mean_terminated_length": 397.6875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.4508061707019806, "epoch": 1.0073529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.012431502543977888, "kl": 0.016288448125123978, "learning_rate": 8.413897361948483e-07, "loss": 0.0002, "num_tokens": 35502785.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.799373745918274, "sampling/importance_sampling_ratio/mean": 1.0000230073928833, "sampling/importance_sampling_ratio/min": 0.6815706491470337, "sampling/sampling_logp_difference/max": 0.587438702583313, "sampling/sampling_logp_difference/mean": 0.01388187799602747, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 364.40625, "completions/mean_terminated_length": 364.40625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.40032196044921875, "epoch": 1.008578431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.011357065937915153, "kl": 0.014658722095191479, "learning_rate": 8.408689080954997e-07, "loss": 0.0001, "num_tokens": 35547099.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3007348775863647, "sampling/importance_sampling_ratio/mean": 0.9998449087142944, "sampling/importance_sampling_ratio/min": 0.6335480809211731, "sampling/sampling_logp_difference/max": 0.4564194679260254, "sampling/sampling_logp_difference/mean": 0.012699559330940247, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 944.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 377.8125, "completions/mean_terminated_length": 377.8125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.41380536556243896, "epoch": 1.0098039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.01187764175157601, "kl": 0.018476031720638275, "learning_rate": 8.403473880538625e-07, "loss": 0.0002, "num_tokens": 35591359.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4559379816055298, "sampling/importance_sampling_ratio/mean": 0.9998773336410522, "sampling/importance_sampling_ratio/min": 0.6208937168121338, "sampling/sampling_logp_difference/max": 0.476595401763916, "sampling/sampling_logp_difference/mean": 0.013538211584091187, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1026.0, "completions/max_terminated_length": 1026.0, "completions/mean_length": 320.9375, "completions/mean_terminated_length": 320.9375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.47616416215896606, "epoch": 1.0110294117647058, "frac_reward_zero_std": 0.75, "grad_norm": 0.6885426928613942, "kl": 0.024915143847465515, "learning_rate": 8.398251771285892e-07, "loss": -0.0116, "num_tokens": 35635835.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.9138820171356201, "sampling/importance_sampling_ratio/mean": 0.9999698400497437, "sampling/importance_sampling_ratio/min": 0.6925370693206787, "sampling/sampling_logp_difference/max": 0.6491336822509766, "sampling/sampling_logp_difference/mean": 0.0151006318628788, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1100.0, "completions/max_terminated_length": 1100.0, "completions/mean_length": 400.3125, "completions/mean_terminated_length": 400.3125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.4541095197200775, "epoch": 1.0122549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.012121128309006761, "kl": 0.01972046121954918, "learning_rate": 8.393022763797346e-07, "loss": 0.0002, "num_tokens": 35676639.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4737224578857422, "sampling/importance_sampling_ratio/mean": 1.0004751682281494, "sampling/importance_sampling_ratio/min": 0.6476913690567017, "sampling/sampling_logp_difference/max": 0.4343409538269043, "sampling/sampling_logp_difference/mean": 0.014103256165981293, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 271.609375, "completions/mean_terminated_length": 271.609375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.4625665843486786, "epoch": 1.0134803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 0.7285864323716251, "kl": 0.04873106628656387, "learning_rate": 8.387786868687548e-07, "loss": 0.0439, "num_tokens": 35705942.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.412795066833496, "sampling/importance_sampling_ratio/mean": 1.00007963180542, "sampling/importance_sampling_ratio/min": 0.7095739245414734, "sampling/sampling_logp_difference/max": 0.34557008743286133, "sampling/sampling_logp_difference/mean": 0.015058880671858788, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1857.0, "completions/max_terminated_length": 1857.0, "completions/mean_length": 468.0625, "completions/mean_terminated_length": 468.0625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.5597566366195679, "epoch": 1.0147058823529411, "frac_reward_zero_std": 0.75, "grad_norm": 0.6200492649134131, "kl": 0.02291674166917801, "learning_rate": 8.382544096585026e-07, "loss": 0.0212, "num_tokens": 35751162.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6182817220687866, "sampling/importance_sampling_ratio/mean": 1.0002772808074951, "sampling/importance_sampling_ratio/min": 0.47059696912765503, "sampling/sampling_logp_difference/max": 0.7537532448768616, "sampling/sampling_logp_difference/mean": 0.016573898494243622, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 344.921875, "completions/mean_terminated_length": 344.921875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.4740946292877197, "epoch": 1.0159313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.017227604438527173, "kl": 0.022538714110851288, "learning_rate": 8.37729445813228e-07, "loss": 0.0002, "num_tokens": 35793413.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001556873321533, "sampling/importance_sampling_ratio/min": 0.34088990092277527, "sampling/sampling_logp_difference/max": 1.0761957168579102, "sampling/sampling_logp_difference/mean": 0.01445299107581377, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 373.03125, "completions/mean_terminated_length": 373.03125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.42983734607696533, "epoch": 1.017156862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.014081525927589033, "kl": 0.019020304083824158, "learning_rate": 8.372037963985741e-07, "loss": 0.0002, "num_tokens": 35840423.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.655460000038147, "sampling/importance_sampling_ratio/mean": 0.9997948408126831, "sampling/importance_sampling_ratio/min": 0.432391881942749, "sampling/sampling_logp_difference/max": 0.8384230136871338, "sampling/sampling_logp_difference/mean": 0.014495404437184334, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1265.0, "completions/max_terminated_length": 1265.0, "completions/mean_length": 478.109375, "completions/mean_terminated_length": 478.109375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.37262606620788574, "epoch": 1.0183823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.6493578837995234, "kl": 0.018630554899573326, "learning_rate": 8.366774624815761e-07, "loss": -0.0313, "num_tokens": 35892014.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5278353691101074, "sampling/importance_sampling_ratio/mean": 1.0000782012939453, "sampling/importance_sampling_ratio/min": 0.6147372722625732, "sampling/sampling_logp_difference/max": 0.4865603446960449, "sampling/sampling_logp_difference/mean": 0.011993379332125187, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 277.921875, "completions/mean_terminated_length": 277.921875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.4105915129184723, "epoch": 1.0196078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.019697483875101042, "kl": 0.021748226135969162, "learning_rate": 8.361504451306584e-07, "loss": 0.0002, "num_tokens": 35932041.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8775612115859985, "sampling/importance_sampling_ratio/mean": 0.999927282333374, "sampling/importance_sampling_ratio/min": 0.6174290776252747, "sampling/sampling_logp_difference/max": 0.6299736499786377, "sampling/sampling_logp_difference/mean": 0.014155371114611626, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 367.5625, "completions/mean_terminated_length": 367.5625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.4926304221153259, "epoch": 1.0208333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.012711600140360017, "kl": 0.02093254029750824, "learning_rate": 8.356227454156328e-07, "loss": 0.0002, "num_tokens": 35971469.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.548194169998169, "sampling/importance_sampling_ratio/mean": 1.0002301931381226, "sampling/importance_sampling_ratio/min": 0.7034939527511597, "sampling/sampling_logp_difference/max": 0.437089204788208, "sampling/sampling_logp_difference/mean": 0.01614668220281601, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 985.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 327.65625, "completions/mean_terminated_length": 327.65625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.469058632850647, "epoch": 1.0220588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 0.6629405781644633, "kl": 0.02095085009932518, "learning_rate": 8.350943644076964e-07, "loss": -0.0295, "num_tokens": 36008855.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4578101634979248, "sampling/importance_sampling_ratio/mean": 0.9999817609786987, "sampling/importance_sampling_ratio/min": 0.2191912680864334, "sampling/sampling_logp_difference/max": 1.517810583114624, "sampling/sampling_logp_difference/mean": 0.014413051307201385, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 417.234375, "completions/mean_terminated_length": 417.234375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.35443413257598877, "epoch": 1.0232843137254901, "frac_reward_zero_std": 1.0, "grad_norm": 0.010485269984056791, "kl": 0.014154710806906223, "learning_rate": 8.34565303179429e-07, "loss": 0.0001, "num_tokens": 36050950.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6016755104064941, "sampling/importance_sampling_ratio/mean": 1.000335931777954, "sampling/importance_sampling_ratio/min": 0.7147842049598694, "sampling/sampling_logp_difference/max": 0.4710502624511719, "sampling/sampling_logp_difference/mean": 0.011959867551922798, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 915.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 437.515625, "completions/mean_terminated_length": 437.515625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.5473031997680664, "epoch": 1.0245098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.014600211104638189, "kl": 0.02786817029118538, "learning_rate": 8.340355628047917e-07, "loss": 0.0002, "num_tokens": 36097831.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.566723108291626, "sampling/importance_sampling_ratio/mean": 0.9997404217720032, "sampling/importance_sampling_ratio/min": 0.6287466287612915, "sampling/sampling_logp_difference/max": 0.46402692794799805, "sampling/sampling_logp_difference/mean": 0.015844102948904037, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 902.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 371.265625, "completions/mean_terminated_length": 371.265625, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.701164722442627, "epoch": 1.025735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.016220472175500083, "kl": 0.026872701942920685, "learning_rate": 8.335051443591234e-07, "loss": 0.0003, "num_tokens": 36140712.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5476317405700684, "sampling/importance_sampling_ratio/mean": 1.000091552734375, "sampling/importance_sampling_ratio/min": 0.7019948959350586, "sampling/sampling_logp_difference/max": 0.4367258548736572, "sampling/sampling_logp_difference/mean": 0.01877165213227272, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 367.828125, "completions/mean_terminated_length": 367.828125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.36331313848495483, "epoch": 1.0269607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.010685455110671219, "kl": 0.015179133042693138, "learning_rate": 8.329740489191405e-07, "loss": 0.0001, "num_tokens": 36179869.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6207282543182373, "sampling/importance_sampling_ratio/mean": 1.0002416372299194, "sampling/importance_sampling_ratio/min": 0.6077021956443787, "sampling/sampling_logp_difference/max": 0.49807024002075195, "sampling/sampling_logp_difference/mean": 0.011475006118416786, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 483.125, "completions/mean_terminated_length": 483.125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.4659738540649414, "epoch": 1.0281862745098038, "frac_reward_zero_std": 0.75, "grad_norm": 0.5382537533476007, "kl": 0.021947693079710007, "learning_rate": 8.324422775629327e-07, "loss": -0.0087, "num_tokens": 36233029.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6968222856521606, "sampling/importance_sampling_ratio/mean": 1.0002154111862183, "sampling/importance_sampling_ratio/min": 0.6501498818397522, "sampling/sampling_logp_difference/max": 0.5287572145462036, "sampling/sampling_logp_difference/mean": 0.013145716860890388, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 311.25, "completions/mean_terminated_length": 311.25, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.4122235178947449, "epoch": 1.0294117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.013291115889266271, "kl": 0.022669076919555664, "learning_rate": 8.319098313699624e-07, "loss": 0.0002, "num_tokens": 36272373.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.577401876449585, "sampling/importance_sampling_ratio/mean": 0.9999696612358093, "sampling/importance_sampling_ratio/min": 0.6905845403671265, "sampling/sampling_logp_difference/max": 0.4557790756225586, "sampling/sampling_logp_difference/mean": 0.013853969052433968, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 949.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 443.046875, "completions/mean_terminated_length": 443.046875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.40403634309768677, "epoch": 1.0306372549019607, "frac_reward_zero_std": 1.0, "grad_norm": 0.012302837885924796, "kl": 0.016302479431033134, "learning_rate": 8.313767114210615e-07, "loss": 0.0002, "num_tokens": 36327992.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4493327140808105, "sampling/importance_sampling_ratio/mean": 1.0000079870224, "sampling/importance_sampling_ratio/min": 0.6931122541427612, "sampling/sampling_logp_difference/max": 0.37110328674316406, "sampling/sampling_logp_difference/mean": 0.012042234651744366, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 369.8125, "completions/mean_terminated_length": 369.8125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.40932318568229675, "epoch": 1.031862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.02595775131092835, "kl": 0.03158611059188843, "learning_rate": 8.308429187984298e-07, "loss": 0.0003, "num_tokens": 36366380.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5680538415908813, "sampling/importance_sampling_ratio/mean": 1.0000011920928955, "sampling/importance_sampling_ratio/min": 0.6130444407463074, "sampling/sampling_logp_difference/max": 0.4893178939819336, "sampling/sampling_logp_difference/mean": 0.013050958514213562, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1509.0, "completions/max_terminated_length": 1509.0, "completions/mean_length": 533.140625, "completions/mean_terminated_length": 533.140625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.4066576659679413, "epoch": 1.0330882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.011834910114630786, "kl": 0.015388630330562592, "learning_rate": 8.303084545856322e-07, "loss": 0.0001, "num_tokens": 36426677.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5971640348434448, "sampling/importance_sampling_ratio/mean": 1.0000064373016357, "sampling/importance_sampling_ratio/min": 0.5376309752464294, "sampling/sampling_logp_difference/max": 0.6205828189849854, "sampling/sampling_logp_difference/mean": 0.011414074338972569, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1070.0, "completions/max_terminated_length": 1070.0, "completions/mean_length": 449.28125, "completions/mean_terminated_length": 449.28125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.48656803369522095, "epoch": 1.0343137254901962, "frac_reward_zero_std": 1.0, "grad_norm": 0.05943471609550393, "kl": 0.020146511495113373, "learning_rate": 8.297733198675977e-07, "loss": 0.0002, "num_tokens": 36477255.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4393980503082275, "sampling/importance_sampling_ratio/mean": 0.9999526739120483, "sampling/importance_sampling_ratio/min": 0.6484747529029846, "sampling/sampling_logp_difference/max": 0.4331321716308594, "sampling/sampling_logp_difference/mean": 0.014281999319791794, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1139.0, "completions/max_terminated_length": 1139.0, "completions/mean_length": 414.8125, "completions/mean_terminated_length": 414.8125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.5331407785415649, "epoch": 1.0355392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.012525863490262362, "kl": 0.019666114822030067, "learning_rate": 8.292375157306155e-07, "loss": 0.0002, "num_tokens": 36523739.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5314794778823853, "sampling/importance_sampling_ratio/mean": 0.999993085861206, "sampling/importance_sampling_ratio/min": 0.6701066493988037, "sampling/sampling_logp_difference/max": 0.42623424530029297, "sampling/sampling_logp_difference/mean": 0.014918945729732513, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 353.0, "completions/mean_terminated_length": 353.0, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.5159058570861816, "epoch": 1.036764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.5220719316071455, "kl": 0.03352625295519829, "learning_rate": 8.287010432623343e-07, "loss": 0.0299, "num_tokens": 36561931.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.9512150287628174, "sampling/importance_sampling_ratio/mean": 1.000083327293396, "sampling/importance_sampling_ratio/min": 0.6933680772781372, "sampling/sampling_logp_difference/max": 0.668452262878418, "sampling/sampling_logp_difference/mean": 0.015339499339461327, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 775.0, "completions/max_terminated_length": 775.0, "completions/mean_length": 319.625, "completions/mean_terminated_length": 319.625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.4472338557243347, "epoch": 1.0379901960784315, "frac_reward_zero_std": 1.0, "grad_norm": 0.015392327770479393, "kl": 0.023647285997867584, "learning_rate": 8.281639035517591e-07, "loss": 0.0002, "num_tokens": 36596771.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.623776912689209, "sampling/importance_sampling_ratio/mean": 1.0002211332321167, "sampling/importance_sampling_ratio/min": 0.6626731157302856, "sampling/sampling_logp_difference/max": 0.4847548007965088, "sampling/sampling_logp_difference/mean": 0.015304419212043285, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 362.359375, "completions/mean_terminated_length": 362.359375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.42755377292633057, "epoch": 1.0392156862745099, "frac_reward_zero_std": 1.0, "grad_norm": 0.012449017574473097, "kl": 0.023968011140823364, "learning_rate": 8.276260976892495e-07, "loss": 0.0002, "num_tokens": 36642570.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4523426294326782, "sampling/importance_sampling_ratio/mean": 1.0001201629638672, "sampling/importance_sampling_ratio/min": 0.6174314022064209, "sampling/sampling_logp_difference/max": 0.48218727111816406, "sampling/sampling_logp_difference/mean": 0.013769105076789856, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 338.28125, "completions/mean_terminated_length": 338.28125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.5090155005455017, "epoch": 1.0404411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.014375522357879585, "kl": 0.02145942859351635, "learning_rate": 8.270876267665173e-07, "loss": 0.0002, "num_tokens": 36684828.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6474897861480713, "sampling/importance_sampling_ratio/mean": 1.000279426574707, "sampling/importance_sampling_ratio/min": 0.6883040070533752, "sampling/sampling_logp_difference/max": 0.4992527961730957, "sampling/sampling_logp_difference/mean": 0.015276769176125526, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 364.71875, "completions/mean_terminated_length": 364.71875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.454105019569397, "epoch": 1.0416666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.012605976263520027, "kl": 0.01959565281867981, "learning_rate": 8.265484918766242e-07, "loss": 0.0002, "num_tokens": 36721354.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.7422443628311157, "sampling/importance_sampling_ratio/mean": 0.9999165534973145, "sampling/importance_sampling_ratio/min": 0.2979317605495453, "sampling/sampling_logp_difference/max": 1.210890769958496, "sampling/sampling_logp_difference/mean": 0.01445569284260273, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 362.9375, "completions/mean_terminated_length": 362.9375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.4856789708137512, "epoch": 1.0428921568627452, "frac_reward_zero_std": 1.0, "grad_norm": 0.013451084741635644, "kl": 0.018137648701667786, "learning_rate": 8.260086941139804e-07, "loss": 0.0002, "num_tokens": 36767894.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4282795190811157, "sampling/importance_sampling_ratio/mean": 0.9999197125434875, "sampling/importance_sampling_ratio/min": 0.5990440249443054, "sampling/sampling_logp_difference/max": 0.5124201774597168, "sampling/sampling_logp_difference/mean": 0.01418228168040514, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 325.9375, "completions/mean_terminated_length": 325.9375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.676593542098999, "epoch": 1.0441176470588236, "frac_reward_zero_std": 0.25, "grad_norm": 1.0832132480780228, "kl": 0.04200778901576996, "learning_rate": 8.254682345743405e-07, "loss": 0.0371, "num_tokens": 36805346.0, "reward": 0.65625, "reward_std": 0.5539814233779907, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.4234087467193604, "sampling/importance_sampling_ratio/mean": 1.0000464916229248, "sampling/importance_sampling_ratio/min": 0.6774286031723022, "sampling/sampling_logp_difference/max": 0.38945114612579346, "sampling/sampling_logp_difference/mean": 0.01866847462952137, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1372.0, "completions/max_terminated_length": 1372.0, "completions/mean_length": 511.5, "completions/mean_terminated_length": 511.5, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.48972129821777344, "epoch": 1.045343137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.49944105800418714, "kl": 0.01757653057575226, "learning_rate": 8.249271143548036e-07, "loss": 0.0139, "num_tokens": 36858066.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.503455638885498, "sampling/importance_sampling_ratio/mean": 1.000135064125061, "sampling/importance_sampling_ratio/min": 0.6053248047828674, "sampling/sampling_logp_difference/max": 0.5019900798797607, "sampling/sampling_logp_difference/mean": 0.012987576425075531, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1940.0, "completions/max_terminated_length": 1940.0, "completions/mean_length": 516.171875, "completions/mean_terminated_length": 516.171875, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.5231927633285522, "epoch": 1.0465686274509804, "frac_reward_zero_std": 0.5, "grad_norm": 0.6775332950517056, "kl": 0.02581137977540493, "learning_rate": 8.243853345538093e-07, "loss": -0.0233, "num_tokens": 36913837.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4706839323043823, "sampling/importance_sampling_ratio/mean": 1.0000205039978027, "sampling/importance_sampling_ratio/min": 0.5195791721343994, "sampling/sampling_logp_difference/max": 0.6547360420227051, "sampling/sampling_logp_difference/mean": 0.01477834116667509, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1074.0, "completions/max_terminated_length": 1074.0, "completions/mean_length": 370.234375, "completions/mean_terminated_length": 370.234375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.45566508173942566, "epoch": 1.0477941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.014381737825803643, "kl": 0.01859443075954914, "learning_rate": 8.238428962711362e-07, "loss": 0.0002, "num_tokens": 36954844.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4061518907546997, "sampling/importance_sampling_ratio/mean": 0.9996511340141296, "sampling/importance_sampling_ratio/min": 0.662319540977478, "sampling/sampling_logp_difference/max": 0.41200709342956543, "sampling/sampling_logp_difference/mean": 0.01324477419257164, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 929.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 454.5, "completions/mean_terminated_length": 454.5, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.5846160650253296, "epoch": 1.0490196078431373, "frac_reward_zero_std": 0.75, "grad_norm": 0.5430349105445419, "kl": 0.023290738463401794, "learning_rate": 8.232998006078997e-07, "loss": 0.0129, "num_tokens": 37004108.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.597166657447815, "sampling/importance_sampling_ratio/mean": 0.9996073246002197, "sampling/importance_sampling_ratio/min": 0.6357550621032715, "sampling/sampling_logp_difference/max": 0.468231201171875, "sampling/sampling_logp_difference/mean": 0.015723854303359985, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 742.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 331.125, "completions/mean_terminated_length": 331.125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.36586248874664307, "epoch": 1.0502450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.010943010037780353, "kl": 0.019169632345438004, "learning_rate": 8.227560486665498e-07, "loss": 0.0002, "num_tokens": 37042452.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3198637962341309, "sampling/importance_sampling_ratio/mean": 0.9996393918991089, "sampling/importance_sampling_ratio/min": 0.6484593152999878, "sampling/sampling_logp_difference/max": 0.43315601348876953, "sampling/sampling_logp_difference/mean": 0.012517621740698814, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1213.0, "completions/max_terminated_length": 1213.0, "completions/mean_length": 428.265625, "completions/mean_terminated_length": 428.265625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.5629438757896423, "epoch": 1.0514705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.014357681245602508, "kl": 0.028896110132336617, "learning_rate": 8.222116415508682e-07, "loss": 0.0002, "num_tokens": 37085813.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5112817287445068, "sampling/importance_sampling_ratio/mean": 1.000229835510254, "sampling/importance_sampling_ratio/min": 0.5389931201934814, "sampling/sampling_logp_difference/max": 0.6180524826049805, "sampling/sampling_logp_difference/mean": 0.016544140875339508, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 335.1875, "completions/mean_terminated_length": 335.1875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.41577181220054626, "epoch": 1.0526960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.019415795061468778, "kl": 0.022818922996520996, "learning_rate": 8.21666580365967e-07, "loss": 0.0002, "num_tokens": 37130865.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001976490020752, "sampling/importance_sampling_ratio/min": 0.5758025050163269, "sampling/sampling_logp_difference/max": 0.9459843635559082, "sampling/sampling_logp_difference/mean": 0.013468477874994278, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 389.375, "completions/mean_terminated_length": 389.375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.5921446084976196, "epoch": 1.053921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.01399150727488807, "kl": 0.028352342545986176, "learning_rate": 8.211208662182858e-07, "loss": 0.0002, "num_tokens": 37176905.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6207653284072876, "sampling/importance_sampling_ratio/mean": 0.9999873638153076, "sampling/importance_sampling_ratio/min": 0.5830851197242737, "sampling/sampling_logp_difference/max": 0.5394220352172852, "sampling/sampling_logp_difference/mean": 0.016944658011198044, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1258.0, "completions/max_terminated_length": 1258.0, "completions/mean_length": 382.28125, "completions/mean_terminated_length": 382.28125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.5611501336097717, "epoch": 1.0551470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.014563892277017784, "kl": 0.027144744992256165, "learning_rate": 8.205745002155899e-07, "loss": 0.0002, "num_tokens": 37219819.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3625625371932983, "sampling/importance_sampling_ratio/mean": 0.999889075756073, "sampling/importance_sampling_ratio/min": 0.6577969789505005, "sampling/sampling_logp_difference/max": 0.4188588857650757, "sampling/sampling_logp_difference/mean": 0.015938464552164078, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 349.90625, "completions/mean_terminated_length": 349.90625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.4704465866088867, "epoch": 1.0563725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.018164379804906598, "kl": 0.020080573856830597, "learning_rate": 8.200274834669675e-07, "loss": 0.0002, "num_tokens": 37257477.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4352002143859863, "sampling/importance_sampling_ratio/mean": 1.0001873970031738, "sampling/importance_sampling_ratio/min": 0.6544366478919983, "sampling/sampling_logp_difference/max": 0.4239804744720459, "sampling/sampling_logp_difference/mean": 0.013853654265403748, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 409.40625, "completions/mean_terminated_length": 409.40625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.46995168924331665, "epoch": 1.0575980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.012287581753829538, "kl": 0.016538305208086967, "learning_rate": 8.194798170828279e-07, "loss": 0.0002, "num_tokens": 37303583.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.528493046760559, "sampling/importance_sampling_ratio/mean": 1.0000345706939697, "sampling/importance_sampling_ratio/min": 0.7181244492530823, "sampling/sampling_logp_difference/max": 0.4242823123931885, "sampling/sampling_logp_difference/mean": 0.014643233269453049, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1067.0, "completions/max_terminated_length": 1067.0, "completions/mean_length": 424.78125, "completions/mean_terminated_length": 424.78125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.39203596115112305, "epoch": 1.0588235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.5709381253569925, "kl": 0.016624227166175842, "learning_rate": 8.189315021748993e-07, "loss": -0.0066, "num_tokens": 37347633.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5453099012374878, "sampling/importance_sampling_ratio/mean": 1.0001564025878906, "sampling/importance_sampling_ratio/min": 0.6695212125778198, "sampling/sampling_logp_difference/max": 0.4352245330810547, "sampling/sampling_logp_difference/mean": 0.012517517432570457, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3618.0, "completions/max_terminated_length": 3618.0, "completions/mean_length": 713.375, "completions/mean_terminated_length": 713.375, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.5445233583450317, "epoch": 1.0600490196078431, "frac_reward_zero_std": 0.75, "grad_norm": 0.30967159257410243, "kl": 0.021849237382411957, "learning_rate": 8.183825398562263e-07, "loss": -0.0015, "num_tokens": 37411081.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4754436016082764, "sampling/importance_sampling_ratio/mean": 0.999617874622345, "sampling/importance_sampling_ratio/min": 0.48325493931770325, "sampling/sampling_logp_difference/max": 0.7272109985351562, "sampling/sampling_logp_difference/mean": 0.01467932015657425, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 812.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 283.15625, "completions/mean_terminated_length": 283.15625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.5011054277420044, "epoch": 1.0612745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0375432615882682, "kl": 0.032709185034036636, "learning_rate": 8.178329312411676e-07, "loss": 0.0003, "num_tokens": 37447587.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3498896360397339, "sampling/importance_sampling_ratio/mean": 0.9994264841079712, "sampling/importance_sampling_ratio/min": 0.6087914705276489, "sampling/sampling_logp_difference/max": 0.4962794780731201, "sampling/sampling_logp_difference/mean": 0.015032051131129265, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 351.40625, "completions/mean_terminated_length": 351.40625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.5707926750183105, "epoch": 1.0625, "frac_reward_zero_std": 0.75, "grad_norm": 0.5182444136156701, "kl": 0.028838731348514557, "learning_rate": 8.172826774453936e-07, "loss": -0.036, "num_tokens": 37482413.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.3735285997390747, "sampling/importance_sampling_ratio/mean": 0.9996201992034912, "sampling/importance_sampling_ratio/min": 0.5382956266403198, "sampling/sampling_logp_difference/max": 0.6193474531173706, "sampling/sampling_logp_difference/mean": 0.01599281281232834, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1084.0, "completions/max_terminated_length": 1084.0, "completions/mean_length": 411.71875, "completions/mean_terminated_length": 411.71875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.6203360557556152, "epoch": 1.0637254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 0.7564307834284638, "kl": 0.023432517424225807, "learning_rate": 8.16731779585885e-07, "loss": -0.0732, "num_tokens": 37533371.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004048347473145, "sampling/importance_sampling_ratio/min": 0.5538867712020874, "sampling/sampling_logp_difference/max": 0.7156343460083008, "sampling/sampling_logp_difference/mean": 0.017641521990299225, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 423.9375, "completions/mean_terminated_length": 423.9375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.47536951303482056, "epoch": 1.0649509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.011179005917295199, "kl": 0.015664702281355858, "learning_rate": 8.161802387809292e-07, "loss": 0.0002, "num_tokens": 37578791.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.597170114517212, "sampling/importance_sampling_ratio/mean": 1.0000163316726685, "sampling/importance_sampling_ratio/min": 0.5076878070831299, "sampling/sampling_logp_difference/max": 0.6778886318206787, "sampling/sampling_logp_difference/mean": 0.014871329069137573, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1703.0, "completions/max_terminated_length": 1703.0, "completions/mean_length": 586.46875, "completions/mean_terminated_length": 586.46875, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "entropy": 0.48970508575439453, "epoch": 1.0661764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.010787845033768971, "kl": 0.01567646861076355, "learning_rate": 8.156280561501194e-07, "loss": 0.0002, "num_tokens": 37639189.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000684261322021, "sampling/importance_sampling_ratio/min": 0.6880142688751221, "sampling/sampling_logp_difference/max": 0.847126841545105, "sampling/sampling_logp_difference/mean": 0.013899365440011024, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 416.296875, "completions/mean_terminated_length": 416.296875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.4785991609096527, "epoch": 1.0674019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.015126898979559388, "kl": 0.020525505766272545, "learning_rate": 8.150752328143513e-07, "loss": 0.0002, "num_tokens": 37686344.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999615550041199, "sampling/importance_sampling_ratio/min": 0.6152343153953552, "sampling/sampling_logp_difference/max": 0.7365431785583496, "sampling/sampling_logp_difference/mean": 0.014830423519015312, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1202.0, "completions/max_terminated_length": 1202.0, "completions/mean_length": 526.40625, "completions/mean_terminated_length": 526.40625, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.5568031072616577, "epoch": 1.0686274509803921, "frac_reward_zero_std": 1.0, "grad_norm": 0.012731195802153226, "kl": 0.017582394182682037, "learning_rate": 8.145217698958211e-07, "loss": 0.0002, "num_tokens": 37737170.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.8375439643859863, "sampling/importance_sampling_ratio/mean": 1.0000368356704712, "sampling/importance_sampling_ratio/min": 0.5243291854858398, "sampling/sampling_logp_difference/max": 0.6456356048583984, "sampling/sampling_logp_difference/mean": 0.015001557767391205, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 365.25, "completions/mean_terminated_length": 365.25, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.39887675642967224, "epoch": 1.0698529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.009611889941187802, "kl": 0.01571408286690712, "learning_rate": 8.139676685180236e-07, "loss": 0.0001, "num_tokens": 37777138.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.493390679359436, "sampling/importance_sampling_ratio/mean": 0.9998472929000854, "sampling/importance_sampling_ratio/min": 0.6883745789527893, "sampling/sampling_logp_difference/max": 0.4010491371154785, "sampling/sampling_logp_difference/mean": 0.012456140480935574, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 375.390625, "completions/mean_terminated_length": 375.390625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.38861963152885437, "epoch": 1.071078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.01553422992331984, "kl": 0.01890058070421219, "learning_rate": 8.134129298057495e-07, "loss": 0.0002, "num_tokens": 37819003.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.674485206604004, "sampling/importance_sampling_ratio/mean": 0.999923825263977, "sampling/importance_sampling_ratio/min": 0.5632240176200867, "sampling/sampling_logp_difference/max": 0.574077844619751, "sampling/sampling_logp_difference/mean": 0.012619503773748875, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 324.96875, "completions/mean_terminated_length": 324.96875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.487202525138855, "epoch": 1.0723039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.012356151636831385, "kl": 0.017236681655049324, "learning_rate": 8.128575548850832e-07, "loss": 0.0002, "num_tokens": 37855289.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5251429080963135, "sampling/importance_sampling_ratio/mean": 0.9999264478683472, "sampling/importance_sampling_ratio/min": 0.7094698548316956, "sampling/sampling_logp_difference/max": 0.4220881462097168, "sampling/sampling_logp_difference/mean": 0.013969069346785545, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1804.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 487.0, "completions/mean_terminated_length": 487.0, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.4617059528827667, "epoch": 1.0735294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.011270508045320708, "kl": 0.01935384050011635, "learning_rate": 8.123015448834005e-07, "loss": 0.0002, "num_tokens": 37905641.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5905044078826904, "sampling/importance_sampling_ratio/mean": 1.000331163406372, "sampling/importance_sampling_ratio/min": 0.6299557089805603, "sampling/sampling_logp_difference/max": 0.4640512466430664, "sampling/sampling_logp_difference/mean": 0.014118541032075882, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 356.8125, "completions/mean_terminated_length": 356.8125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.4555931091308594, "epoch": 1.0747549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 0.5067962783001283, "kl": 0.024681556969881058, "learning_rate": 8.117449009293668e-07, "loss": 0.0033, "num_tokens": 37943533.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.2997187376022339, "sampling/importance_sampling_ratio/mean": 1.000197172164917, "sampling/importance_sampling_ratio/min": 0.4269721210002899, "sampling/sampling_logp_difference/max": 0.851036548614502, "sampling/sampling_logp_difference/mean": 0.013757619075477123, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1620.0, "completions/max_terminated_length": 1620.0, "completions/mean_length": 428.015625, "completions/mean_terminated_length": 428.015625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.4784441292285919, "epoch": 1.0759803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 0.6891979139770266, "kl": 0.02141539379954338, "learning_rate": 8.111876241529337e-07, "loss": 0.0237, "num_tokens": 37988542.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.515368938446045, "sampling/importance_sampling_ratio/mean": 0.9996261596679688, "sampling/importance_sampling_ratio/min": 0.627615749835968, "sampling/sampling_logp_difference/max": 0.46582722663879395, "sampling/sampling_logp_difference/mean": 0.014419437386095524, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1549.0, "completions/max_terminated_length": 1549.0, "completions/mean_length": 413.765625, "completions/mean_terminated_length": 413.765625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.6027151346206665, "epoch": 1.0772058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.7311464628010919, "kl": 0.02985946647822857, "learning_rate": 8.106297156853379e-07, "loss": -0.0033, "num_tokens": 38030719.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.5757619142532349, "sampling/importance_sampling_ratio/mean": 0.9998623132705688, "sampling/importance_sampling_ratio/min": 0.6941671967506409, "sampling/sampling_logp_difference/max": 0.4547388553619385, "sampling/sampling_logp_difference/mean": 0.016571931540966034, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 491.46875, "completions/mean_terminated_length": 491.46875, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "entropy": 0.5273478031158447, "epoch": 1.0784313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.4140035817370451, "kl": 0.017141733318567276, "learning_rate": 8.100711766590982e-07, "loss": -0.0087, "num_tokens": 38080525.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.7667073011398315, "sampling/importance_sampling_ratio/mean": 1.0001102685928345, "sampling/importance_sampling_ratio/min": 0.613068699836731, "sampling/sampling_logp_difference/max": 0.569117546081543, "sampling/sampling_logp_difference/mean": 0.015179449692368507, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 334.453125, "completions/mean_terminated_length": 334.453125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.39014196395874023, "epoch": 1.079656862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.017791508190589046, "kl": 0.02341562509536743, "learning_rate": 8.095120082080134e-07, "loss": 0.0002, "num_tokens": 38117578.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.53742253780365, "sampling/importance_sampling_ratio/mean": 0.9994849562644958, "sampling/importance_sampling_ratio/min": 0.6262629628181458, "sampling/sampling_logp_difference/max": 0.4679849147796631, "sampling/sampling_logp_difference/mean": 0.013856976293027401, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1372.0, "completions/max_terminated_length": 1372.0, "completions/mean_length": 557.96875, "completions/mean_terminated_length": 557.96875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.540973424911499, "epoch": 1.0808823529411764, "frac_reward_zero_std": 0.75, "grad_norm": 0.4613892724649675, "kl": 0.02042776718735695, "learning_rate": 8.089522114671602e-07, "loss": 0.022, "num_tokens": 38175016.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.49869704246521, "sampling/importance_sampling_ratio/mean": 1.0000042915344238, "sampling/importance_sampling_ratio/min": 0.07787959277629852, "sampling/sampling_logp_difference/max": 2.552591323852539, "sampling/sampling_logp_difference/mean": 0.015390649437904358, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 380.3125, "completions/mean_terminated_length": 380.3125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.5096001625061035, "epoch": 1.0821078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.01365264994764651, "kl": 0.01828480325639248, "learning_rate": 8.083917875728905e-07, "loss": 0.0002, "num_tokens": 38219228.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5744246244430542, "sampling/importance_sampling_ratio/mean": 1.0000827312469482, "sampling/importance_sampling_ratio/min": 0.6796666979789734, "sampling/sampling_logp_difference/max": 0.4538898468017578, "sampling/sampling_logp_difference/mean": 0.015438522212207317, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 422.1875, "completions/mean_terminated_length": 422.1875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.5254069566726685, "epoch": 1.0833333333333333, "frac_reward_zero_std": 0.75, "grad_norm": 0.533293962610377, "kl": 0.02293827198445797, "learning_rate": 8.07830737662829e-07, "loss": 0.0168, "num_tokens": 38265336.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.7217987775802612, "sampling/importance_sampling_ratio/mean": 1.0001024007797241, "sampling/importance_sampling_ratio/min": 0.4982428550720215, "sampling/sampling_logp_difference/max": 0.6966676712036133, "sampling/sampling_logp_difference/mean": 0.015350284054875374, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 327.828125, "completions/mean_terminated_length": 327.828125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.46041902899742126, "epoch": 1.0845588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.011847461223383586, "kl": 0.016426920890808105, "learning_rate": 8.072690628758721e-07, "loss": 0.0002, "num_tokens": 38304205.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4489175081253052, "sampling/importance_sampling_ratio/mean": 1.0000789165496826, "sampling/importance_sampling_ratio/min": 0.7310478687286377, "sampling/sampling_logp_difference/max": 0.370816707611084, "sampling/sampling_logp_difference/mean": 0.013513938523828983, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 384.015625, "completions/mean_terminated_length": 384.015625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.3806955814361572, "epoch": 1.0857843137254901, "frac_reward_zero_std": 1.0, "grad_norm": 0.013155895764962542, "kl": 0.017536144703626633, "learning_rate": 8.067067643521833e-07, "loss": 0.0002, "num_tokens": 38345470.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3960398435592651, "sampling/importance_sampling_ratio/mean": 1.0005300045013428, "sampling/importance_sampling_ratio/min": 0.7003123760223389, "sampling/sampling_logp_difference/max": 0.3562288284301758, "sampling/sampling_logp_difference/mean": 0.012566442601382732, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1272.0, "completions/max_terminated_length": 1272.0, "completions/mean_length": 535.875, "completions/mean_terminated_length": 535.875, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.42488420009613037, "epoch": 1.0870098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.009574982298074311, "kl": 0.014867661520838737, "learning_rate": 8.061438432331934e-07, "loss": 0.0001, "num_tokens": 38398950.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4125384092330933, "sampling/importance_sampling_ratio/mean": 1.000244140625, "sampling/importance_sampling_ratio/min": 0.48336195945739746, "sampling/sampling_logp_difference/max": 0.7269895076751709, "sampling/sampling_logp_difference/mean": 0.012924907729029655, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1126.0, "completions/max_terminated_length": 1126.0, "completions/mean_length": 367.84375, "completions/mean_terminated_length": 367.84375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.5122273564338684, "epoch": 1.088235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.611274034077521, "kl": 0.022803552448749542, "learning_rate": 8.055803006615965e-07, "loss": -0.0328, "num_tokens": 38437452.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.5972588062286377, "sampling/importance_sampling_ratio/mean": 0.9999412894248962, "sampling/importance_sampling_ratio/min": 0.6510802507400513, "sampling/sampling_logp_difference/max": 0.4682888984680176, "sampling/sampling_logp_difference/mean": 0.014254300855100155, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 846.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 401.921875, "completions/mean_terminated_length": 401.921875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.4095113277435303, "epoch": 1.0894607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.010477956187691739, "kl": 0.013982458040118217, "learning_rate": 8.050161377813485e-07, "loss": 0.0001, "num_tokens": 38481959.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3377184867858887, "sampling/importance_sampling_ratio/mean": 0.9997671842575073, "sampling/importance_sampling_ratio/min": 0.5413225293159485, "sampling/sampling_logp_difference/max": 0.6137400269508362, "sampling/sampling_logp_difference/mean": 0.012877598404884338, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1163.0, "completions/max_terminated_length": 1163.0, "completions/mean_length": 325.875, "completions/mean_terminated_length": 325.875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.46991509199142456, "epoch": 1.0906862745098038, "frac_reward_zero_std": 1.0, "grad_norm": 0.015555393374093405, "kl": 0.024382973089814186, "learning_rate": 8.04451355737664e-07, "loss": 0.0002, "num_tokens": 38518831.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6272839307785034, "sampling/importance_sampling_ratio/mean": 1.0001800060272217, "sampling/importance_sampling_ratio/min": 0.6198499202728271, "sampling/sampling_logp_difference/max": 0.48691225051879883, "sampling/sampling_logp_difference/mean": 0.015678830444812775, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2142.0, "completions/max_terminated_length": 2142.0, "completions/mean_length": 564.21875, "completions/mean_terminated_length": 564.21875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.533220648765564, "epoch": 1.0919117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.009469860982720832, "kl": 0.013307198882102966, "learning_rate": 8.03885955677015e-07, "loss": 0.0001, "num_tokens": 38577421.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.899863839149475, "sampling/importance_sampling_ratio/mean": 0.9998913407325745, "sampling/importance_sampling_ratio/min": 0.35318806767463684, "sampling/sampling_logp_difference/max": 1.0407545566558838, "sampling/sampling_logp_difference/mean": 0.01518654078245163, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 347.34375, "completions/mean_terminated_length": 347.34375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.5211541056632996, "epoch": 1.093137254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.5570868091742779, "kl": 0.028366055339574814, "learning_rate": 8.033199387471276e-07, "loss": -0.0008, "num_tokens": 38628867.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5619100332260132, "sampling/importance_sampling_ratio/mean": 0.9998133182525635, "sampling/importance_sampling_ratio/min": 0.6687731742858887, "sampling/sampling_logp_difference/max": 0.4459095001220703, "sampling/sampling_logp_difference/mean": 0.015445943921804428, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1052.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 431.21875, "completions/mean_terminated_length": 431.21875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.3932618200778961, "epoch": 1.094362745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.5942462130810566, "kl": 0.01780024543404579, "learning_rate": 8.027533060969806e-07, "loss": -0.0007, "num_tokens": 38677233.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998631477355957, "sampling/importance_sampling_ratio/min": 0.6674957871437073, "sampling/sampling_logp_difference/max": 1.0026841163635254, "sampling/sampling_logp_difference/mean": 0.01202003937214613, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 972.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 430.375, "completions/mean_terminated_length": 430.375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.47690486907958984, "epoch": 1.0955882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.014583004744227173, "kl": 0.017660045996308327, "learning_rate": 8.021860588768021e-07, "loss": 0.0002, "num_tokens": 38719817.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9099587202072144, "sampling/importance_sampling_ratio/mean": 0.9997490644454956, "sampling/importance_sampling_ratio/min": 0.5917360782623291, "sampling/sampling_logp_difference/max": 0.6470816135406494, "sampling/sampling_logp_difference/mean": 0.015078769996762276, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1289.0, "completions/max_terminated_length": 1289.0, "completions/mean_length": 446.421875, "completions/mean_terminated_length": 446.421875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.4248363673686981, "epoch": 1.0968137254901962, "frac_reward_zero_std": 1.0, "grad_norm": 0.029567552303250644, "kl": 0.0156199149787426, "learning_rate": 8.016181982380681e-07, "loss": 0.0002, "num_tokens": 38766004.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.845438838005066, "sampling/importance_sampling_ratio/mean": 0.9996616840362549, "sampling/importance_sampling_ratio/min": 0.6095535159111023, "sampling/sampling_logp_difference/max": 0.6127171516418457, "sampling/sampling_logp_difference/mean": 0.012708935886621475, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 212.84375, "completions/mean_terminated_length": 212.84375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.41291892528533936, "epoch": 1.0980392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.016725153300249082, "kl": 0.01977841556072235, "learning_rate": 8.010497253335e-07, "loss": 0.0002, "num_tokens": 38795226.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5311286449432373, "sampling/importance_sampling_ratio/mean": 0.9999145269393921, "sampling/importance_sampling_ratio/min": 0.6171660423278809, "sampling/sampling_logp_difference/max": 0.4826171398162842, "sampling/sampling_logp_difference/mean": 0.014583239331841469, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 485.375, "completions/mean_terminated_length": 485.375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.47617030143737793, "epoch": 1.099264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.01154634199307795, "kl": 0.016824908554553986, "learning_rate": 8.004806413170612e-07, "loss": 0.0002, "num_tokens": 38843282.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6463361978530884, "sampling/importance_sampling_ratio/mean": 1.0000842809677124, "sampling/importance_sampling_ratio/min": 0.6012313961982727, "sampling/sampling_logp_difference/max": 0.5087754726409912, "sampling/sampling_logp_difference/mean": 0.013621354475617409, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 446.59375, "completions/mean_terminated_length": 446.59375, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 0.49723929166793823, "epoch": 1.1004901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.3684155214232889, "kl": 0.01746043749153614, "learning_rate": 7.999109473439569e-07, "loss": -0.0053, "num_tokens": 38888616.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4400172233581543, "sampling/importance_sampling_ratio/mean": 1.000138759613037, "sampling/importance_sampling_ratio/min": 0.7243005633354187, "sampling/sampling_logp_difference/max": 0.3646550178527832, "sampling/sampling_logp_difference/mean": 0.013859079219400883, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 362.859375, "completions/mean_terminated_length": 362.859375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.4355216324329376, "epoch": 1.1017156862745099, "frac_reward_zero_std": 0.75, "grad_norm": 0.4841010611483662, "kl": 0.023189855739474297, "learning_rate": 7.993406445706292e-07, "loss": -0.0178, "num_tokens": 38930959.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.5240193605422974, "sampling/importance_sampling_ratio/mean": 1.000259518623352, "sampling/importance_sampling_ratio/min": 0.6081385016441345, "sampling/sampling_logp_difference/max": 0.49735260009765625, "sampling/sampling_logp_difference/mean": 0.013582772575318813, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 304.21875, "completions/mean_terminated_length": 304.21875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.39185911417007446, "epoch": 1.1029411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.011921551477198566, "kl": 0.01765832118690014, "learning_rate": 7.987697341547568e-07, "loss": 0.0002, "num_tokens": 38964077.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.363523244857788, "sampling/importance_sampling_ratio/mean": 0.999721348285675, "sampling/importance_sampling_ratio/min": 0.6173207759857178, "sampling/sampling_logp_difference/max": 0.48236656188964844, "sampling/sampling_logp_difference/mean": 0.014211287721991539, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1127.0, "completions/max_terminated_length": 1127.0, "completions/mean_length": 393.203125, "completions/mean_terminated_length": 393.203125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.45232677459716797, "epoch": 1.1041666666666667, "frac_reward_zero_std": 0.75, "grad_norm": 0.5336994518046008, "kl": 0.01966608501970768, "learning_rate": 7.981982172552517e-07, "loss": 0.0184, "num_tokens": 39007722.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.4111456871032715, "sampling/importance_sampling_ratio/mean": 0.9999991059303284, "sampling/importance_sampling_ratio/min": 0.4682559370994568, "sampling/sampling_logp_difference/max": 0.7587403059005737, "sampling/sampling_logp_difference/mean": 0.01377181801944971, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 251.609375, "completions/mean_terminated_length": 251.609375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.4379044771194458, "epoch": 1.1053921568627452, "frac_reward_zero_std": 1.0, "grad_norm": 0.015228321321604574, "kl": 0.028489112854003906, "learning_rate": 7.976260950322571e-07, "loss": 0.0002, "num_tokens": 39038257.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4776419401168823, "sampling/importance_sampling_ratio/mean": 0.999793529510498, "sampling/importance_sampling_ratio/min": 0.6771495938301086, "sampling/sampling_logp_difference/max": 0.3904474973678589, "sampling/sampling_logp_difference/mean": 0.016651956364512444, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 379.40625, "completions/mean_terminated_length": 379.40625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.5013963580131531, "epoch": 1.1066176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.012365518566002879, "kl": 0.017365146428346634, "learning_rate": 7.970533686471448e-07, "loss": 0.0002, "num_tokens": 39086587.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5749825239181519, "sampling/importance_sampling_ratio/mean": 1.0001577138900757, "sampling/importance_sampling_ratio/min": 0.6612205505371094, "sampling/sampling_logp_difference/max": 0.45424413681030273, "sampling/sampling_logp_difference/mean": 0.015427983365952969, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1672.0, "completions/max_terminated_length": 1672.0, "completions/mean_length": 385.796875, "completions/mean_terminated_length": 385.796875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.35299021005630493, "epoch": 1.107843137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.012013552330408822, "kl": 0.014780383557081223, "learning_rate": 7.964800392625128e-07, "loss": 0.0001, "num_tokens": 39129518.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.958440899848938, "sampling/importance_sampling_ratio/mean": 0.9999262690544128, "sampling/importance_sampling_ratio/min": 0.6409898996353149, "sampling/sampling_logp_difference/max": 0.6721487045288086, "sampling/sampling_logp_difference/mean": 0.012208307161927223, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 333.59375, "completions/mean_terminated_length": 333.59375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.37062472105026245, "epoch": 1.1090686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.5917772371340507, "kl": 0.020676087588071823, "learning_rate": 7.959061080421838e-07, "loss": -0.0202, "num_tokens": 39169236.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.8010663986206055, "sampling/importance_sampling_ratio/mean": 1.0003856420516968, "sampling/importance_sampling_ratio/min": 0.6173537373542786, "sampling/sampling_logp_difference/max": 0.58837890625, "sampling/sampling_logp_difference/mean": 0.013369724154472351, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 315.796875, "completions/mean_terminated_length": 315.796875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.45895692706108093, "epoch": 1.1102941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.016826468252425557, "kl": 0.02111418917775154, "learning_rate": 7.953315761512017e-07, "loss": 0.0002, "num_tokens": 39205943.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4323837757110596, "sampling/importance_sampling_ratio/mean": 0.9996593594551086, "sampling/importance_sampling_ratio/min": 0.6605594754219055, "sampling/sampling_logp_difference/max": 0.41466808319091797, "sampling/sampling_logp_difference/mean": 0.015033164992928505, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 373.6875, "completions/mean_terminated_length": 373.6875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.4657692015171051, "epoch": 1.1115196078431373, "frac_reward_zero_std": 1.0, "grad_norm": 0.014469436353942102, "kl": 0.02437550574541092, "learning_rate": 7.947564447558299e-07, "loss": 0.0002, "num_tokens": 39245587.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5467438697814941, "sampling/importance_sampling_ratio/mean": 1.0002634525299072, "sampling/importance_sampling_ratio/min": 0.6071556806564331, "sampling/sampling_logp_difference/max": 0.49897003173828125, "sampling/sampling_logp_difference/mean": 0.015078732743859291, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 902.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 334.046875, "completions/mean_terminated_length": 334.046875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.483864426612854, "epoch": 1.1127450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.014362638271149808, "kl": 0.01628665067255497, "learning_rate": 7.941807150235485e-07, "loss": 0.0002, "num_tokens": 39288150.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4612447023391724, "sampling/importance_sampling_ratio/mean": 1.0000317096710205, "sampling/importance_sampling_ratio/min": 0.6273065209388733, "sampling/sampling_logp_difference/max": 0.4663200378417969, "sampling/sampling_logp_difference/mean": 0.014993532560765743, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 802.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 344.90625, "completions/mean_terminated_length": 344.90625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.44066470861434937, "epoch": 1.1139705882352942, "frac_reward_zero_std": 0.75, "grad_norm": 0.7993237052892854, "kl": 0.017723217606544495, "learning_rate": 7.936043881230525e-07, "loss": 0.0393, "num_tokens": 39329024.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4518367052078247, "sampling/importance_sampling_ratio/mean": 1.0004817247390747, "sampling/importance_sampling_ratio/min": 0.6242125630378723, "sampling/sampling_logp_difference/max": 0.4712643623352051, "sampling/sampling_logp_difference/mean": 0.014128797687590122, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1116.0, "completions/max_terminated_length": 1116.0, "completions/mean_length": 409.046875, "completions/mean_terminated_length": 409.046875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.46189451217651367, "epoch": 1.1151960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 3795616.2116620047, "kl": 214.88619995117188, "learning_rate": 7.930274652242491e-07, "loss": 3.4032, "num_tokens": 39371715.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5058560371398926, "sampling/importance_sampling_ratio/mean": 1.000040054321289, "sampling/importance_sampling_ratio/min": 2.38701375110395e-07, "sampling/sampling_logp_difference/max": 15.248052597045898, "sampling/sampling_logp_difference/mean": 0.014989332295954227, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 341.015625, "completions/mean_terminated_length": 341.015625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.436462938785553, "epoch": 1.116421568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.6913993672122204, "kl": 0.017974652349948883, "learning_rate": 7.924499474982551e-07, "loss": 0.0091, "num_tokens": 39418276.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.4425737857818604, "sampling/importance_sampling_ratio/mean": 1.000072717666626, "sampling/importance_sampling_ratio/min": 0.6733540892601013, "sampling/sampling_logp_difference/max": 0.39548397064208984, "sampling/sampling_logp_difference/mean": 0.014727843925356865, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 817.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 347.984375, "completions/mean_terminated_length": 347.984375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.526493489742279, "epoch": 1.1176470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.017154572689610844, "kl": 0.022245081141591072, "learning_rate": 7.91871836117395e-07, "loss": 0.0002, "num_tokens": 39454755.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4496610164642334, "sampling/importance_sampling_ratio/mean": 1.0000824928283691, "sampling/importance_sampling_ratio/min": 0.5399715900421143, "sampling/sampling_logp_difference/max": 0.6162387132644653, "sampling/sampling_logp_difference/mean": 0.016975507140159607, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1066.0, "completions/max_terminated_length": 1066.0, "completions/mean_length": 364.765625, "completions/mean_terminated_length": 364.765625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.3604802191257477, "epoch": 1.1188725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.011480437802749915, "kl": 0.013887192122638226, "learning_rate": 7.91293132255198e-07, "loss": 0.0001, "num_tokens": 39499076.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4393962621688843, "sampling/importance_sampling_ratio/mean": 0.9996575713157654, "sampling/importance_sampling_ratio/min": 0.5114845037460327, "sampling/sampling_logp_difference/max": 0.6704379320144653, "sampling/sampling_logp_difference/mean": 0.012245162390172482, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 336.234375, "completions/mean_terminated_length": 336.234375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.3620328903198242, "epoch": 1.1200980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.00929380104858899, "kl": 0.012564781121909618, "learning_rate": 7.907138370863967e-07, "loss": 0.0001, "num_tokens": 39538147.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5278973579406738, "sampling/importance_sampling_ratio/mean": 0.9996464848518372, "sampling/importance_sampling_ratio/min": 0.6077016592025757, "sampling/sampling_logp_difference/max": 0.49807119369506836, "sampling/sampling_logp_difference/mean": 0.0127522898837924, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 340.96875, "completions/mean_terminated_length": 340.96875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.45628631114959717, "epoch": 1.1213235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.014254570010313357, "kl": 0.019159547984600067, "learning_rate": 7.901339517869232e-07, "loss": 0.0002, "num_tokens": 39580209.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4155908823013306, "sampling/importance_sampling_ratio/mean": 0.9999619722366333, "sampling/importance_sampling_ratio/min": 0.6523239612579346, "sampling/sampling_logp_difference/max": 0.42721402645111084, "sampling/sampling_logp_difference/mean": 0.015014533884823322, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 274.515625, "completions/mean_terminated_length": 274.515625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3500842750072479, "epoch": 1.1225490196078431, "frac_reward_zero_std": 1.0, "grad_norm": 0.01247534327150086, "kl": 0.016987504437565804, "learning_rate": 7.895534775339083e-07, "loss": 0.0002, "num_tokens": 39618434.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5937914848327637, "sampling/importance_sampling_ratio/mean": 0.9999802112579346, "sampling/importance_sampling_ratio/min": 0.6437342762947083, "sampling/sampling_logp_difference/max": 0.46611571311950684, "sampling/sampling_logp_difference/mean": 0.014076455496251583, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 387.25, "completions/mean_terminated_length": 387.25, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.41866469383239746, "epoch": 1.1237745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 0.5036903105536479, "kl": 0.014852667227387428, "learning_rate": 7.889724155056776e-07, "loss": 0.0107, "num_tokens": 39671026.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6188225746154785, "sampling/importance_sampling_ratio/mean": 1.0000314712524414, "sampling/importance_sampling_ratio/min": 0.5404905080795288, "sampling/sampling_logp_difference/max": 0.6152782440185547, "sampling/sampling_logp_difference/mean": 0.013088248670101166, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1369.0, "completions/max_terminated_length": 1369.0, "completions/mean_length": 458.171875, "completions/mean_terminated_length": 458.171875, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.39862826466560364, "epoch": 1.125, "frac_reward_zero_std": 1.0, "grad_norm": 0.010058130146979935, "kl": 0.011514803394675255, "learning_rate": 7.883907668817506e-07, "loss": 0.0001, "num_tokens": 39720605.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4052278995513916, "sampling/importance_sampling_ratio/mean": 1.0002282857894897, "sampling/importance_sampling_ratio/min": 0.6341673731803894, "sampling/sampling_logp_difference/max": 0.4554424285888672, "sampling/sampling_logp_difference/mean": 0.011525727808475494, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 435.046875, "completions/mean_terminated_length": 435.046875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.4823020398616791, "epoch": 1.1262254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 0.5037469399498303, "kl": 0.019573427736759186, "learning_rate": 7.878085328428368e-07, "loss": 0.0012, "num_tokens": 39764784.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.3706907033920288, "sampling/importance_sampling_ratio/mean": 0.9995969533920288, "sampling/importance_sampling_ratio/min": 0.6134204268455505, "sampling/sampling_logp_difference/max": 0.4887046813964844, "sampling/sampling_logp_difference/mean": 0.013996835798025131, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 863.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 382.59375, "completions/mean_terminated_length": 382.59375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.4504874050617218, "epoch": 1.1274509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.015638168123282576, "kl": 0.01594439148902893, "learning_rate": 7.872257145708345e-07, "loss": 0.0001, "num_tokens": 39810870.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4250260591506958, "sampling/importance_sampling_ratio/mean": 1.0003266334533691, "sampling/importance_sampling_ratio/min": 0.6506755948066711, "sampling/sampling_logp_difference/max": 0.42974400520324707, "sampling/sampling_logp_difference/mean": 0.014153266325592995, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 363.984375, "completions/mean_terminated_length": 363.984375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.2993699908256531, "epoch": 1.1286764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.02292116059848233, "kl": 0.011618348769843578, "learning_rate": 7.86642313248828e-07, "loss": 0.0001, "num_tokens": 39849861.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4644570350646973, "sampling/importance_sampling_ratio/mean": 1.0000545978546143, "sampling/importance_sampling_ratio/min": 0.26262301206588745, "sampling/sampling_logp_difference/max": 1.3370356559753418, "sampling/sampling_logp_difference/mean": 0.009977871552109718, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2023.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 584.046875, "completions/mean_terminated_length": 584.046875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.49404165148735046, "epoch": 1.1299019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 0.7355059518508309, "kl": 0.014983756467700005, "learning_rate": 7.860583300610847e-07, "loss": 0.1494, "num_tokens": 39911592.0, "reward": 0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6088849306106567, "sampling/importance_sampling_ratio/mean": 1.0000519752502441, "sampling/importance_sampling_ratio/min": 0.604062020778656, "sampling/sampling_logp_difference/max": 0.5040783882141113, "sampling/sampling_logp_difference/mean": 0.01413729041814804, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 347.890625, "completions/mean_terminated_length": 347.890625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.5237497687339783, "epoch": 1.1311274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 0.6979271604099834, "kl": 0.020476536825299263, "learning_rate": 7.854737661930539e-07, "loss": -0.0088, "num_tokens": 39949217.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.570483922958374, "sampling/importance_sampling_ratio/mean": 1.0000884532928467, "sampling/importance_sampling_ratio/min": 0.655025064945221, "sampling/sampling_logp_difference/max": 0.4513838291168213, "sampling/sampling_logp_difference/mean": 0.015962611883878708, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1568.0, "completions/max_terminated_length": 1568.0, "completions/mean_length": 442.40625, "completions/mean_terminated_length": 442.40625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.41301485896110535, "epoch": 1.1323529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.6412408492082519, "kl": 0.017263216897845268, "learning_rate": 7.848886228313632e-07, "loss": 0.0907, "num_tokens": 39997259.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5009677410125732, "sampling/importance_sampling_ratio/mean": 0.9998687505722046, "sampling/importance_sampling_ratio/min": 0.5368533730506897, "sampling/sampling_logp_difference/max": 0.6220302581787109, "sampling/sampling_logp_difference/mean": 0.013191437348723412, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 250.4375, "completions/mean_terminated_length": 250.4375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.33542847633361816, "epoch": 1.133578431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.016155124122245193, "kl": 0.01704217866063118, "learning_rate": 7.843029011638162e-07, "loss": 0.0002, "num_tokens": 40026455.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4358428716659546, "sampling/importance_sampling_ratio/mean": 0.9996052980422974, "sampling/importance_sampling_ratio/min": 0.6771752834320068, "sampling/sampling_logp_difference/max": 0.38982510566711426, "sampling/sampling_logp_difference/mean": 0.013608593493700027, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 328.625, "completions/mean_terminated_length": 328.625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.3925665616989136, "epoch": 1.1348039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.012310375791440972, "kl": 0.016626421362161636, "learning_rate": 7.837166023793908e-07, "loss": 0.0002, "num_tokens": 40066575.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6881778240203857, "sampling/importance_sampling_ratio/mean": 0.9995685815811157, "sampling/importance_sampling_ratio/min": 0.6956479549407959, "sampling/sampling_logp_difference/max": 0.5236496925354004, "sampling/sampling_logp_difference/mean": 0.012723254971206188, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 285.640625, "completions/mean_terminated_length": 285.640625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.42676204442977905, "epoch": 1.1360294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.01427982578873053, "kl": 0.017801744863390923, "learning_rate": 7.831297276682368e-07, "loss": 0.0002, "num_tokens": 40098968.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.43587064743042, "sampling/importance_sampling_ratio/mean": 1.0003480911254883, "sampling/importance_sampling_ratio/min": 0.5505924224853516, "sampling/sampling_logp_difference/max": 0.5967605113983154, "sampling/sampling_logp_difference/mean": 0.015875065699219704, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 308.25, "completions/mean_terminated_length": 308.25, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.43266725540161133, "epoch": 1.1372549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.013680380852056157, "kl": 0.01884029060602188, "learning_rate": 7.825422782216724e-07, "loss": 0.0002, "num_tokens": 40139080.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5971448421478271, "sampling/importance_sampling_ratio/mean": 1.0000373125076294, "sampling/importance_sampling_ratio/min": 0.678729772567749, "sampling/sampling_logp_difference/max": 0.4682176113128662, "sampling/sampling_logp_difference/mean": 0.014155280776321888, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 325.296875, "completions/mean_terminated_length": 325.296875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.4327702820301056, "epoch": 1.1384803921568627, "frac_reward_zero_std": 1.0, "grad_norm": 0.011391448207679759, "kl": 0.014879927039146423, "learning_rate": 7.819542552321827e-07, "loss": 0.0001, "num_tokens": 40175435.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.508432149887085, "sampling/importance_sampling_ratio/mean": 0.9994142055511475, "sampling/importance_sampling_ratio/min": 0.6786009669303894, "sampling/sampling_logp_difference/max": 0.4110708236694336, "sampling/sampling_logp_difference/mean": 0.014705008827149868, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 800.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 367.203125, "completions/mean_terminated_length": 367.203125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.5332962274551392, "epoch": 1.1397058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.02369259747904978, "kl": 0.024235349148511887, "learning_rate": 7.813656598934173e-07, "loss": 0.0002, "num_tokens": 40216216.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4407724142074585, "sampling/importance_sampling_ratio/mean": 1.000398874282837, "sampling/importance_sampling_ratio/min": 0.6763667464256287, "sampling/sampling_logp_difference/max": 0.3910198211669922, "sampling/sampling_logp_difference/mean": 0.016801439225673676, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1159.0, "completions/max_terminated_length": 1159.0, "completions/mean_length": 417.46875, "completions/mean_terminated_length": 417.46875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.4593676030635834, "epoch": 1.1409313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.013377289030467159, "kl": 0.01645844057202339, "learning_rate": 7.807764934001874e-07, "loss": 0.0002, "num_tokens": 40259542.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.8626850843429565, "sampling/importance_sampling_ratio/mean": 0.9998859167098999, "sampling/importance_sampling_ratio/min": 0.6444015502929688, "sampling/sampling_logp_difference/max": 0.6220190525054932, "sampling/sampling_logp_difference/mean": 0.014115707948803902, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 349.46875, "completions/mean_terminated_length": 349.46875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.457862913608551, "epoch": 1.142156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.7472165157268938, "kl": 0.019886402413249016, "learning_rate": 7.801867569484634e-07, "loss": -0.0115, "num_tokens": 40304388.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.9768327474594116, "sampling/importance_sampling_ratio/mean": 0.9997333288192749, "sampling/importance_sampling_ratio/min": 0.5569307208061218, "sampling/sampling_logp_difference/max": 0.6814959049224854, "sampling/sampling_logp_difference/mean": 0.014669973403215408, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 769.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 321.703125, "completions/mean_terminated_length": 321.703125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.4543549716472626, "epoch": 1.1433823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.015590390394861544, "kl": 0.018459167331457138, "learning_rate": 7.795964517353733e-07, "loss": 0.0002, "num_tokens": 40340705.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4361846446990967, "sampling/importance_sampling_ratio/mean": 0.9998949766159058, "sampling/importance_sampling_ratio/min": 0.3982090353965759, "sampling/sampling_logp_difference/max": 0.920778214931488, "sampling/sampling_logp_difference/mean": 0.014566565863788128, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 290.125, "completions/mean_terminated_length": 290.125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.44440481066703796, "epoch": 1.1446078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.02118522653029941, "kl": 0.02342885732650757, "learning_rate": 7.790055789591993e-07, "loss": 0.0002, "num_tokens": 40375609.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5802276134490967, "sampling/importance_sampling_ratio/mean": 1.000329613685608, "sampling/importance_sampling_ratio/min": 0.38396045565605164, "sampling/sampling_logp_difference/max": 0.9572157859802246, "sampling/sampling_logp_difference/mean": 0.01584678702056408, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 270.390625, "completions/mean_terminated_length": 270.390625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.4188202917575836, "epoch": 1.1458333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.015990911217984946, "kl": 0.01786458119750023, "learning_rate": 7.784141398193753e-07, "loss": 0.0002, "num_tokens": 40417234.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5118809938430786, "sampling/importance_sampling_ratio/mean": 0.9995365738868713, "sampling/importance_sampling_ratio/min": 0.6104766726493835, "sampling/sampling_logp_difference/max": 0.4935152530670166, "sampling/sampling_logp_difference/mean": 0.014841822907328606, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 398.859375, "completions/mean_terminated_length": 398.859375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.4555830657482147, "epoch": 1.1470588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.008916979043764026, "kl": 0.013061080127954483, "learning_rate": 7.778221355164857e-07, "loss": 0.0001, "num_tokens": 40469785.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000382900238037, "sampling/importance_sampling_ratio/min": 0.4920617640018463, "sampling/sampling_logp_difference/max": 0.7302007675170898, "sampling/sampling_logp_difference/mean": 0.014633501879870892, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1944.0, "completions/max_terminated_length": 1944.0, "completions/mean_length": 377.28125, "completions/mean_terminated_length": 377.28125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.44511091709136963, "epoch": 1.1482843137254901, "frac_reward_zero_std": 1.0, "grad_norm": 0.019617433250119085, "kl": 0.029182177037000656, "learning_rate": 7.772295672522614e-07, "loss": 0.0003, "num_tokens": 40512827.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4113596677780151, "sampling/importance_sampling_ratio/mean": 1.0001815557479858, "sampling/importance_sampling_ratio/min": 0.6196080446243286, "sampling/sampling_logp_difference/max": 0.478668212890625, "sampling/sampling_logp_difference/mean": 0.01435931771993637, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 298.796875, "completions/mean_terminated_length": 298.796875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.5985913276672363, "epoch": 1.1495098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 0.7061169449223113, "kl": 0.02338649332523346, "learning_rate": 7.766364362295788e-07, "loss": -0.0365, "num_tokens": 40551134.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.8552391529083252, "sampling/importance_sampling_ratio/mean": 1.0004775524139404, "sampling/importance_sampling_ratio/min": 0.6262683868408203, "sampling/sampling_logp_difference/max": 0.6180136203765869, "sampling/sampling_logp_difference/mean": 0.017632469534873962, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1369.0, "completions/max_terminated_length": 1369.0, "completions/mean_length": 408.6875, "completions/mean_terminated_length": 408.6875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.5569237470626831, "epoch": 1.150735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.8138302013031027, "kl": 0.018642188981175423, "learning_rate": 7.760427436524559e-07, "loss": 0.0002, "num_tokens": 40595786.0, "reward": -0.09375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.5249321460723877, "sampling/importance_sampling_ratio/mean": 1.0000078678131104, "sampling/importance_sampling_ratio/min": 0.6375809907913208, "sampling/sampling_logp_difference/max": 0.4500739574432373, "sampling/sampling_logp_difference/mean": 0.015366158448159695, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 322.8125, "completions/mean_terminated_length": 322.8125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.5132248401641846, "epoch": 1.1519607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 0.6265848188163772, "kl": 0.02068491466343403, "learning_rate": 7.754484907260512e-07, "loss": -0.0029, "num_tokens": 40634158.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5752291679382324, "sampling/importance_sampling_ratio/mean": 0.9999234676361084, "sampling/importance_sampling_ratio/min": 0.5910096764564514, "sampling/sampling_logp_difference/max": 0.5259228944778442, "sampling/sampling_logp_difference/mean": 0.016536490991711617, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1053.0, "completions/max_terminated_length": 1053.0, "completions/mean_length": 452.8125, "completions/mean_terminated_length": 452.8125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.44229787588119507, "epoch": 1.153186274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.5692592972777891, "kl": 0.014031889848411083, "learning_rate": 7.748536786566606e-07, "loss": -0.0428, "num_tokens": 40682210.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.6088980436325073, "sampling/importance_sampling_ratio/mean": 1.0001893043518066, "sampling/importance_sampling_ratio/min": 0.6368654370307922, "sampling/sampling_logp_difference/max": 0.47554945945739746, "sampling/sampling_logp_difference/mean": 0.01231298130005598, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1073.0, "completions/max_terminated_length": 1073.0, "completions/mean_length": 395.234375, "completions/mean_terminated_length": 395.234375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.49358224868774414, "epoch": 1.1544117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.012430629426406612, "kl": 0.016691043972969055, "learning_rate": 7.742583086517149e-07, "loss": 0.0002, "num_tokens": 40729793.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.953208088874817, "sampling/importance_sampling_ratio/mean": 0.9999034404754639, "sampling/importance_sampling_ratio/min": 0.6193159222602844, "sampling/sampling_logp_difference/max": 0.6694731712341309, "sampling/sampling_logp_difference/mean": 0.016142509877681732, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 407.265625, "completions/mean_terminated_length": 407.265625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.3657209873199463, "epoch": 1.155637254901961, "frac_reward_zero_std": 0.5, "grad_norm": 0.911479998630644, "kl": 0.010978571139276028, "learning_rate": 7.736623819197773e-07, "loss": 0.0086, "num_tokens": 40773890.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.451781988143921, "sampling/importance_sampling_ratio/mean": 0.9998575448989868, "sampling/importance_sampling_ratio/min": 0.6117782592773438, "sampling/sampling_logp_difference/max": 0.49138545989990234, "sampling/sampling_logp_difference/mean": 0.0117138447239995, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 802.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 381.609375, "completions/mean_terminated_length": 381.609375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.363036572933197, "epoch": 1.156862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.01334404520161415, "kl": 0.014807555824518204, "learning_rate": 7.730658996705415e-07, "loss": 0.0001, "num_tokens": 40817833.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8902366161346436, "sampling/importance_sampling_ratio/mean": 0.9998100996017456, "sampling/importance_sampling_ratio/min": 0.5773409605026245, "sampling/sampling_logp_difference/max": 0.6367020606994629, "sampling/sampling_logp_difference/mean": 0.012631082907319069, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 277.34375, "completions/mean_terminated_length": 277.34375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.5021189451217651, "epoch": 1.1580882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 0.6580080091512213, "kl": 0.02805817313492298, "learning_rate": 7.724688631148286e-07, "loss": 0.0037, "num_tokens": 40853695.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4694055318832397, "sampling/importance_sampling_ratio/mean": 0.9999300837516785, "sampling/importance_sampling_ratio/min": 0.6626021265983582, "sampling/sampling_logp_difference/max": 0.41158056259155273, "sampling/sampling_logp_difference/mean": 0.016338404268026352, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 336.9375, "completions/mean_terminated_length": 336.9375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.447573184967041, "epoch": 1.159313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.4187053926028054, "kl": 0.0185670405626297, "learning_rate": 7.718712734645849e-07, "loss": -0.0555, "num_tokens": 40893435.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4453223943710327, "sampling/importance_sampling_ratio/mean": 0.999966561794281, "sampling/importance_sampling_ratio/min": 0.6699577569961548, "sampling/sampling_logp_difference/max": 0.4005405902862549, "sampling/sampling_logp_difference/mean": 0.01412887591868639, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1280.0, "completions/max_terminated_length": 1280.0, "completions/mean_length": 379.109375, "completions/mean_terminated_length": 379.109375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.5440686941146851, "epoch": 1.1605392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 0.45076930767236895, "kl": 0.029552670195698738, "learning_rate": 7.712731319328797e-07, "loss": -0.0274, "num_tokens": 40935234.0, "reward": -0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.5399127006530762, "sampling/importance_sampling_ratio/mean": 0.9998072385787964, "sampling/importance_sampling_ratio/min": 0.6175768971443176, "sampling/sampling_logp_difference/max": 0.4819517135620117, "sampling/sampling_logp_difference/mean": 0.015896756201982498, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 231.90625, "completions/mean_terminated_length": 231.90625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.4478425979614258, "epoch": 1.161764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.8579790366187718, "kl": 0.023114584386348724, "learning_rate": 7.706744397339022e-07, "loss": 0.0015, "num_tokens": 40965964.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.9668688774108887, "sampling/importance_sampling_ratio/mean": 1.000008463859558, "sampling/importance_sampling_ratio/min": 0.6106392741203308, "sampling/sampling_logp_difference/max": 0.6764428615570068, "sampling/sampling_logp_difference/mean": 0.016367100179195404, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 418.9375, "completions/mean_terminated_length": 418.9375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.5635786652565002, "epoch": 1.1629901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.6383022942593753, "kl": 0.017191991209983826, "learning_rate": 7.700751980829601e-07, "loss": -0.0181, "num_tokens": 41011560.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.5020791292190552, "sampling/importance_sampling_ratio/mean": 0.9994247555732727, "sampling/importance_sampling_ratio/min": 0.23429064452648163, "sampling/sampling_logp_difference/max": 1.451192855834961, "sampling/sampling_logp_difference/mean": 0.015703028067946434, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 260.28125, "completions/mean_terminated_length": 260.28125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.38153502345085144, "epoch": 1.1642156862745099, "frac_reward_zero_std": 1.0, "grad_norm": 0.010425442637467142, "kl": 0.01383605133742094, "learning_rate": 7.694754081964754e-07, "loss": 0.0001, "num_tokens": 41043786.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5751053094863892, "sampling/importance_sampling_ratio/mean": 1.0000345706939697, "sampling/importance_sampling_ratio/min": 0.6266753673553467, "sampling/sampling_logp_difference/max": 0.46732664108276367, "sampling/sampling_logp_difference/mean": 0.013468923978507519, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1356.0, "completions/max_terminated_length": 1356.0, "completions/mean_length": 432.9375, "completions/mean_terminated_length": 432.9375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.385353147983551, "epoch": 1.1654411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 0.4776144612880716, "kl": 0.015064786188304424, "learning_rate": 7.688750712919839e-07, "loss": -0.0058, "num_tokens": 41092918.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4011493921279907, "sampling/importance_sampling_ratio/mean": 1.000025749206543, "sampling/importance_sampling_ratio/min": 0.5433887243270874, "sampling/sampling_logp_difference/max": 0.6099303364753723, "sampling/sampling_logp_difference/mean": 0.012659663334488869, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 363.109375, "completions/mean_terminated_length": 363.109375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.5684800148010254, "epoch": 1.1666666666666667, "frac_reward_zero_std": 0.75, "grad_norm": 0.5559935402137343, "kl": 0.026855148375034332, "learning_rate": 7.682741885881314e-07, "loss": 0.0049, "num_tokens": 41134493.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5176734924316406, "sampling/importance_sampling_ratio/mean": 0.9995768070220947, "sampling/importance_sampling_ratio/min": 0.650613009929657, "sampling/sampling_logp_difference/max": 0.4298403263092041, "sampling/sampling_logp_difference/mean": 0.01702478528022766, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1272.0, "completions/max_terminated_length": 1272.0, "completions/mean_length": 416.25, "completions/mean_terminated_length": 416.25, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.5798816680908203, "epoch": 1.1678921568627452, "frac_reward_zero_std": 1.0, "grad_norm": 0.011240050959847564, "kl": 0.018897918984293938, "learning_rate": 7.676727613046719e-07, "loss": 0.0002, "num_tokens": 41182509.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5215610265731812, "sampling/importance_sampling_ratio/mean": 0.9998370409011841, "sampling/importance_sampling_ratio/min": 0.5483723282814026, "sampling/sampling_logp_difference/max": 0.6008007526397705, "sampling/sampling_logp_difference/mean": 0.01645175740122795, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1050.0, "completions/max_terminated_length": 1050.0, "completions/mean_length": 281.53125, "completions/mean_terminated_length": 281.53125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.5563374757766724, "epoch": 1.1691176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.1849910234174377, "kl": 0.02671617642045021, "learning_rate": 7.670707906624643e-07, "loss": 0.042, "num_tokens": 41215263.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5492985248565674, "sampling/importance_sampling_ratio/mean": 0.9997384548187256, "sampling/importance_sampling_ratio/min": 0.2174793779850006, "sampling/sampling_logp_difference/max": 1.5256513357162476, "sampling/sampling_logp_difference/mean": 0.017361294478178024, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 322.78125, "completions/mean_terminated_length": 322.78125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.541365921497345, "epoch": 1.170343137254902, "frac_reward_zero_std": 0.5, "grad_norm": 0.9005295532910909, "kl": 0.027645833790302277, "learning_rate": 7.664682778834712e-07, "loss": 0.0058, "num_tokens": 41253505.0, "reward": -0.21875, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": -0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.5868611335754395, "sampling/importance_sampling_ratio/mean": 1.000201940536499, "sampling/importance_sampling_ratio/min": 0.4764883518218994, "sampling/sampling_logp_difference/max": 0.7413120269775391, "sampling/sampling_logp_difference/mean": 0.016526609659194946, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 742.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 341.765625, "completions/mean_terminated_length": 341.765625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.5351499915122986, "epoch": 1.1715686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.012358943755861133, "kl": 0.017067890614271164, "learning_rate": 7.658652241907554e-07, "loss": 0.0002, "num_tokens": 41289682.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5975886583328247, "sampling/importance_sampling_ratio/mean": 1.0000712871551514, "sampling/importance_sampling_ratio/min": 0.6677515506744385, "sampling/sampling_logp_difference/max": 0.46849536895751953, "sampling/sampling_logp_difference/mean": 0.017824752256274223, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1211.0, "completions/max_terminated_length": 1211.0, "completions/mean_length": 368.59375, "completions/mean_terminated_length": 368.59375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.4359895586967468, "epoch": 1.1727941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.012484428812207083, "kl": 0.0163139458745718, "learning_rate": 7.652616308084774e-07, "loss": 0.0002, "num_tokens": 41332504.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5664291381835938, "sampling/importance_sampling_ratio/mean": 1.0000537633895874, "sampling/importance_sampling_ratio/min": 0.6404737234115601, "sampling/sampling_logp_difference/max": 0.44879865646362305, "sampling/sampling_logp_difference/mean": 0.013151295483112335, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 291.046875, "completions/mean_terminated_length": 291.046875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.4096795320510864, "epoch": 1.1740196078431373, "frac_reward_zero_std": 0.75, "grad_norm": 0.5594037803250566, "kl": 0.01657106541097164, "learning_rate": 7.646574989618937e-07, "loss": -0.0017, "num_tokens": 41367147.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4500716924667358, "sampling/importance_sampling_ratio/mean": 1.0002528429031372, "sampling/importance_sampling_ratio/min": 0.7195301651954651, "sampling/sampling_logp_difference/max": 0.3716130256652832, "sampling/sampling_logp_difference/mean": 0.015026958659291267, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 306.75, "completions/mean_terminated_length": 306.75, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.5108782649040222, "epoch": 1.1752450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.6716794976115774, "kl": 0.024091824889183044, "learning_rate": 7.640528298773536e-07, "loss": -0.0055, "num_tokens": 41402971.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.4398475885391235, "sampling/importance_sampling_ratio/mean": 1.000092625617981, "sampling/importance_sampling_ratio/min": 0.6211755871772766, "sampling/sampling_logp_difference/max": 0.47614145278930664, "sampling/sampling_logp_difference/mean": 0.017245344817638397, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 320.578125, "completions/mean_terminated_length": 320.578125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.4776054918766022, "epoch": 1.1764705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.020195921355817582, "kl": 0.02067079395055771, "learning_rate": 7.634476247822972e-07, "loss": 0.0002, "num_tokens": 41439552.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5879510641098022, "sampling/importance_sampling_ratio/mean": 0.9999288320541382, "sampling/importance_sampling_ratio/min": 0.6384508013725281, "sampling/sampling_logp_difference/max": 0.462444543838501, "sampling/sampling_logp_difference/mean": 0.01548301987349987, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 275.84375, "completions/mean_terminated_length": 275.84375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.447788268327713, "epoch": 1.1776960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.012040945917179554, "kl": 0.018080953508615494, "learning_rate": 7.628418849052523e-07, "loss": 0.0002, "num_tokens": 41472790.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6478561162948608, "sampling/importance_sampling_ratio/mean": 0.9993513226509094, "sampling/importance_sampling_ratio/min": 0.6403316259384155, "sampling/sampling_logp_difference/max": 0.4994751214981079, "sampling/sampling_logp_difference/mean": 0.014050785452127457, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1370.0, "completions/max_terminated_length": 1370.0, "completions/mean_length": 417.40625, "completions/mean_terminated_length": 417.40625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.40977567434310913, "epoch": 1.178921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.5687975082003246, "kl": 0.013944894075393677, "learning_rate": 7.622356114758327e-07, "loss": -0.0196, "num_tokens": 41516464.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5174232721328735, "sampling/importance_sampling_ratio/mean": 0.9995956420898438, "sampling/importance_sampling_ratio/min": 0.6011704206466675, "sampling/sampling_logp_difference/max": 0.5088768005371094, "sampling/sampling_logp_difference/mean": 0.013313937932252884, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 382.25, "completions/mean_terminated_length": 382.25, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.4594183564186096, "epoch": 1.1801470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.07254399046703494, "kl": 0.03543552756309509, "learning_rate": 7.616288057247349e-07, "loss": 0.0002, "num_tokens": 41560032.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6339343786239624, "sampling/importance_sampling_ratio/mean": 1.0000615119934082, "sampling/importance_sampling_ratio/min": 0.5260618925094604, "sampling/sampling_logp_difference/max": 0.642336368560791, "sampling/sampling_logp_difference/mean": 0.015564600005745888, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1150.0, "completions/max_terminated_length": 1150.0, "completions/mean_length": 334.15625, "completions/mean_terminated_length": 334.15625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.38727670907974243, "epoch": 1.1813725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.010432231922852555, "kl": 0.012871417216956615, "learning_rate": 7.610214688837361e-07, "loss": 0.0001, "num_tokens": 41608170.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6496334075927734, "sampling/importance_sampling_ratio/mean": 0.9999215006828308, "sampling/importance_sampling_ratio/min": 0.5490169525146484, "sampling/sampling_logp_difference/max": 0.5996259450912476, "sampling/sampling_logp_difference/mean": 0.012350855395197868, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 291.0625, "completions/mean_terminated_length": 291.0625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.3395121097564697, "epoch": 1.1825980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.010046471887092208, "kl": 0.012805217877030373, "learning_rate": 7.604136021856916e-07, "loss": 0.0001, "num_tokens": 41643262.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000325441360474, "sampling/importance_sampling_ratio/min": 0.6651583909988403, "sampling/sampling_logp_difference/max": 1.092421054840088, "sampling/sampling_logp_difference/mean": 0.013157753273844719, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 308.1875, "completions/mean_terminated_length": 308.1875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.36673638224601746, "epoch": 1.1838235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.011694888930407274, "kl": 0.015358058735728264, "learning_rate": 7.598052068645324e-07, "loss": 0.0001, "num_tokens": 41685114.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.723783016204834, "sampling/importance_sampling_ratio/mean": 1.0002949237823486, "sampling/importance_sampling_ratio/min": 0.6986619830131531, "sampling/sampling_logp_difference/max": 0.5445213317871094, "sampling/sampling_logp_difference/mean": 0.011855213902890682, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 274.9375, "completions/mean_terminated_length": 274.9375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.32736629247665405, "epoch": 1.1850490196078431, "frac_reward_zero_std": 1.0, "grad_norm": 0.02614783411194795, "kl": 0.013298116624355316, "learning_rate": 7.591962841552626e-07, "loss": 0.0001, "num_tokens": 41728134.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000203013420105, "sampling/importance_sampling_ratio/min": 0.615496039390564, "sampling/sampling_logp_difference/max": 0.9101986885070801, "sampling/sampling_logp_difference/mean": 0.012536443769931793, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 317.421875, "completions/mean_terminated_length": 317.421875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.4694092273712158, "epoch": 1.1862745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 0.5567727130277104, "kl": 0.02529379166662693, "learning_rate": 7.585868352939562e-07, "loss": 0.0141, "num_tokens": 41765329.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.435806393623352, "sampling/importance_sampling_ratio/mean": 0.99980628490448, "sampling/importance_sampling_ratio/min": 0.6393488645553589, "sampling/sampling_logp_difference/max": 0.4473050832748413, "sampling/sampling_logp_difference/mean": 0.015851978212594986, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 280.890625, "completions/mean_terminated_length": 280.890625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.556938886642456, "epoch": 1.1875, "frac_reward_zero_std": 0.75, "grad_norm": 0.7993028114638361, "kl": 0.027273159474134445, "learning_rate": 7.579768615177564e-07, "loss": -0.0316, "num_tokens": 41800106.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.403174638748169, "sampling/importance_sampling_ratio/mean": 0.999940812587738, "sampling/importance_sampling_ratio/min": 0.5746225714683533, "sampling/sampling_logp_difference/max": 0.554041862487793, "sampling/sampling_logp_difference/mean": 0.017456721514463425, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 353.9375, "completions/mean_terminated_length": 353.9375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.5507869720458984, "epoch": 1.1887254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 0.6454260168985936, "kl": 0.022801361978054047, "learning_rate": 7.57366364064871e-07, "loss": 0.0136, "num_tokens": 41842854.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6474692821502686, "sampling/importance_sampling_ratio/mean": 0.9998937249183655, "sampling/importance_sampling_ratio/min": 0.6415351629257202, "sampling/sampling_logp_difference/max": 0.4992403984069824, "sampling/sampling_logp_difference/mean": 0.015885604545474052, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 310.546875, "completions/mean_terminated_length": 310.546875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.5073447823524475, "epoch": 1.1899509803921569, "frac_reward_zero_std": 0.5, "grad_norm": 0.9003429888570723, "kl": 0.02553410269320011, "learning_rate": 7.567553441745711e-07, "loss": 0.0097, "num_tokens": 41885881.0, "reward": 0.65625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.3131308555603027, "sampling/importance_sampling_ratio/mean": 0.9997944235801697, "sampling/importance_sampling_ratio/min": 0.6446648836135864, "sampling/sampling_logp_difference/max": 0.4390246868133545, "sampling/sampling_logp_difference/mean": 0.01482811477035284, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 331.578125, "completions/mean_terminated_length": 331.578125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.5401585102081299, "epoch": 1.1911764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.014391891123807949, "kl": 0.016997363418340683, "learning_rate": 7.561438030871885e-07, "loss": 0.0002, "num_tokens": 41923822.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4456439018249512, "sampling/importance_sampling_ratio/mean": 1.0002140998840332, "sampling/importance_sampling_ratio/min": 0.612078070640564, "sampling/sampling_logp_difference/max": 0.49089550971984863, "sampling/sampling_logp_difference/mean": 0.017667587846517563, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 905.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 445.1875, "completions/mean_terminated_length": 445.1875, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "entropy": 0.46711423993110657, "epoch": 1.1924019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 0.8201985603808057, "kl": 0.02147463522851467, "learning_rate": 7.555317420441129e-07, "loss": 0.0658, "num_tokens": 41972954.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4744269847869873, "sampling/importance_sampling_ratio/mean": 0.9999150037765503, "sampling/importance_sampling_ratio/min": 0.07274787127971649, "sampling/sampling_logp_difference/max": 2.620755672454834, "sampling/sampling_logp_difference/mean": 0.012979123741388321, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 864.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 309.75, "completions/mean_terminated_length": 309.75, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.4687129259109497, "epoch": 1.1936274509803921, "frac_reward_zero_std": 1.0, "grad_norm": 0.01694974219194406, "kl": 0.02081015519797802, "learning_rate": 7.549191622877892e-07, "loss": 0.0002, "num_tokens": 42011386.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5594754219055176, "sampling/importance_sampling_ratio/mean": 0.9995567798614502, "sampling/importance_sampling_ratio/min": 0.6443440318107605, "sampling/sampling_logp_difference/max": 0.4443495273590088, "sampling/sampling_logp_difference/mean": 0.014580529183149338, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 863.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 393.203125, "completions/mean_terminated_length": 393.203125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.45880019664764404, "epoch": 1.1948529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.6207973298602183, "kl": 0.015126598998904228, "learning_rate": 7.543060650617158e-07, "loss": 0.0039, "num_tokens": 42055063.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5480831861495972, "sampling/importance_sampling_ratio/mean": 1.0002458095550537, "sampling/importance_sampling_ratio/min": 0.7013744115829468, "sampling/sampling_logp_difference/max": 0.43701744079589844, "sampling/sampling_logp_difference/mean": 0.013662204146385193, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 397.03125, "completions/mean_terminated_length": 397.03125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.5800267457962036, "epoch": 1.196078431372549, "frac_reward_zero_std": 0.5, "grad_norm": 1.2047719094277454, "kl": 0.02439621090888977, "learning_rate": 7.53692451610441e-07, "loss": 0.051, "num_tokens": 42100233.0, "reward": 0.09375, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998199343681335, "sampling/importance_sampling_ratio/min": 0.5960355997085571, "sampling/sampling_logp_difference/max": 1.0547699928283691, "sampling/sampling_logp_difference/mean": 0.016498761251568794, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 872.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 382.328125, "completions/mean_terminated_length": 382.328125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.4230691194534302, "epoch": 1.1973039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.010625685521544482, "kl": 0.014319577254354954, "learning_rate": 7.530783231795614e-07, "loss": 0.0001, "num_tokens": 42141694.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9102847576141357, "sampling/importance_sampling_ratio/mean": 1.0003812313079834, "sampling/importance_sampling_ratio/min": 0.5745927691459656, "sampling/sampling_logp_difference/max": 0.6472523212432861, "sampling/sampling_logp_difference/mean": 0.013714063912630081, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 327.375, "completions/mean_terminated_length": 327.375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.32676371932029724, "epoch": 1.1985294117647058, "frac_reward_zero_std": 0.75, "grad_norm": 1.1283063641814568, "kl": 0.013587646186351776, "learning_rate": 7.524636810157188e-07, "loss": 0.0856, "num_tokens": 42180918.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4367188215255737, "sampling/importance_sampling_ratio/mean": 0.9999919533729553, "sampling/importance_sampling_ratio/min": 0.6299502849578857, "sampling/sampling_logp_difference/max": 0.4621143341064453, "sampling/sampling_logp_difference/mean": 0.010550693608820438, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 252.34375, "completions/mean_terminated_length": 252.34375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.41724056005477905, "epoch": 1.1997549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.02414355440447347, "kl": 0.019795836880803108, "learning_rate": 7.518485263665977e-07, "loss": 0.0002, "num_tokens": 42214012.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5971448421478271, "sampling/importance_sampling_ratio/mean": 0.9999709725379944, "sampling/importance_sampling_ratio/min": 0.6622461080551147, "sampling/sampling_logp_difference/max": 0.4682176113128662, "sampling/sampling_logp_difference/mean": 0.014359518885612488, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 275.6875, "completions/mean_terminated_length": 275.6875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.5412540435791016, "epoch": 1.2009803921568627, "frac_reward_zero_std": 0.5, "grad_norm": 0.9416817875942943, "kl": 0.02357449010014534, "learning_rate": 7.512328604809232e-07, "loss": -0.0039, "num_tokens": 42246264.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3786433935165405, "sampling/importance_sampling_ratio/mean": 0.999703049659729, "sampling/importance_sampling_ratio/min": 0.5393821001052856, "sampling/sampling_logp_difference/max": 0.6173310279846191, "sampling/sampling_logp_difference/mean": 0.01637186110019684, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 348.09375, "completions/mean_terminated_length": 348.09375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.4146501421928406, "epoch": 1.2022058823529411, "frac_reward_zero_std": 0.75, "grad_norm": 0.5874428065711707, "kl": 0.015377018600702286, "learning_rate": 7.506166846084579e-07, "loss": 0.0042, "num_tokens": 42286766.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4754077196121216, "sampling/importance_sampling_ratio/mean": 1.0001602172851562, "sampling/importance_sampling_ratio/min": 0.6624588370323181, "sampling/sampling_logp_difference/max": 0.41179680824279785, "sampling/sampling_logp_difference/mean": 0.013858208432793617, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 920.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 452.359375, "completions/mean_terminated_length": 452.359375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.47576576471328735, "epoch": 1.2034313725490196, "frac_reward_zero_std": 0.5, "grad_norm": 0.7733054436107837, "kl": 0.017762858420610428, "learning_rate": 7.5e-07, "loss": 0.0613, "num_tokens": 42345349.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.554861068725586, "sampling/importance_sampling_ratio/mean": 1.0004749298095703, "sampling/importance_sampling_ratio/min": 0.5862053632736206, "sampling/sampling_logp_difference/max": 0.5340850353240967, "sampling/sampling_logp_difference/mean": 0.013276378624141216, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 267.859375, "completions/mean_terminated_length": 267.859375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.44374436140060425, "epoch": 1.204656862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.6919915954447272, "kl": 0.022599492222070694, "learning_rate": 7.493828079073801e-07, "loss": 0.0306, "num_tokens": 42375708.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3500136137008667, "sampling/importance_sampling_ratio/mean": 0.9995726346969604, "sampling/importance_sampling_ratio/min": 0.6612072587013245, "sampling/sampling_logp_difference/max": 0.41368794441223145, "sampling/sampling_logp_difference/mean": 0.015526032075285912, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 251.0625, "completions/mean_terminated_length": 251.0625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.32864657044410706, "epoch": 1.2058823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.018157011309881614, "kl": 0.016002517193555832, "learning_rate": 7.487651095834588e-07, "loss": 0.0001, "num_tokens": 42406240.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4276227951049805, "sampling/importance_sampling_ratio/mean": 1.0001676082611084, "sampling/importance_sampling_ratio/min": 0.6254533529281616, "sampling/sampling_logp_difference/max": 0.46927857398986816, "sampling/sampling_logp_difference/mean": 0.01254268642514944, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1066.0, "completions/max_terminated_length": 1066.0, "completions/mean_length": 374.765625, "completions/mean_terminated_length": 374.765625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.48466092348098755, "epoch": 1.2071078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.015468979950818128, "kl": 0.01531573012471199, "learning_rate": 7.481469062821251e-07, "loss": 0.0001, "num_tokens": 42447729.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5971453189849854, "sampling/importance_sampling_ratio/mean": 0.9998141527175903, "sampling/importance_sampling_ratio/min": 0.6636133193969727, "sampling/sampling_logp_difference/max": 0.4682178497314453, "sampling/sampling_logp_difference/mean": 0.014433901757001877, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 337.8125, "completions/mean_terminated_length": 337.8125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.5117394924163818, "epoch": 1.2083333333333333, "frac_reward_zero_std": 0.5, "grad_norm": 0.877464420042257, "kl": 0.025254417210817337, "learning_rate": 7.47528199258292e-07, "loss": 0.0391, "num_tokens": 42487861.0, "reward": 0.59375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.7877987623214722, "sampling/importance_sampling_ratio/mean": 1.0001999139785767, "sampling/importance_sampling_ratio/min": 0.3753165602684021, "sampling/sampling_logp_difference/max": 0.9799854755401611, "sampling/sampling_logp_difference/mean": 0.01565280184149742, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 223.828125, "completions/mean_terminated_length": 223.828125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.43098071217536926, "epoch": 1.2095588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 0.7214790975807119, "kl": 0.026232028380036354, "learning_rate": 7.469089897678957e-07, "loss": 0.0113, "num_tokens": 42515418.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5876084566116333, "sampling/importance_sampling_ratio/mean": 0.9997380971908569, "sampling/importance_sampling_ratio/min": 0.696341335773468, "sampling/sampling_logp_difference/max": 0.46222877502441406, "sampling/sampling_logp_difference/mean": 0.015428689308464527, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 246.515625, "completions/mean_terminated_length": 246.515625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.5275146961212158, "epoch": 1.2107843137254901, "frac_reward_zero_std": 0.5, "grad_norm": 1.10253530252883, "kl": 0.04553501307964325, "learning_rate": 7.462892790678925e-07, "loss": 0.0026, "num_tokens": 42548827.0, "reward": 0.40625, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.507978081703186, "sampling/importance_sampling_ratio/mean": 1.000449776649475, "sampling/importance_sampling_ratio/min": 0.7529246807098389, "sampling/sampling_logp_difference/max": 0.4107697010040283, "sampling/sampling_logp_difference/mean": 0.01601814478635788, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 270.875, "completions/mean_terminated_length": 270.875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.5724170804023743, "epoch": 1.2120098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 0.8706117723342843, "kl": 0.026116492226719856, "learning_rate": 7.456690684162556e-07, "loss": 0.0289, "num_tokens": 42580627.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6598587036132812, "sampling/importance_sampling_ratio/mean": 0.9999436736106873, "sampling/importance_sampling_ratio/min": 0.6124981641769409, "sampling/sampling_logp_difference/max": 0.5067324638366699, "sampling/sampling_logp_difference/mean": 0.01889456994831562, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 257.59375, "completions/mean_terminated_length": 257.59375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.5676281452178955, "epoch": 1.213235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1484682606226562, "kl": 0.03918970376253128, "learning_rate": 7.450483590719736e-07, "loss": 0.0053, "num_tokens": 42625929.0, "reward": 0.65625, "reward_std": 0.4597553312778473, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.8615480661392212, "sampling/importance_sampling_ratio/mean": 1.000133752822876, "sampling/importance_sampling_ratio/min": 0.6435976624488831, "sampling/sampling_logp_difference/max": 0.6214084625244141, "sampling/sampling_logp_difference/mean": 0.018035713583230972, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 938.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 280.265625, "completions/mean_terminated_length": 280.265625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.5869972705841064, "epoch": 1.2144607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 0.74278018229643, "kl": 0.03139817342162132, "learning_rate": 7.444271522950468e-07, "loss": 0.0451, "num_tokens": 42660618.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4296096563339233, "sampling/importance_sampling_ratio/mean": 1.0004620552062988, "sampling/importance_sampling_ratio/min": 0.5413239002227783, "sampling/sampling_logp_difference/max": 0.6137375831604004, "sampling/sampling_logp_difference/mean": 0.017468148842453957, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 238.515625, "completions/mean_terminated_length": 238.515625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.4554062485694885, "epoch": 1.215686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.71198886823943, "kl": 0.03311925381422043, "learning_rate": 7.438054493464859e-07, "loss": 0.0293, "num_tokens": 42696699.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.3547227382659912, "sampling/importance_sampling_ratio/mean": 1.0000028610229492, "sampling/importance_sampling_ratio/min": 0.6255058646202087, "sampling/sampling_logp_difference/max": 0.46919453144073486, "sampling/sampling_logp_difference/mean": 0.015584859997034073, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 268.5, "completions/mean_terminated_length": 268.5, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.43388330936431885, "epoch": 1.2169117647058822, "frac_reward_zero_std": 0.75, "grad_norm": 0.6587430840444651, "kl": 0.03411659970879555, "learning_rate": 7.431832514883081e-07, "loss": 0.0028, "num_tokens": 42730075.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.7125017642974854, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 0.7303200364112854, "sampling/sampling_logp_difference/max": 0.5379552841186523, "sampling/sampling_logp_difference/mean": 0.013606551103293896, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 253.53125, "completions/mean_terminated_length": 253.53125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.4767951965332031, "epoch": 1.218137254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.7793436264992458, "kl": 0.026810109615325928, "learning_rate": 7.42560559983536e-07, "loss": -0.0216, "num_tokens": 42764637.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.3258380889892578, "sampling/importance_sampling_ratio/mean": 0.9999908208847046, "sampling/importance_sampling_ratio/min": 0.7269160747528076, "sampling/sampling_logp_difference/max": 0.31894421577453613, "sampling/sampling_logp_difference/mean": 0.015285547822713852, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 248.90625, "completions/mean_terminated_length": 248.90625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.612114667892456, "epoch": 1.219362745098039, "frac_reward_zero_std": 0.5, "grad_norm": 0.9851099011495248, "kl": 0.04345347732305527, "learning_rate": 7.419373760961939e-07, "loss": -0.0346, "num_tokens": 42800791.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.549504280090332, "sampling/importance_sampling_ratio/mean": 0.9991915225982666, "sampling/importance_sampling_ratio/min": 0.6209067106246948, "sampling/sampling_logp_difference/max": 0.4765744209289551, "sampling/sampling_logp_difference/mean": 0.01939048245549202, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 227.984375, "completions/mean_terminated_length": 227.984375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.538272500038147, "epoch": 1.2205882352941178, "frac_reward_zero_std": 0.5, "grad_norm": 1.0826344086914892, "kl": 0.04787176474928856, "learning_rate": 7.413137010913054e-07, "loss": 0.0161, "num_tokens": 42831110.0, "reward": -0.53125, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": -0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4362585544586182, "sampling/importance_sampling_ratio/mean": 1.0000594854354858, "sampling/importance_sampling_ratio/min": 0.5252496600151062, "sampling/sampling_logp_difference/max": 0.6438815593719482, "sampling/sampling_logp_difference/mean": 0.01725119911134243, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 269.125, "completions/mean_terminated_length": 269.125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.6170411109924316, "epoch": 1.221813725490196, "frac_reward_zero_std": 0.5, "grad_norm": 0.9772204467442919, "kl": 0.0593695268034935, "learning_rate": 7.406895362348915e-07, "loss": 0.003, "num_tokens": 42870958.0, "reward": 0.6875, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.6365857124328613, "sampling/importance_sampling_ratio/mean": 0.9994696378707886, "sampling/importance_sampling_ratio/min": 0.6623708009719849, "sampling/sampling_logp_difference/max": 0.49261224269866943, "sampling/sampling_logp_difference/mean": 0.017314037308096886, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 218.09375, "completions/mean_terminated_length": 218.09375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.5279123187065125, "epoch": 1.2230392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.04229030044678826, "kl": 0.04430222883820534, "learning_rate": 7.400648827939671e-07, "loss": 0.0004, "num_tokens": 42903268.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5875672101974487, "sampling/importance_sampling_ratio/mean": 1.0002925395965576, "sampling/importance_sampling_ratio/min": 0.6275952458381653, "sampling/sampling_logp_difference/max": 0.46585988998413086, "sampling/sampling_logp_difference/mean": 0.017195124179124832, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 222.53125, "completions/mean_terminated_length": 222.53125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.6237752437591553, "epoch": 1.224264705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.902581488098436, "kl": 0.05890728533267975, "learning_rate": 7.394397420365392e-07, "loss": 0.0222, "num_tokens": 42936342.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4012773036956787, "sampling/importance_sampling_ratio/mean": 0.9991165399551392, "sampling/importance_sampling_ratio/min": 0.771501898765564, "sampling/sampling_logp_difference/max": 0.3373842239379883, "sampling/sampling_logp_difference/mean": 0.01780422404408455, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 234.34375, "completions/mean_terminated_length": 234.34375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.41644275188446045, "epoch": 1.2254901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.8938799527837136, "kl": 0.04229922592639923, "learning_rate": 7.388141152316038e-07, "loss": -0.0247, "num_tokens": 42967420.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4326969385147095, "sampling/importance_sampling_ratio/mean": 1.000288724899292, "sampling/importance_sampling_ratio/min": 0.606316089630127, "sampling/sampling_logp_difference/max": 0.5003538131713867, "sampling/sampling_logp_difference/mean": 0.014152223244309425, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 178.140625, "completions/mean_terminated_length": 178.140625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.44902151823043823, "epoch": 1.2267156862745099, "frac_reward_zero_std": 0.75, "grad_norm": 0.8509201636714476, "kl": 0.058626316487789154, "learning_rate": 7.381880036491439e-07, "loss": 0.0217, "num_tokens": 42991605.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.6088902950286865, "sampling/importance_sampling_ratio/mean": 1.0001192092895508, "sampling/importance_sampling_ratio/min": 0.603908360004425, "sampling/sampling_logp_difference/max": 0.5043327808380127, "sampling/sampling_logp_difference/mean": 0.017346547916531563, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 899.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 326.21875, "completions/mean_terminated_length": 326.21875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.5434325933456421, "epoch": 1.2279411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.9014796460226538, "kl": 0.036520786583423615, "learning_rate": 7.375614085601264e-07, "loss": -0.0036, "num_tokens": 43033059.0, "reward": 0.6875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.4756203889846802, "sampling/importance_sampling_ratio/mean": 1.0000921487808228, "sampling/importance_sampling_ratio/min": 0.6658933162689209, "sampling/sampling_logp_difference/max": 0.40662574768066406, "sampling/sampling_logp_difference/mean": 0.015643564984202385, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 205.671875, "completions/mean_terminated_length": 205.671875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.5741546154022217, "epoch": 1.2291666666666667, "frac_reward_zero_std": 0.5, "grad_norm": 1.3693491830764948, "kl": 0.07366272807121277, "learning_rate": 7.369343312364993e-07, "loss": 0.0508, "num_tokens": 43061182.0, "reward": 0.71875, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.65120267868042, "sampling/importance_sampling_ratio/mean": 1.0001182556152344, "sampling/importance_sampling_ratio/min": 0.6335078477859497, "sampling/sampling_logp_difference/max": 0.5015039443969727, "sampling/sampling_logp_difference/mean": 0.018137292936444283, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 775.0, "completions/max_terminated_length": 775.0, "completions/mean_length": 262.75, "completions/mean_terminated_length": 262.75, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.545732855796814, "epoch": 1.2303921568627452, "frac_reward_zero_std": 0.5, "grad_norm": 1.0913703388579141, "kl": 0.03661258891224861, "learning_rate": 7.363067729511901e-07, "loss": 0.0162, "num_tokens": 43098430.0, "reward": 0.4375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5770788192749023, "sampling/importance_sampling_ratio/mean": 0.999940037727356, "sampling/importance_sampling_ratio/min": 0.6885656714439392, "sampling/sampling_logp_difference/max": 0.45557427406311035, "sampling/sampling_logp_difference/mean": 0.016680218279361725, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 250.421875, "completions/mean_terminated_length": 250.421875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.5628756284713745, "epoch": 1.2316176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.1490813344094666, "kl": 0.053378716111183167, "learning_rate": 7.356787349781022e-07, "loss": 0.0336, "num_tokens": 43135017.0, "reward": 0.25, "reward_std": 0.4472135901451111, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.4074790477752686, "sampling/importance_sampling_ratio/mean": 0.9999617338180542, "sampling/importance_sampling_ratio/min": 0.6939548850059509, "sampling/sampling_logp_difference/max": 0.36534833908081055, "sampling/sampling_logp_difference/mean": 0.017072133719921112, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 245.84375, "completions/mean_terminated_length": 245.84375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.6450256109237671, "epoch": 1.232843137254902, "frac_reward_zero_std": 0.5, "grad_norm": 1.0055199880942496, "kl": 0.06880374252796173, "learning_rate": 7.350502185921131e-07, "loss": -0.006, "num_tokens": 43168943.0, "reward": -0.0625, "reward_std": 0.25, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4130218029022217, "sampling/importance_sampling_ratio/mean": 1.0004574060440063, "sampling/importance_sampling_ratio/min": 0.6483243703842163, "sampling/sampling_logp_difference/max": 0.4333641529083252, "sampling/sampling_logp_difference/mean": 0.017971958965063095, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 206.109375, "completions/mean_terminated_length": 206.109375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.4912464916706085, "epoch": 1.2340686274509804, "frac_reward_zero_std": 0.5, "grad_norm": 1.0515846727085458, "kl": 0.05553032085299492, "learning_rate": 7.344212250690711e-07, "loss": -0.0168, "num_tokens": 43196294.0, "reward": 0.3125, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6465678215026855, "sampling/importance_sampling_ratio/mean": 1.0005923509597778, "sampling/importance_sampling_ratio/min": 0.6428614258766174, "sampling/sampling_logp_difference/max": 0.49869298934936523, "sampling/sampling_logp_difference/mean": 0.01594463735818863, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 260.25, "completions/mean_terminated_length": 260.25, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.5272684097290039, "epoch": 1.2352941176470589, "frac_reward_zero_std": 0.25, "grad_norm": 1.27461236563769, "kl": 0.053458210080862045, "learning_rate": 7.337917556857934e-07, "loss": -0.0241, "num_tokens": 43233638.0, "reward": 0.6875, "reward_std": 0.5879635810852051, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.4365613460540771, "sampling/importance_sampling_ratio/mean": 1.0004180669784546, "sampling/importance_sampling_ratio/min": 0.7080623507499695, "sampling/sampling_logp_difference/max": 0.36225223541259766, "sampling/sampling_logp_difference/mean": 0.0162246935069561, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 212.328125, "completions/mean_terminated_length": 212.328125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.5921734571456909, "epoch": 1.2365196078431373, "frac_reward_zero_std": 0.5, "grad_norm": 1.0914777328754761, "kl": 0.05500746890902519, "learning_rate": 7.331618117200625e-07, "loss": -0.0001, "num_tokens": 43267035.0, "reward": 0.0625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.5416550636291504, "sampling/importance_sampling_ratio/mean": 0.999862790107727, "sampling/importance_sampling_ratio/min": 0.4705045521259308, "sampling/sampling_logp_difference/max": 0.7539496421813965, "sampling/sampling_logp_difference/mean": 0.019165601581335068, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 262.265625, "completions/mean_terminated_length": 262.265625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3239006996154785, "epoch": 1.2377450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.026389946317356174, "kl": 0.021288566291332245, "learning_rate": 7.325313944506253e-07, "loss": 0.0002, "num_tokens": 43303644.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6976981163024902, "sampling/importance_sampling_ratio/mean": 0.9998065233230591, "sampling/importance_sampling_ratio/min": 0.6864089965820312, "sampling/sampling_logp_difference/max": 0.529273271560669, "sampling/sampling_logp_difference/mean": 0.010832096450030804, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 228.96875, "completions/mean_terminated_length": 228.96875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3847115635871887, "epoch": 1.2389705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.05030023004115878, "kl": 0.034086018800735474, "learning_rate": 7.319005051571885e-07, "loss": 0.0003, "num_tokens": 43333450.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.438254952430725, "sampling/importance_sampling_ratio/mean": 0.9999770522117615, "sampling/importance_sampling_ratio/min": 0.6814442276954651, "sampling/sampling_logp_difference/max": 0.3835408687591553, "sampling/sampling_logp_difference/mean": 0.013423501513898373, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 235.65625, "completions/mean_terminated_length": 235.65625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.4898146986961365, "epoch": 1.2401960784313726, "frac_reward_zero_std": 0.5, "grad_norm": 1.1316461582616606, "kl": 0.038877662271261215, "learning_rate": 7.312691451204177e-07, "loss": 0.0111, "num_tokens": 43368820.0, "reward": 0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4314295053482056, "sampling/importance_sampling_ratio/mean": 0.9998259544372559, "sampling/importance_sampling_ratio/min": 0.6498495936393738, "sampling/sampling_logp_difference/max": 0.4310142993927002, "sampling/sampling_logp_difference/mean": 0.016645723953843117, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 244.046875, "completions/mean_terminated_length": 244.046875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3845529556274414, "epoch": 1.241421568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.7477305936826382, "kl": 0.030102748423814774, "learning_rate": 7.306373156219335e-07, "loss": 0.0215, "num_tokens": 43398983.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5837743282318115, "sampling/importance_sampling_ratio/mean": 0.9998899102210999, "sampling/importance_sampling_ratio/min": 0.6807684898376465, "sampling/sampling_logp_difference/max": 0.459810733795166, "sampling/sampling_logp_difference/mean": 0.012739075347781181, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 318.5, "completions/mean_terminated_length": 318.5, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.5055594444274902, "epoch": 1.2426470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.06849638209347797, "kl": 0.04126866161823273, "learning_rate": 7.300050179443099e-07, "loss": 0.0004, "num_tokens": 43439175.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5243487358093262, "sampling/importance_sampling_ratio/mean": 1.0000061988830566, "sampling/importance_sampling_ratio/min": 0.5530250668525696, "sampling/sampling_logp_difference/max": 0.5923519134521484, "sampling/sampling_logp_difference/mean": 0.014675150625407696, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 278.9375, "completions/mean_terminated_length": 278.9375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.6393211483955383, "epoch": 1.2438725490196079, "frac_reward_zero_std": 0.5, "grad_norm": 1.1678284789767601, "kl": 0.04999134689569473, "learning_rate": 7.293722533710714e-07, "loss": -0.0561, "num_tokens": 43491267.0, "reward": 0.3125, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.5233184099197388, "sampling/importance_sampling_ratio/mean": 1.000219464302063, "sampling/importance_sampling_ratio/min": 0.5512222051620483, "sampling/sampling_logp_difference/max": 0.5956172943115234, "sampling/sampling_logp_difference/mean": 0.019904382526874542, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 291.859375, "completions/mean_terminated_length": 291.859375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.38408541679382324, "epoch": 1.2450980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.0389595655916567, "kl": 0.026927795261144638, "learning_rate": 7.287390231866893e-07, "loss": 0.0003, "num_tokens": 43525626.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.344847559928894, "sampling/importance_sampling_ratio/mean": 0.9999645948410034, "sampling/importance_sampling_ratio/min": 0.20880623161792755, "sampling/sampling_logp_difference/max": 1.5663485527038574, "sampling/sampling_logp_difference/mean": 0.012945761904120445, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 282.0625, "completions/mean_terminated_length": 282.0625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.45294028520584106, "epoch": 1.2463235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.1258986255521095, "kl": 0.04045727849006653, "learning_rate": 7.281053286765815e-07, "loss": 0.0947, "num_tokens": 43561886.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.598439335823059, "sampling/importance_sampling_ratio/mean": 1.0005723237991333, "sampling/importance_sampling_ratio/min": 0.7298737168312073, "sampling/sampling_logp_difference/max": 0.4690277576446533, "sampling/sampling_logp_difference/mean": 0.01470007374882698, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 786.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 292.34375, "completions/mean_terminated_length": 292.34375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.43049484491348267, "epoch": 1.2475490196078431, "frac_reward_zero_std": 0.5, "grad_norm": 0.7697503514796892, "kl": 0.03150913864374161, "learning_rate": 7.274711711271073e-07, "loss": -0.0362, "num_tokens": 43597860.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.599814534187317, "sampling/importance_sampling_ratio/mean": 0.9999035000801086, "sampling/importance_sampling_ratio/min": 0.5822610855102539, "sampling/sampling_logp_difference/max": 0.5408363342285156, "sampling/sampling_logp_difference/mean": 0.01419509295374155, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 241.640625, "completions/mean_terminated_length": 241.640625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.4491606652736664, "epoch": 1.2487745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 0.7077246859255182, "kl": 0.03388433903455734, "learning_rate": 7.268365518255665e-07, "loss": 0.0037, "num_tokens": 43628957.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.5685268640518188, "sampling/importance_sampling_ratio/mean": 0.9998863935470581, "sampling/importance_sampling_ratio/min": 0.637692928314209, "sampling/sampling_logp_difference/max": 0.4501368999481201, "sampling/sampling_logp_difference/mean": 0.015396618284285069, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 300.296875, "completions/mean_terminated_length": 300.296875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.46300625801086426, "epoch": 1.25, "frac_reward_zero_std": 0.75, "grad_norm": 0.5028151339672607, "kl": 0.04746386408805847, "learning_rate": 7.262014720601958e-07, "loss": -0.0053, "num_tokens": 43675888.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4057986736297607, "sampling/importance_sampling_ratio/mean": 1.0006258487701416, "sampling/importance_sampling_ratio/min": 0.6821814179420471, "sampling/sampling_logp_difference/max": 0.3824596405029297, "sampling/sampling_logp_difference/mean": 0.014246114529669285, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 328.921875, "completions/mean_terminated_length": 328.921875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3674333095550537, "epoch": 1.2512254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 0.8170795055962474, "kl": 0.02374059148132801, "learning_rate": 7.255659331201673e-07, "loss": 0.0358, "num_tokens": 43718347.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004764795303345, "sampling/importance_sampling_ratio/min": 0.7131785154342651, "sampling/sampling_logp_difference/max": 0.737675666809082, "sampling/sampling_logp_difference/mean": 0.01193443313241005, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 276.46875, "completions/mean_terminated_length": 276.46875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.49759870767593384, "epoch": 1.2524509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 0.6480808748124887, "kl": 0.02777806669473648, "learning_rate": 7.249299362955845e-07, "loss": -0.0351, "num_tokens": 43757625.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5514986515045166, "sampling/importance_sampling_ratio/mean": 1.0005626678466797, "sampling/importance_sampling_ratio/min": 0.6886867880821228, "sampling/sampling_logp_difference/max": 0.43922126293182373, "sampling/sampling_logp_difference/mean": 0.01683877781033516, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 272.703125, "completions/mean_terminated_length": 272.703125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.5151655673980713, "epoch": 1.2536764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8112728377211624, "kl": 0.043258849531412125, "learning_rate": 7.242934828774808e-07, "loss": -0.0161, "num_tokens": 43798662.0, "reward": 0.75, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.5745548009872437, "sampling/importance_sampling_ratio/mean": 1.0000172853469849, "sampling/importance_sampling_ratio/min": 0.5428580045700073, "sampling/sampling_logp_difference/max": 0.6109075546264648, "sampling/sampling_logp_difference/mean": 0.017282802611589432, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 321.234375, "completions/mean_terminated_length": 321.234375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.5409880876541138, "epoch": 1.2549019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 0.8643615172114159, "kl": 0.026217732578516006, "learning_rate": 7.236565741578162e-07, "loss": 0.037, "num_tokens": 43839381.0, "reward": 0.28125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.3974188566207886, "sampling/importance_sampling_ratio/mean": 0.9998139142990112, "sampling/importance_sampling_ratio/min": 0.5584745407104492, "sampling/sampling_logp_difference/max": 0.5825462341308594, "sampling/sampling_logp_difference/mean": 0.01718416064977646, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1050.0, "completions/max_terminated_length": 1050.0, "completions/mean_length": 415.015625, "completions/mean_terminated_length": 415.015625, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.37025508284568787, "epoch": 1.2561274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 0.4706665196956867, "kl": 0.014621411450207233, "learning_rate": 7.230192114294753e-07, "loss": -0.0593, "num_tokens": 43883670.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002107620239258, "sampling/importance_sampling_ratio/min": 0.644412100315094, "sampling/sampling_logp_difference/max": 0.7422773838043213, "sampling/sampling_logp_difference/mean": 0.012739733792841434, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 837.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 466.296875, "completions/mean_terminated_length": 466.296875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.3236148953437805, "epoch": 1.2573529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.5481683833579675, "kl": 0.013205149210989475, "learning_rate": 7.223813959862638e-07, "loss": 0.0105, "num_tokens": 43929209.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000150203704834, "sampling/importance_sampling_ratio/min": 0.6655216813087463, "sampling/sampling_logp_difference/max": 0.8887476921081543, "sampling/sampling_logp_difference/mean": 0.010368749499320984, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 376.828125, "completions/mean_terminated_length": 376.828125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.4591961205005646, "epoch": 1.258578431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.5199853887090153, "kl": 0.019161755219101906, "learning_rate": 7.217431291229067e-07, "loss": -0.0358, "num_tokens": 43973262.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.54550302028656, "sampling/importance_sampling_ratio/mean": 1.0000665187835693, "sampling/importance_sampling_ratio/min": 0.6608745455741882, "sampling/sampling_logp_difference/max": 0.4353494644165039, "sampling/sampling_logp_difference/mean": 0.01432862039655447, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 284.4375, "completions/mean_terminated_length": 284.4375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.33941274881362915, "epoch": 1.2598039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.019513982752482544, "kl": 0.01874321699142456, "learning_rate": 7.211044121350454e-07, "loss": 0.0002, "num_tokens": 44007738.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4037752151489258, "sampling/importance_sampling_ratio/mean": 0.9996888041496277, "sampling/importance_sampling_ratio/min": 0.6324125528335571, "sampling/sampling_logp_difference/max": 0.45821332931518555, "sampling/sampling_logp_difference/mean": 0.013444009236991405, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 256.859375, "completions/mean_terminated_length": 256.859375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.39677590131759644, "epoch": 1.2610294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.025453916216948074, "kl": 0.02483324334025383, "learning_rate": 7.204652463192347e-07, "loss": 0.0002, "num_tokens": 44044753.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.528907060623169, "sampling/importance_sampling_ratio/mean": 1.000924825668335, "sampling/importance_sampling_ratio/min": 0.7132664918899536, "sampling/sampling_logp_difference/max": 0.42455315589904785, "sampling/sampling_logp_difference/mean": 0.01431209035217762, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2194.0, "completions/max_terminated_length": 2194.0, "completions/mean_length": 591.703125, "completions/mean_terminated_length": 591.703125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.48139727115631104, "epoch": 1.2622549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 0.34184996136592427, "kl": 0.03225390613079071, "learning_rate": 7.198256329729411e-07, "loss": -0.0253, "num_tokens": 44105230.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.634178638458252, "sampling/importance_sampling_ratio/mean": 0.9997146725654602, "sampling/importance_sampling_ratio/min": 0.5948020219802856, "sampling/sampling_logp_difference/max": 0.5195267200469971, "sampling/sampling_logp_difference/mean": 0.014665212482213974, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1444.0, "completions/max_terminated_length": 1444.0, "completions/mean_length": 411.140625, "completions/mean_terminated_length": 411.140625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.38757818937301636, "epoch": 1.2634803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 0.663053851785061, "kl": 0.016210366040468216, "learning_rate": 7.191855733945386e-07, "loss": 0.1091, "num_tokens": 44156855.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5474368333816528, "sampling/importance_sampling_ratio/mean": 1.0002058744430542, "sampling/importance_sampling_ratio/min": 0.0008339816122315824, "sampling/sampling_logp_difference/max": 7.089299201965332, "sampling/sampling_logp_difference/mean": 0.012645023874938488, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 277.421875, "completions/mean_terminated_length": 277.421875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.4594670534133911, "epoch": 1.2647058823529411, "frac_reward_zero_std": 0.75, "grad_norm": 0.7116226331931815, "kl": 0.03860531747341156, "learning_rate": 7.185450688833083e-07, "loss": 0.0059, "num_tokens": 44191330.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.413215160369873, "sampling/importance_sampling_ratio/mean": 0.9998937249183655, "sampling/importance_sampling_ratio/min": 0.6815028786659241, "sampling/sampling_logp_difference/max": 0.3834547996520996, "sampling/sampling_logp_difference/mean": 0.015265140682458878, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 342.0625, "completions/mean_terminated_length": 342.0625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.414716511964798, "epoch": 1.2659313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.4426784604705323, "kl": 0.01993449777364731, "learning_rate": 7.179041207394331e-07, "loss": -0.0319, "num_tokens": 44230470.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.663608193397522, "sampling/importance_sampling_ratio/mean": 1.0003607273101807, "sampling/importance_sampling_ratio/min": 0.6784675121307373, "sampling/sampling_logp_difference/max": 0.5089888572692871, "sampling/sampling_logp_difference/mean": 0.013463919050991535, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1075.0, "completions/max_terminated_length": 1075.0, "completions/mean_length": 421.515625, "completions/mean_terminated_length": 421.515625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.41261255741119385, "epoch": 1.267156862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.015985611872005114, "kl": 0.019786981865763664, "learning_rate": 7.172627302639975e-07, "loss": 0.0002, "num_tokens": 44281159.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.9414218664169312, "sampling/importance_sampling_ratio/mean": 1.000281572341919, "sampling/importance_sampling_ratio/min": 0.5127862095832825, "sampling/sampling_logp_difference/max": 0.6678962707519531, "sampling/sampling_logp_difference/mean": 0.013543871231377125, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1039.0, "completions/max_terminated_length": 1039.0, "completions/mean_length": 487.546875, "completions/mean_terminated_length": 487.546875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.4981837570667267, "epoch": 1.2683823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.01986629587585958, "kl": 0.01859002746641636, "learning_rate": 7.166208987589836e-07, "loss": 0.0002, "num_tokens": 44327770.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6154550313949585, "sampling/importance_sampling_ratio/mean": 1.0000028610229492, "sampling/importance_sampling_ratio/min": 0.6742984652519226, "sampling/sampling_logp_difference/max": 0.479616641998291, "sampling/sampling_logp_difference/mean": 0.016209883615374565, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 381.53125, "completions/mean_terminated_length": 381.53125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.28202182054519653, "epoch": 1.2696078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.016864291037928063, "kl": 0.015844296663999557, "learning_rate": 7.159786275272686e-07, "loss": 0.0002, "num_tokens": 44367244.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8718971014022827, "sampling/importance_sampling_ratio/mean": 1.0001133680343628, "sampling/importance_sampling_ratio/min": 0.6616312861442566, "sampling/sampling_logp_difference/max": 0.6269524097442627, "sampling/sampling_logp_difference/mean": 0.010684775188565254, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 409.328125, "completions/mean_terminated_length": 409.328125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.3736698031425476, "epoch": 1.2708333333333333, "frac_reward_zero_std": 0.75, "grad_norm": 0.5831128676946029, "kl": 0.022470951080322266, "learning_rate": 7.153359178726221e-07, "loss": 0.0206, "num_tokens": 44408257.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5319674015045166, "sampling/importance_sampling_ratio/mean": 1.000103235244751, "sampling/importance_sampling_ratio/min": 0.6397772431373596, "sampling/sampling_logp_difference/max": 0.44663524627685547, "sampling/sampling_logp_difference/mean": 0.013478362932801247, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 382.359375, "completions/mean_terminated_length": 382.359375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.47268274426460266, "epoch": 1.2720588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.7005761378534806, "kl": 0.023024121299386024, "learning_rate": 7.146927710997046e-07, "loss": -0.0153, "num_tokens": 44449880.0, "reward": 0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4097039699554443, "sampling/importance_sampling_ratio/mean": 0.9998856782913208, "sampling/importance_sampling_ratio/min": 0.5269438624382019, "sampling/sampling_logp_difference/max": 0.6406612396240234, "sampling/sampling_logp_difference/mean": 0.014714710414409637, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1028.0, "completions/max_terminated_length": 1028.0, "completions/mean_length": 461.296875, "completions/mean_terminated_length": 461.296875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.4088643193244934, "epoch": 1.2732843137254901, "frac_reward_zero_std": 1.0, "grad_norm": 0.019662174137492487, "kl": 0.023361802101135254, "learning_rate": 7.140491885140628e-07, "loss": 0.0002, "num_tokens": 44495355.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001777410507202, "sampling/importance_sampling_ratio/min": 0.6639827489852905, "sampling/sampling_logp_difference/max": 0.918832540512085, "sampling/sampling_logp_difference/mean": 0.013251213356852531, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 423.71875, "completions/mean_terminated_length": 423.71875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.45368313789367676, "epoch": 1.2745098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 0.6815812205406084, "kl": 0.02580954134464264, "learning_rate": 7.134051714221286e-07, "loss": 0.0274, "num_tokens": 44542457.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.478765606880188, "sampling/importance_sampling_ratio/mean": 0.9999149441719055, "sampling/importance_sampling_ratio/min": 0.62151038646698, "sampling/sampling_logp_difference/max": 0.4756026268005371, "sampling/sampling_logp_difference/mean": 0.014548969455063343, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 328.78125, "completions/mean_terminated_length": 328.78125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.2829528748989105, "epoch": 1.2757352941176472, "frac_reward_zero_std": 1.0, "grad_norm": 0.015475054269722954, "kl": 0.01725730113685131, "learning_rate": 7.127607211312162e-07, "loss": 0.0002, "num_tokens": 44577947.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3940024375915527, "sampling/importance_sampling_ratio/mean": 0.9998865127563477, "sampling/importance_sampling_ratio/min": 0.6505817770957947, "sampling/sampling_logp_difference/max": 0.4298882484436035, "sampling/sampling_logp_difference/mean": 0.009961681440472603, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1539.0, "completions/max_terminated_length": 1539.0, "completions/mean_length": 483.875, "completions/mean_terminated_length": 483.875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.32825252413749695, "epoch": 1.2769607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.010279061086063782, "kl": 0.014688804745674133, "learning_rate": 7.121158389495185e-07, "loss": 0.0001, "num_tokens": 44625107.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4326879978179932, "sampling/importance_sampling_ratio/mean": 0.9998232126235962, "sampling/importance_sampling_ratio/min": 0.7015665769577026, "sampling/sampling_logp_difference/max": 0.35955238342285156, "sampling/sampling_logp_difference/mean": 0.010487031191587448, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1734.0, "completions/max_terminated_length": 1734.0, "completions/mean_length": 495.734375, "completions/mean_terminated_length": 495.734375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.46885210275650024, "epoch": 1.278186274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.6136837602696609, "kl": 0.029879368841648102, "learning_rate": 7.114705261861061e-07, "loss": 0.0479, "num_tokens": 44681650.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5736064910888672, "sampling/importance_sampling_ratio/mean": 1.0004884004592896, "sampling/importance_sampling_ratio/min": 0.5919502377510071, "sampling/sampling_logp_difference/max": 0.5243327617645264, "sampling/sampling_logp_difference/mean": 0.015463057905435562, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 790.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 345.515625, "completions/mean_terminated_length": 345.515625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.34295305609703064, "epoch": 1.2794117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.019678589567267164, "kl": 0.028027616441249847, "learning_rate": 7.108247841509222e-07, "loss": 0.0003, "num_tokens": 44716371.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4482969045639038, "sampling/importance_sampling_ratio/mean": 0.9997510313987732, "sampling/importance_sampling_ratio/min": 0.6889633536338806, "sampling/sampling_logp_difference/max": 0.37256717681884766, "sampling/sampling_logp_difference/mean": 0.012044237926602364, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1144.0, "completions/max_terminated_length": 1144.0, "completions/mean_length": 407.90625, "completions/mean_terminated_length": 407.90625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.4426684081554413, "epoch": 1.280637254901961, "frac_reward_zero_std": 0.5, "grad_norm": 0.7997802117897751, "kl": 0.03952312469482422, "learning_rate": 7.101786141547828e-07, "loss": 0.0936, "num_tokens": 44758045.0, "reward": 0.03125, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.42106294631958, "sampling/importance_sampling_ratio/mean": 1.0000617504119873, "sampling/importance_sampling_ratio/min": 0.44958120584487915, "sampling/sampling_logp_difference/max": 0.7994388341903687, "sampling/sampling_logp_difference/mean": 0.013694992288947105, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 406.765625, "completions/mean_terminated_length": 406.765625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "entropy": 0.3840940594673157, "epoch": 1.281862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.018477963600284333, "kl": 0.020005889236927032, "learning_rate": 7.095320175093718e-07, "loss": 0.0002, "num_tokens": 44799694.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4765831232070923, "sampling/importance_sampling_ratio/mean": 1.00017249584198, "sampling/importance_sampling_ratio/min": 0.6865346431732178, "sampling/sampling_logp_difference/max": 0.38973069190979004, "sampling/sampling_logp_difference/mean": 0.012031790800392628, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 352.734375, "completions/mean_terminated_length": 352.734375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.34023141860961914, "epoch": 1.2830882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 0.5959742239295861, "kl": 0.02003106288611889, "learning_rate": 7.088849955272396e-07, "loss": -0.0126, "num_tokens": 44838061.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.605943202972412, "sampling/importance_sampling_ratio/mean": 0.9998331069946289, "sampling/importance_sampling_ratio/min": 0.6482288837432861, "sampling/sampling_logp_difference/max": 0.4737112522125244, "sampling/sampling_logp_difference/mean": 0.011149398982524872, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 312.6875, "completions/mean_terminated_length": 312.6875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.4647567570209503, "epoch": 1.284313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.8773613409720475, "kl": 0.0308322012424469, "learning_rate": 7.082375495217995e-07, "loss": -0.0187, "num_tokens": 44873865.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.4286020994186401, "sampling/importance_sampling_ratio/mean": 0.9998634457588196, "sampling/importance_sampling_ratio/min": 0.6519092321395874, "sampling/sampling_logp_difference/max": 0.4278498888015747, "sampling/sampling_logp_difference/mean": 0.015165681950747967, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 372.703125, "completions/mean_terminated_length": 372.703125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.4154563546180725, "epoch": 1.2855392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.020545635887122625, "kl": 0.020628638565540314, "learning_rate": 7.075896808073263e-07, "loss": 0.0002, "num_tokens": 44915878.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4742006063461304, "sampling/importance_sampling_ratio/mean": 1.0002074241638184, "sampling/importance_sampling_ratio/min": 0.6125919222831726, "sampling/sampling_logp_difference/max": 0.49005627632141113, "sampling/sampling_logp_difference/mean": 0.013625890016555786, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1564.0, "completions/max_terminated_length": 1564.0, "completions/mean_length": 439.3125, "completions/mean_terminated_length": 439.3125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.3815806210041046, "epoch": 1.2867647058823528, "frac_reward_zero_std": 0.75, "grad_norm": 0.5868859710198372, "kl": 0.020792817696928978, "learning_rate": 7.069413906989523e-07, "loss": -0.008, "num_tokens": 44962186.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.8618828058242798, "sampling/importance_sampling_ratio/mean": 1.000143051147461, "sampling/importance_sampling_ratio/min": 0.6179900765419006, "sampling/sampling_logp_difference/max": 0.6215882301330566, "sampling/sampling_logp_difference/mean": 0.011894311755895615, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 742.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 418.8125, "completions/mean_terminated_length": 418.8125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.45513778924942017, "epoch": 1.2879901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.4764090226614409, "kl": 0.024853510782122612, "learning_rate": 7.062926805126652e-07, "loss": 0.0448, "num_tokens": 45007198.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.5653382539749146, "sampling/importance_sampling_ratio/mean": 0.9997885227203369, "sampling/importance_sampling_ratio/min": 0.6762459874153137, "sampling/sampling_logp_difference/max": 0.4481019973754883, "sampling/sampling_logp_difference/mean": 0.015298489481210709, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 353.890625, "completions/mean_terminated_length": 353.890625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.4477284848690033, "epoch": 1.2892156862745099, "frac_reward_zero_std": 1.0, "grad_norm": 0.02140059274515903, "kl": 0.022819381207227707, "learning_rate": 7.056435515653058e-07, "loss": 0.0002, "num_tokens": 45046503.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.60073983669281, "sampling/importance_sampling_ratio/mean": 1.0002648830413818, "sampling/importance_sampling_ratio/min": 0.6314740777015686, "sampling/sampling_logp_difference/max": 0.47046589851379395, "sampling/sampling_logp_difference/mean": 0.015144643373787403, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1197.0, "completions/max_terminated_length": 1197.0, "completions/mean_length": 374.21875, "completions/mean_terminated_length": 374.21875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.5198514461517334, "epoch": 1.2904411764705883, "frac_reward_zero_std": 0.25, "grad_norm": 0.8660327498419588, "kl": 0.02972821332514286, "learning_rate": 7.049940051745646e-07, "loss": -0.0123, "num_tokens": 45086117.0, "reward": 0.46875, "reward_std": 0.6424696445465088, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.445256233215332, "sampling/importance_sampling_ratio/mean": 0.9999802708625793, "sampling/importance_sampling_ratio/min": 0.6792042255401611, "sampling/sampling_logp_difference/max": 0.38683342933654785, "sampling/sampling_logp_difference/mean": 0.015414590016007423, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 911.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 436.4375, "completions/mean_terminated_length": 436.4375, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "entropy": 0.518287718296051, "epoch": 1.2916666666666667, "frac_reward_zero_std": 0.5, "grad_norm": 0.7311081180326742, "kl": 0.029296036809682846, "learning_rate": 7.043440426589795e-07, "loss": -0.0317, "num_tokens": 45136369.0, "reward": 0.46875, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4656418561935425, "sampling/importance_sampling_ratio/mean": 0.9997543692588806, "sampling/importance_sampling_ratio/min": 0.5410807728767395, "sampling/sampling_logp_difference/max": 0.6141867637634277, "sampling/sampling_logp_difference/mean": 0.01418614387512207, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 369.96875, "completions/mean_terminated_length": 369.96875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.3943636417388916, "epoch": 1.2928921568627452, "frac_reward_zero_std": 0.5, "grad_norm": 0.8489732606346772, "kl": 0.026676783338189125, "learning_rate": 7.036936653379335e-07, "loss": 0.0094, "num_tokens": 45178639.0, "reward": 0.1875, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.418445110321045, "sampling/importance_sampling_ratio/mean": 1.0001137256622314, "sampling/importance_sampling_ratio/min": 0.709783136844635, "sampling/sampling_logp_difference/max": 0.3495612144470215, "sampling/sampling_logp_difference/mean": 0.012097825296223164, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1885.0, "completions/max_terminated_length": 1885.0, "completions/mean_length": 627.40625, "completions/mean_terminated_length": 627.40625, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "entropy": 0.5432260036468506, "epoch": 1.2941176470588236, "frac_reward_zero_std": 0.75, "grad_norm": 0.3134972910187544, "kl": 0.023896776139736176, "learning_rate": 7.030428745316512e-07, "loss": 0.0072, "num_tokens": 45241225.0, "reward": -0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": -0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.5504884719848633, "sampling/importance_sampling_ratio/mean": 1.0000076293945312, "sampling/importance_sampling_ratio/min": 0.6326375603675842, "sampling/sampling_logp_difference/max": 0.457857608795166, "sampling/sampling_logp_difference/mean": 0.014100970700383186, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1510.0, "completions/max_terminated_length": 1510.0, "completions/mean_length": 487.09375, "completions/mean_terminated_length": 487.09375, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.505561351776123, "epoch": 1.295343137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.4555759134274302, "kl": 0.023679140955209732, "learning_rate": 7.023916715611968e-07, "loss": 0.0003, "num_tokens": 45293167.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.873821496963501, "sampling/importance_sampling_ratio/mean": 0.9997483491897583, "sampling/importance_sampling_ratio/min": 0.5338325500488281, "sampling/sampling_logp_difference/max": 0.6279799938201904, "sampling/sampling_logp_difference/mean": 0.014850996434688568, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1192.0, "completions/max_terminated_length": 1192.0, "completions/mean_length": 495.75, "completions/mean_terminated_length": 495.75, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.4953930079936981, "epoch": 1.2965686274509804, "frac_reward_zero_std": 0.5, "grad_norm": 0.6996348793664555, "kl": 0.027619725093245506, "learning_rate": 7.017400577484712e-07, "loss": -0.0338, "num_tokens": 45341471.0, "reward": 0.5, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999649524688721, "sampling/importance_sampling_ratio/min": 0.3552570939064026, "sampling/sampling_logp_difference/max": 1.0349135398864746, "sampling/sampling_logp_difference/mean": 0.014706959947943687, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1212.0, "completions/max_terminated_length": 1212.0, "completions/mean_length": 364.8125, "completions/mean_terminated_length": 364.8125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.3920440971851349, "epoch": 1.2977941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.020366303214392516, "kl": 0.02276897430419922, "learning_rate": 7.010880344162086e-07, "loss": 0.0002, "num_tokens": 45383715.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002293586730957, "sampling/importance_sampling_ratio/min": 0.6298664808273315, "sampling/sampling_logp_difference/max": 0.8171581029891968, "sampling/sampling_logp_difference/mean": 0.013425824232399464, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1212.0, "completions/max_terminated_length": 1212.0, "completions/mean_length": 495.984375, "completions/mean_terminated_length": 495.984375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.4926284849643707, "epoch": 1.2990196078431373, "frac_reward_zero_std": 0.75, "grad_norm": 0.5203311724482843, "kl": 0.021131355315446854, "learning_rate": 7.004356028879758e-07, "loss": 0.0196, "num_tokens": 45435714.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5394798517227173, "sampling/importance_sampling_ratio/mean": 0.9998020529747009, "sampling/importance_sampling_ratio/min": 0.6299265623092651, "sampling/sampling_logp_difference/max": 0.46215200424194336, "sampling/sampling_logp_difference/mean": 0.015700723975896835, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 363.265625, "completions/mean_terminated_length": 363.265625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.4250093400478363, "epoch": 1.3002450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.6334354693714751, "kl": 0.028053268790245056, "learning_rate": 6.99782764488167e-07, "loss": 0.0644, "num_tokens": 45477875.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4647963047027588, "sampling/importance_sampling_ratio/mean": 1.0000923871994019, "sampling/importance_sampling_ratio/min": 0.5688039660453796, "sampling/sampling_logp_difference/max": 0.5642194747924805, "sampling/sampling_logp_difference/mean": 0.012749070301651955, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 430.625, "completions/mean_terminated_length": 430.625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.3947342038154602, "epoch": 1.3014705882352942, "frac_reward_zero_std": 0.75, "grad_norm": 0.4049068106874812, "kl": 0.018796036019921303, "learning_rate": 6.991295205420027e-07, "loss": -0.0113, "num_tokens": 45523211.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5785202980041504, "sampling/importance_sampling_ratio/mean": 0.9998267889022827, "sampling/importance_sampling_ratio/min": 0.6902693510055542, "sampling/sampling_logp_difference/max": 0.45648789405822754, "sampling/sampling_logp_difference/mean": 0.01192258670926094, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 380.15625, "completions/mean_terminated_length": 380.15625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.5000351667404175, "epoch": 1.3026960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.019511373166985917, "kl": 0.025792822241783142, "learning_rate": 6.984758723755272e-07, "loss": 0.0002, "num_tokens": 45566709.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4673956632614136, "sampling/importance_sampling_ratio/mean": 0.999936580657959, "sampling/importance_sampling_ratio/min": 0.5978190302848816, "sampling/sampling_logp_difference/max": 0.5144672393798828, "sampling/sampling_logp_difference/mean": 0.014816360548138618, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 386.375, "completions/mean_terminated_length": 386.375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.6469102501869202, "epoch": 1.303921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 0.6760122465815922, "kl": 0.03907229006290436, "learning_rate": 6.978218213156044e-07, "loss": -0.0231, "num_tokens": 45606573.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6360697746276855, "sampling/importance_sampling_ratio/mean": 0.9998902082443237, "sampling/importance_sampling_ratio/min": 0.6159787178039551, "sampling/sampling_logp_difference/max": 0.4922969341278076, "sampling/sampling_logp_difference/mean": 0.017903437837958336, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 397.21875, "completions/mean_terminated_length": 397.21875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.5503039360046387, "epoch": 1.3051470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.5091955065166269, "kl": 0.028058521449565887, "learning_rate": 6.971673686899169e-07, "loss": 0.0095, "num_tokens": 45650827.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4609055519104004, "sampling/importance_sampling_ratio/mean": 0.9999200105667114, "sampling/importance_sampling_ratio/min": 0.5469896197319031, "sampling/sampling_logp_difference/max": 0.603325366973877, "sampling/sampling_logp_difference/mean": 0.016456522047519684, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 874.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 331.71875, "completions/mean_terminated_length": 331.71875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.5090869665145874, "epoch": 1.3063725490196079, "frac_reward_zero_std": 0.75, "grad_norm": 0.5706143008475585, "kl": 0.044332459568977356, "learning_rate": 6.965125158269618e-07, "loss": -0.0144, "num_tokens": 45689913.0, "reward": -0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.3486140966415405, "sampling/importance_sampling_ratio/mean": 1.0002920627593994, "sampling/importance_sampling_ratio/min": 0.7530038356781006, "sampling/sampling_logp_difference/max": 0.29907751083374023, "sampling/sampling_logp_difference/mean": 0.015276861377060413, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1429.0, "completions/max_terminated_length": 1429.0, "completions/mean_length": 381.484375, "completions/mean_terminated_length": 381.484375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.588262140750885, "epoch": 1.3075980392156863, "frac_reward_zero_std": 0.5, "grad_norm": 0.8608969200240565, "kl": 0.03859439864754677, "learning_rate": 6.958572640560491e-07, "loss": -0.0913, "num_tokens": 45737496.0, "reward": 0.25, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.905246376991272, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 0.6159837245941162, "sampling/sampling_logp_difference/max": 0.6446113586425781, "sampling/sampling_logp_difference/mean": 0.01635122112929821, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 280.953125, "completions/mean_terminated_length": 280.953125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.4325597882270813, "epoch": 1.3088235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.028339970216968314, "kl": 0.030801817774772644, "learning_rate": 6.952016147072981e-07, "loss": 0.0003, "num_tokens": 45770773.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7100903987884521, "sampling/importance_sampling_ratio/mean": 1.0002284049987793, "sampling/importance_sampling_ratio/min": 0.651918351650238, "sampling/sampling_logp_difference/max": 0.5365462303161621, "sampling/sampling_logp_difference/mean": 0.015200365334749222, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 407.1875, "completions/mean_terminated_length": 407.1875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.5719579458236694, "epoch": 1.3100490196078431, "frac_reward_zero_std": 1.0, "grad_norm": 0.018339786938961315, "kl": 0.02917315624654293, "learning_rate": 6.945455691116358e-07, "loss": 0.0003, "num_tokens": 45815217.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6708482503890991, "sampling/importance_sampling_ratio/mean": 0.9998995065689087, "sampling/importance_sampling_ratio/min": 0.7152495980262756, "sampling/sampling_logp_difference/max": 0.513331413269043, "sampling/sampling_logp_difference/mean": 0.016346432268619537, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 359.84375, "completions/mean_terminated_length": 359.84375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.4011482894420624, "epoch": 1.3112745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.01248339702031306, "kl": 0.018986068665981293, "learning_rate": 6.938891286007928e-07, "loss": 0.0002, "num_tokens": 45862375.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3867782354354858, "sampling/importance_sampling_ratio/mean": 1.0000050067901611, "sampling/importance_sampling_ratio/min": 0.7115983366966248, "sampling/sampling_logp_difference/max": 0.3402416706085205, "sampling/sampling_logp_difference/mean": 0.011908390559256077, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1666.0, "completions/max_terminated_length": 1666.0, "completions/mean_length": 390.140625, "completions/mean_terminated_length": 390.140625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.5268430709838867, "epoch": 1.3125, "frac_reward_zero_std": 0.75, "grad_norm": 0.7099210781123385, "kl": 0.0270115714520216, "learning_rate": 6.932322945073023e-07, "loss": -0.1317, "num_tokens": 45901472.0, "reward": -0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.4352161884307861, "sampling/importance_sampling_ratio/mean": 0.9999416470527649, "sampling/importance_sampling_ratio/min": 0.6545575857162476, "sampling/sampling_logp_difference/max": 0.4237957000732422, "sampling/sampling_logp_difference/mean": 0.016040014103055, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 347.578125, "completions/mean_terminated_length": 347.578125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.41818857192993164, "epoch": 1.3137254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 0.492794359353153, "kl": 0.01967809721827507, "learning_rate": 6.925750681644953e-07, "loss": -0.0174, "num_tokens": 45939333.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5072135925292969, "sampling/importance_sampling_ratio/mean": 1.00026273727417, "sampling/importance_sampling_ratio/min": 0.6944725513458252, "sampling/sampling_logp_difference/max": 0.4102625846862793, "sampling/sampling_logp_difference/mean": 0.012953085824847221, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1321.0, "completions/max_terminated_length": 1321.0, "completions/mean_length": 443.984375, "completions/mean_terminated_length": 443.984375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.2619286775588989, "epoch": 1.3149509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.009226624145602122, "kl": 0.013219642452895641, "learning_rate": 6.919174509065003e-07, "loss": 0.0001, "num_tokens": 45994660.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6906968355178833, "sampling/importance_sampling_ratio/mean": 1.0000584125518799, "sampling/importance_sampling_ratio/min": 0.6745651960372925, "sampling/sampling_logp_difference/max": 0.5251407623291016, "sampling/sampling_logp_difference/mean": 0.008945705369114876, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 361.125, "completions/mean_terminated_length": 361.125, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.4319789707660675, "epoch": 1.3161764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.5194959688871319, "kl": 0.03888122737407684, "learning_rate": 6.91259444068238e-07, "loss": 0.0083, "num_tokens": 46035996.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4683146476745605, "sampling/importance_sampling_ratio/mean": 0.9998224973678589, "sampling/importance_sampling_ratio/min": 0.4398719072341919, "sampling/sampling_logp_difference/max": 0.8212716579437256, "sampling/sampling_logp_difference/mean": 0.013224095106124878, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1468.0, "completions/max_terminated_length": 1468.0, "completions/mean_length": 544.59375, "completions/mean_terminated_length": 544.59375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.6040948033332825, "epoch": 1.3174019607843137, "frac_reward_zero_std": 0.25, "grad_norm": 0.8408212468053594, "kl": 0.028002135455608368, "learning_rate": 6.906010489854209e-07, "loss": -0.1094, "num_tokens": 46092930.0, "reward": 0.28125, "reward_std": 0.7033873796463013, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002355575561523, "sampling/importance_sampling_ratio/min": 0.5554744005203247, "sampling/sampling_logp_difference/max": 1.001079797744751, "sampling/sampling_logp_difference/mean": 0.016128236427903175, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 836.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 445.171875, "completions/mean_terminated_length": 445.171875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.571749210357666, "epoch": 1.3186274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 0.5202885801793266, "kl": 0.024459026753902435, "learning_rate": 6.899422669945493e-07, "loss": -0.0445, "num_tokens": 46139389.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.4860079288482666, "sampling/importance_sampling_ratio/mean": 1.0001866817474365, "sampling/importance_sampling_ratio/min": 0.6298379898071289, "sampling/sampling_logp_difference/max": 0.4622926712036133, "sampling/sampling_logp_difference/mean": 0.016198385506868362, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1115.0, "completions/max_terminated_length": 1115.0, "completions/mean_length": 365.640625, "completions/mean_terminated_length": 365.640625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.45092785358428955, "epoch": 1.3198529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.01547204081752284, "kl": 0.02100953832268715, "learning_rate": 6.892830994329088e-07, "loss": 0.0002, "num_tokens": 46183174.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6035957336425781, "sampling/importance_sampling_ratio/mean": 0.9995350241661072, "sampling/importance_sampling_ratio/min": 0.6531136631965637, "sampling/sampling_logp_difference/max": 0.4722484350204468, "sampling/sampling_logp_difference/mean": 0.01405034214258194, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1513.0, "completions/max_terminated_length": 1513.0, "completions/mean_length": 469.515625, "completions/mean_terminated_length": 469.515625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.6292785406112671, "epoch": 1.321078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.499744877493297, "kl": 0.03575638681650162, "learning_rate": 6.886235476385681e-07, "loss": -0.0147, "num_tokens": 46231159.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.946412444114685, "sampling/importance_sampling_ratio/mean": 0.999834418296814, "sampling/importance_sampling_ratio/min": 0.5977939367294312, "sampling/sampling_logp_difference/max": 0.6659879684448242, "sampling/sampling_logp_difference/mean": 0.01711287908256054, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1562.0, "completions/max_terminated_length": 1562.0, "completions/mean_length": 481.453125, "completions/mean_terminated_length": 481.453125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.46010106801986694, "epoch": 1.3223039215686274, "frac_reward_zero_std": 0.5, "grad_norm": 0.6281259421632105, "kl": 0.024134144186973572, "learning_rate": 6.879636129503751e-07, "loss": 0.0505, "num_tokens": 46282260.0, "reward": 0.625, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.393554449081421, "sampling/importance_sampling_ratio/mean": 0.9999953508377075, "sampling/importance_sampling_ratio/min": 0.6337423324584961, "sampling/sampling_logp_difference/max": 0.4561128616333008, "sampling/sampling_logp_difference/mean": 0.013498177751898766, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1364.0, "completions/max_terminated_length": 1364.0, "completions/mean_length": 453.96875, "completions/mean_terminated_length": 453.96875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.43930792808532715, "epoch": 1.3235294117647058, "frac_reward_zero_std": 0.75, "grad_norm": 0.5965920919578588, "kl": 0.01687948778271675, "learning_rate": 6.87303296707956e-07, "loss": 0.0527, "num_tokens": 46333138.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.406952977180481, "sampling/importance_sampling_ratio/mean": 0.9999668002128601, "sampling/importance_sampling_ratio/min": 0.6486191749572754, "sampling/sampling_logp_difference/max": 0.4329094886779785, "sampling/sampling_logp_difference/mean": 0.012881270609796047, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 938.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 374.046875, "completions/mean_terminated_length": 374.046875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.48398557305336, "epoch": 1.3247549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 0.8132184607790353, "kl": 0.02851429209113121, "learning_rate": 6.866426002517105e-07, "loss": -0.0262, "num_tokens": 46370613.0, "reward": 0.375, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.4878132343292236, "sampling/importance_sampling_ratio/mean": 0.9999468922615051, "sampling/importance_sampling_ratio/min": 0.688579797744751, "sampling/sampling_logp_difference/max": 0.3973073959350586, "sampling/sampling_logp_difference/mean": 0.01428760215640068, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1712.0, "completions/max_terminated_length": 1712.0, "completions/mean_length": 430.078125, "completions/mean_terminated_length": 430.078125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.4958285987377167, "epoch": 1.3259803921568627, "frac_reward_zero_std": 0.5, "grad_norm": 0.7634106305901037, "kl": 0.026075240224599838, "learning_rate": 6.859815249228105e-07, "loss": -0.0953, "num_tokens": 46414794.0, "reward": 0.1875, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.646270990371704, "sampling/importance_sampling_ratio/mean": 1.0003693103790283, "sampling/importance_sampling_ratio/min": 0.697822630405426, "sampling/sampling_logp_difference/max": 0.49851274490356445, "sampling/sampling_logp_difference/mean": 0.014573863707482815, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1145.0, "completions/max_terminated_length": 1145.0, "completions/mean_length": 484.75, "completions/mean_terminated_length": 484.75, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.49378061294555664, "epoch": 1.3272058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.680673486197895, "kl": 0.021520443260669708, "learning_rate": 6.853200720631972e-07, "loss": 0.0794, "num_tokens": 46461386.0, "reward": 0.46875, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5277607440948486, "sampling/importance_sampling_ratio/mean": 1.0001574754714966, "sampling/importance_sampling_ratio/min": 0.5360126495361328, "sampling/sampling_logp_difference/max": 0.6235975027084351, "sampling/sampling_logp_difference/mean": 0.013850251212716103, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 413.203125, "completions/mean_terminated_length": 413.203125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.45087099075317383, "epoch": 1.3284313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.5471450847143523, "kl": 0.01949397847056389, "learning_rate": 6.846582430155781e-07, "loss": -0.0298, "num_tokens": 46502743.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5785545110702515, "sampling/importance_sampling_ratio/mean": 0.9999931454658508, "sampling/importance_sampling_ratio/min": 0.7031835317611694, "sampling/sampling_logp_difference/max": 0.4565095901489258, "sampling/sampling_logp_difference/mean": 0.014205368235707283, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 447.46875, "completions/mean_terminated_length": 447.46875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.5406452417373657, "epoch": 1.329656862745098, "frac_reward_zero_std": 0.5, "grad_norm": 0.7098944210226331, "kl": 0.03266742825508118, "learning_rate": 6.839960391234242e-07, "loss": -0.0019, "num_tokens": 46544357.0, "reward": 0.15625, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.6962671279907227, "sampling/importance_sampling_ratio/mean": 0.9998524785041809, "sampling/importance_sampling_ratio/min": 0.6190524101257324, "sampling/sampling_logp_difference/max": 0.5284299850463867, "sampling/sampling_logp_difference/mean": 0.01662532426416874, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1150.0, "completions/max_terminated_length": 1150.0, "completions/mean_length": 385.03125, "completions/mean_terminated_length": 385.03125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.4769316017627716, "epoch": 1.3308823529411764, "frac_reward_zero_std": 0.75, "grad_norm": 0.5437086449808335, "kl": 0.022556068375706673, "learning_rate": 6.833334617309672e-07, "loss": -0.0025, "num_tokens": 46588535.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4408091306686401, "sampling/importance_sampling_ratio/mean": 0.9999881386756897, "sampling/importance_sampling_ratio/min": 0.617272138595581, "sampling/sampling_logp_difference/max": 0.48244524002075195, "sampling/sampling_logp_difference/mean": 0.014406125992536545, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 341.953125, "completions/mean_terminated_length": 341.953125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.4753386378288269, "epoch": 1.3321078431372548, "frac_reward_zero_std": 0.75, "grad_norm": 0.6432327206665067, "kl": 0.028154617175459862, "learning_rate": 6.826705121831976e-07, "loss": 0.0526, "num_tokens": 46626644.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.433655023574829, "sampling/importance_sampling_ratio/mean": 1.0001981258392334, "sampling/importance_sampling_ratio/min": 0.6933043003082275, "sampling/sampling_logp_difference/max": 0.3662862777709961, "sampling/sampling_logp_difference/mean": 0.014713498763740063, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1067.0, "completions/max_terminated_length": 1067.0, "completions/mean_length": 529.5625, "completions/mean_terminated_length": 529.5625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.41329705715179443, "epoch": 1.3333333333333333, "frac_reward_zero_std": 0.5, "grad_norm": 0.6324522674978027, "kl": 0.01818171888589859, "learning_rate": 6.820071918258605e-07, "loss": -0.0453, "num_tokens": 46679304.0, "reward": 0.6875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997724294662476, "sampling/importance_sampling_ratio/min": 0.5430805087089539, "sampling/sampling_logp_difference/max": 0.7049317359924316, "sampling/sampling_logp_difference/mean": 0.012366356328129768, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 854.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 430.953125, "completions/mean_terminated_length": 430.953125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.32516154646873474, "epoch": 1.3345588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.016892090822267503, "kl": 0.01902255043387413, "learning_rate": 6.813435020054548e-07, "loss": 0.0002, "num_tokens": 46722293.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001013278961182, "sampling/importance_sampling_ratio/min": 0.6725914478302002, "sampling/sampling_logp_difference/max": 0.9277136325836182, "sampling/sampling_logp_difference/mean": 0.010388237424194813, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1304.0, "completions/max_terminated_length": 1304.0, "completions/mean_length": 515.71875, "completions/mean_terminated_length": 515.71875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.4929153025150299, "epoch": 1.3357843137254901, "frac_reward_zero_std": 0.75, "grad_norm": 0.4481664820926955, "kl": 0.02458004653453827, "learning_rate": 6.806794440692282e-07, "loss": 0.0046, "num_tokens": 46772547.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.8624823093414307, "sampling/importance_sampling_ratio/mean": 1.00014328956604, "sampling/importance_sampling_ratio/min": 0.37312787771224976, "sampling/sampling_logp_difference/max": 0.9858341217041016, "sampling/sampling_logp_difference/mean": 0.014151749201118946, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 321.515625, "completions/mean_terminated_length": 321.515625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.34754663705825806, "epoch": 1.3370098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.02007181043317323, "kl": 0.02686876617372036, "learning_rate": 6.800150193651767e-07, "loss": 0.0002, "num_tokens": 46809860.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5907870531082153, "sampling/importance_sampling_ratio/mean": 1.0001407861709595, "sampling/importance_sampling_ratio/min": 0.6225562691688538, "sampling/sampling_logp_difference/max": 0.4739212989807129, "sampling/sampling_logp_difference/mean": 0.011540542356669903, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1111.0, "completions/max_terminated_length": 1111.0, "completions/mean_length": 396.359375, "completions/mean_terminated_length": 396.359375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.41029417514801025, "epoch": 1.3382352941176472, "frac_reward_zero_std": 0.75, "grad_norm": 0.6092636560717889, "kl": 0.028097236528992653, "learning_rate": 6.793502292420401e-07, "loss": 0.0017, "num_tokens": 46850939.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.4487906694412231, "sampling/importance_sampling_ratio/mean": 1.0003739595413208, "sampling/importance_sampling_ratio/min": 0.4437774121761322, "sampling/sampling_logp_difference/max": 0.8124321699142456, "sampling/sampling_logp_difference/mean": 0.012465642765164375, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1166.0, "completions/max_terminated_length": 1166.0, "completions/mean_length": 473.40625, "completions/mean_terminated_length": 473.40625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.682903528213501, "epoch": 1.3394607843137254, "frac_reward_zero_std": 0.5, "grad_norm": 0.6846336892407526, "kl": 0.028618909418582916, "learning_rate": 6.786850750493005e-07, "loss": 0.0277, "num_tokens": 46899189.0, "reward": -0.15625, "reward_std": 0.375, "rewards/decision_reward_func/mean": -0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.5474967956542969, "sampling/importance_sampling_ratio/mean": 0.9999856352806091, "sampling/importance_sampling_ratio/min": 0.6009047627449036, "sampling/sampling_logp_difference/max": 0.5093188285827637, "sampling/sampling_logp_difference/mean": 0.01883115991950035, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 392.90625, "completions/mean_terminated_length": 392.90625, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.3100617527961731, "epoch": 1.340686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.01939371259888601, "kl": 0.01900457963347435, "learning_rate": 6.780195581371784e-07, "loss": 0.0002, "num_tokens": 46938367.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4583042860031128, "sampling/importance_sampling_ratio/mean": 1.0001964569091797, "sampling/importance_sampling_ratio/min": 0.6596994400024414, "sampling/sampling_logp_difference/max": 0.415971040725708, "sampling/sampling_logp_difference/mean": 0.010199109092354774, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 420.015625, "completions/mean_terminated_length": 420.015625, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.38042402267456055, "epoch": 1.3419117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.016054216224321353, "kl": 0.01903538964688778, "learning_rate": 6.773536798566313e-07, "loss": 0.0002, "num_tokens": 46982784.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000816583633423, "sampling/importance_sampling_ratio/min": 0.14959952235221863, "sampling/sampling_logp_difference/max": 1.8997933864593506, "sampling/sampling_logp_difference/mean": 0.012973608449101448, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 484.734375, "completions/mean_terminated_length": 484.734375, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "entropy": 0.3359946608543396, "epoch": 1.343137254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.015599811958423714, "kl": 0.01781240478157997, "learning_rate": 6.766874415593495e-07, "loss": 0.0002, "num_tokens": 47031135.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4863255023956299, "sampling/importance_sampling_ratio/mean": 1.0000743865966797, "sampling/importance_sampling_ratio/min": 0.6285126209259033, "sampling/sampling_logp_difference/max": 0.4643990993499756, "sampling/sampling_logp_difference/mean": 0.010504579171538353, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 407.671875, "completions/mean_terminated_length": 407.671875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.3758111298084259, "epoch": 1.344362745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.014960905055304846, "kl": 0.020082034170627594, "learning_rate": 6.760208445977549e-07, "loss": 0.0002, "num_tokens": 47071850.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4081143140792847, "sampling/importance_sampling_ratio/mean": 1.0001299381256104, "sampling/importance_sampling_ratio/min": 0.30037379264831543, "sampling/sampling_logp_difference/max": 1.2027275562286377, "sampling/sampling_logp_difference/mean": 0.012581909075379372, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2410.0, "completions/max_terminated_length": 2410.0, "completions/mean_length": 568.65625, "completions/mean_terminated_length": 568.65625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.46978792548179626, "epoch": 1.3455882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 0.6634973795644802, "kl": 0.027534209191799164, "learning_rate": 6.753538903249974e-07, "loss": 0.1395, "num_tokens": 47134372.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5756248235702515, "sampling/importance_sampling_ratio/mean": 0.9998065233230591, "sampling/importance_sampling_ratio/min": 0.5767483711242676, "sampling/sampling_logp_difference/max": 0.550349235534668, "sampling/sampling_logp_difference/mean": 0.015069235116243362, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 817.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 476.15625, "completions/mean_terminated_length": 476.15625, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.5526300072669983, "epoch": 1.346813725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.41863082464604395, "kl": 0.02468073181807995, "learning_rate": 6.74686580094951e-07, "loss": 0.0483, "num_tokens": 47182798.0, "reward": -0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": -0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.4238134622573853, "sampling/importance_sampling_ratio/mean": 1.0002875328063965, "sampling/importance_sampling_ratio/min": 0.7299416065216064, "sampling/sampling_logp_difference/max": 0.35333871841430664, "sampling/sampling_logp_difference/mean": 0.01563005894422531, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2121.0, "completions/max_terminated_length": 2121.0, "completions/mean_length": 717.375, "completions/mean_terminated_length": 717.375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.6209925413131714, "epoch": 1.3480392156862746, "frac_reward_zero_std": 0.25, "grad_norm": 0.6544199613012899, "kl": 0.024588558822870255, "learning_rate": 6.740189152622142e-07, "loss": 0.0332, "num_tokens": 47248230.0, "reward": 0.15625, "reward_std": 0.5539814233779907, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.4619221687316895, "sampling/importance_sampling_ratio/mean": 0.999993085861206, "sampling/importance_sampling_ratio/min": 0.689284086227417, "sampling/sampling_logp_difference/max": 0.37975215911865234, "sampling/sampling_logp_difference/mean": 0.015766700729727745, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 878.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 382.59375, "completions/mean_terminated_length": 382.59375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.42933958768844604, "epoch": 1.3492647058823528, "frac_reward_zero_std": 1.0, "grad_norm": 0.018429198511380272, "kl": 0.021387893706560135, "learning_rate": 6.733508971821036e-07, "loss": 0.0002, "num_tokens": 47288060.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4871734380722046, "sampling/importance_sampling_ratio/mean": 1.0002140998840332, "sampling/importance_sampling_ratio/min": 0.6477000713348389, "sampling/sampling_logp_difference/max": 0.4343276023864746, "sampling/sampling_logp_difference/mean": 0.013736543245613575, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2041.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 753.890625, "completions/mean_terminated_length": 753.890625, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "entropy": 0.44956982135772705, "epoch": 1.3504901960784315, "frac_reward_zero_std": 1.0, "grad_norm": 0.012539536335203817, "kl": 0.016654353588819504, "learning_rate": 6.726825272106538e-07, "loss": 0.0001, "num_tokens": 47354517.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7577861547470093, "sampling/importance_sampling_ratio/mean": 0.9999486207962036, "sampling/importance_sampling_ratio/min": 0.46594101190567017, "sampling/sampling_logp_difference/max": 0.7636961936950684, "sampling/sampling_logp_difference/mean": 0.012812981382012367, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2346.0, "completions/max_terminated_length": 2346.0, "completions/mean_length": 692.859375, "completions/mean_terminated_length": 692.859375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.736609697341919, "epoch": 1.3517156862745099, "frac_reward_zero_std": 0.5, "grad_norm": 0.46162325668956233, "kl": 0.05006270110607147, "learning_rate": 6.720138067046134e-07, "loss": -0.0122, "num_tokens": 47415564.0, "reward": 0.0625, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.460358738899231, "sampling/importance_sampling_ratio/mean": 0.9998582005500793, "sampling/importance_sampling_ratio/min": 0.5038226842880249, "sampling/sampling_logp_difference/max": 0.6855309009552002, "sampling/sampling_logp_difference/mean": 0.017986277118325233, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 304.375, "completions/mean_terminated_length": 304.375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.31454092264175415, "epoch": 1.3529411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.019772430049925894, "kl": 0.026443101465702057, "learning_rate": 6.713447370214431e-07, "loss": 0.0002, "num_tokens": 47449876.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5748443603515625, "sampling/importance_sampling_ratio/mean": 1.000182867050171, "sampling/importance_sampling_ratio/min": 0.6677209734916687, "sampling/sampling_logp_difference/max": 0.45415639877319336, "sampling/sampling_logp_difference/mean": 0.011747622862458229, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1040.0, "completions/max_terminated_length": 1040.0, "completions/mean_length": 428.375, "completions/mean_terminated_length": 428.375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.44761887192726135, "epoch": 1.3541666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.019154668458661855, "kl": 0.025261159986257553, "learning_rate": 6.706753195193116e-07, "loss": 0.0002, "num_tokens": 47493036.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.7848808765411377, "sampling/importance_sampling_ratio/mean": 1.00015389919281, "sampling/importance_sampling_ratio/min": 0.4853349030017853, "sampling/sampling_logp_difference/max": 0.7229161262512207, "sampling/sampling_logp_difference/mean": 0.014573746360838413, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 351.171875, "completions/mean_terminated_length": 351.171875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.39600643515586853, "epoch": 1.3553921568627452, "frac_reward_zero_std": 1.0, "grad_norm": 0.02066519457341775, "kl": 0.022760869935154915, "learning_rate": 6.700055555570941e-07, "loss": 0.0002, "num_tokens": 47532679.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4130514860153198, "sampling/importance_sampling_ratio/mean": 0.9999544620513916, "sampling/importance_sampling_ratio/min": 0.6302655339241028, "sampling/sampling_logp_difference/max": 0.46161413192749023, "sampling/sampling_logp_difference/mean": 0.012590741738677025, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 357.515625, "completions/mean_terminated_length": 357.515625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.44258081912994385, "epoch": 1.3566176470588236, "frac_reward_zero_std": 0.75, "grad_norm": 0.586366318472274, "kl": 0.02622789889574051, "learning_rate": 6.693354464943688e-07, "loss": 0.0484, "num_tokens": 47570808.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.4236316680908203, "sampling/importance_sampling_ratio/mean": 1.0003180503845215, "sampling/importance_sampling_ratio/min": 0.6077550053596497, "sampling/sampling_logp_difference/max": 0.497983455657959, "sampling/sampling_logp_difference/mean": 0.013904957100749016, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1953.0, "completions/max_terminated_length": 1953.0, "completions/mean_length": 439.65625, "completions/mean_terminated_length": 439.65625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "entropy": 0.25640082359313965, "epoch": 1.357843137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.014330874910915375, "kl": 0.01637561246752739, "learning_rate": 6.68664993691415e-07, "loss": 0.0002, "num_tokens": 47621282.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.473986268043518, "sampling/importance_sampling_ratio/mean": 1.0002580881118774, "sampling/importance_sampling_ratio/min": 0.5676478147506714, "sampling/sampling_logp_difference/max": 0.5662540197372437, "sampling/sampling_logp_difference/mean": 0.008711785078048706, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 370.734375, "completions/mean_terminated_length": 370.734375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.4096805155277252, "epoch": 1.3590686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.01676802021601461, "kl": 0.02559025026857853, "learning_rate": 6.679941985092092e-07, "loss": 0.0002, "num_tokens": 47665025.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.439803957939148, "sampling/importance_sampling_ratio/mean": 1.0000698566436768, "sampling/importance_sampling_ratio/min": 0.6162269711494446, "sampling/sampling_logp_difference/max": 0.48413991928100586, "sampling/sampling_logp_difference/mean": 0.014010273851454258, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1137.0, "completions/max_terminated_length": 1137.0, "completions/mean_length": 518.046875, "completions/mean_terminated_length": 518.046875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.5069876909255981, "epoch": 1.3602941176470589, "frac_reward_zero_std": 0.75, "grad_norm": 0.4690030820772882, "kl": 0.02302650734782219, "learning_rate": 6.673230623094231e-07, "loss": -0.011, "num_tokens": 47716900.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.464584469795227, "sampling/importance_sampling_ratio/mean": 1.0002241134643555, "sampling/importance_sampling_ratio/min": 0.6132798194885254, "sampling/sampling_logp_difference/max": 0.4889340400695801, "sampling/sampling_logp_difference/mean": 0.01491369865834713, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1297.0, "completions/max_terminated_length": 1297.0, "completions/mean_length": 558.09375, "completions/mean_terminated_length": 558.09375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.4806044101715088, "epoch": 1.3615196078431373, "frac_reward_zero_std": 0.25, "grad_norm": 0.8201750490745927, "kl": 0.024125883355736732, "learning_rate": 6.666515864544208e-07, "loss": 0.0773, "num_tokens": 47770010.0, "reward": 0.3125, "reward_std": 0.4973389506340027, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6358217000961304, "sampling/importance_sampling_ratio/mean": 0.9998418092727661, "sampling/importance_sampling_ratio/min": 0.2561973035335541, "sampling/sampling_logp_difference/max": 1.3618073463439941, "sampling/sampling_logp_difference/mean": 0.012995224446058273, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 365.375, "completions/mean_terminated_length": 365.375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.5101863145828247, "epoch": 1.3627450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.7796178389550589, "kl": 0.02946890890598297, "learning_rate": 6.659797723072558e-07, "loss": 0.0164, "num_tokens": 47813506.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.4569485187530518, "sampling/importance_sampling_ratio/mean": 1.0000574588775635, "sampling/importance_sampling_ratio/min": 0.592578113079071, "sampling/sampling_logp_difference/max": 0.5232726335525513, "sampling/sampling_logp_difference/mean": 0.01631433516740799, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1259.0, "completions/max_terminated_length": 1259.0, "completions/mean_length": 525.34375, "completions/mean_terminated_length": 525.34375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.42389827966690063, "epoch": 1.3639705882352942, "frac_reward_zero_std": 0.75, "grad_norm": 0.4513976275139492, "kl": 0.02157052606344223, "learning_rate": 6.653076212316681e-07, "loss": -0.0023, "num_tokens": 47866696.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6306228637695312, "sampling/importance_sampling_ratio/mean": 1.0002343654632568, "sampling/importance_sampling_ratio/min": 0.7040144801139832, "sampling/sampling_logp_difference/max": 0.4889620542526245, "sampling/sampling_logp_difference/mean": 0.012590251863002777, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 471.265625, "completions/mean_terminated_length": 471.265625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "entropy": 0.3183565139770508, "epoch": 1.3651960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.012891409986111643, "kl": 0.016516033560037613, "learning_rate": 6.646351345920818e-07, "loss": 0.0002, "num_tokens": 47915545.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5488605499267578, "sampling/importance_sampling_ratio/mean": 0.9996253252029419, "sampling/importance_sampling_ratio/min": 0.620280385017395, "sampling/sampling_logp_difference/max": 0.477583646774292, "sampling/sampling_logp_difference/mean": 0.010519235394895077, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1815.0, "completions/max_terminated_length": 1815.0, "completions/mean_length": 499.015625, "completions/mean_terminated_length": 499.015625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.3754943609237671, "epoch": 1.366421568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.397827360308643, "kl": 0.021563805639743805, "learning_rate": 6.639623137536022e-07, "loss": 0.043, "num_tokens": 47963210.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.446108341217041, "sampling/importance_sampling_ratio/mean": 1.0001139640808105, "sampling/importance_sampling_ratio/min": 0.6474900841712952, "sampling/sampling_logp_difference/max": 0.43465185165405273, "sampling/sampling_logp_difference/mean": 0.011442841030657291, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 854.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 386.046875, "completions/mean_terminated_length": 386.046875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.32580357789993286, "epoch": 1.3676470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.014336852665143777, "kl": 0.020645271986722946, "learning_rate": 6.63289160082013e-07, "loss": 0.0002, "num_tokens": 48003309.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.672722339630127, "sampling/importance_sampling_ratio/mean": 0.9997730255126953, "sampling/importance_sampling_ratio/min": 0.7107616066932678, "sampling/sampling_logp_difference/max": 0.5144524574279785, "sampling/sampling_logp_difference/mean": 0.010986298322677612, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1807.0, "completions/max_terminated_length": 1807.0, "completions/mean_length": 501.046875, "completions/mean_terminated_length": 501.046875, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.5223156809806824, "epoch": 1.3688725490196079, "frac_reward_zero_std": 0.75, "grad_norm": 0.4256417221899985, "kl": 0.021632639691233635, "learning_rate": 6.626156749437736e-07, "loss": -0.0308, "num_tokens": 48052976.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.9910355806350708, "sampling/importance_sampling_ratio/mean": 1.0000313520431519, "sampling/importance_sampling_ratio/min": 0.6071475744247437, "sampling/sampling_logp_difference/max": 0.688654899597168, "sampling/sampling_logp_difference/mean": 0.015145332552492619, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1095.0, "completions/max_terminated_length": 1095.0, "completions/mean_length": 416.234375, "completions/mean_terminated_length": 416.234375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.2877940833568573, "epoch": 1.3700980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.012327512824212974, "kl": 0.018439821898937225, "learning_rate": 6.619418597060159e-07, "loss": 0.0002, "num_tokens": 48097023.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9373289346694946, "sampling/importance_sampling_ratio/mean": 1.0001513957977295, "sampling/importance_sampling_ratio/min": 0.6122764348983765, "sampling/sampling_logp_difference/max": 0.6613101959228516, "sampling/sampling_logp_difference/mean": 0.009845242835581303, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1843.0, "completions/max_terminated_length": 1843.0, "completions/mean_length": 574.34375, "completions/mean_terminated_length": 574.34375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.39036303758621216, "epoch": 1.3713235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.012317102155952246, "kl": 0.017180591821670532, "learning_rate": 6.612677157365425e-07, "loss": 0.0001, "num_tokens": 48150821.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6029541492462158, "sampling/importance_sampling_ratio/mean": 0.9999475479125977, "sampling/importance_sampling_ratio/min": 0.6196553111076355, "sampling/sampling_logp_difference/max": 0.4785919189453125, "sampling/sampling_logp_difference/mean": 0.012880701571702957, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1347.0, "completions/max_terminated_length": 1347.0, "completions/mean_length": 537.4375, "completions/mean_terminated_length": 537.4375, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "entropy": 0.7331448793411255, "epoch": 1.3725490196078431, "frac_reward_zero_std": 0.25, "grad_norm": 0.7717992678520822, "kl": 0.0370030552148819, "learning_rate": 6.605932444038228e-07, "loss": -0.0271, "num_tokens": 48201425.0, "reward": 0.15625, "reward_std": 0.7129635810852051, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.4030437469482422, "sampling/importance_sampling_ratio/mean": 1.0000500679016113, "sampling/importance_sampling_ratio/min": 0.5827260613441467, "sampling/sampling_logp_difference/max": 0.5400381088256836, "sampling/sampling_logp_difference/mean": 0.01842891052365303, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1447.0, "completions/max_terminated_length": 1447.0, "completions/mean_length": 496.34375, "completions/mean_terminated_length": 496.34375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.5485855937004089, "epoch": 1.3737745098039216, "frac_reward_zero_std": 0.5, "grad_norm": 0.6766437978234073, "kl": 0.029225023463368416, "learning_rate": 6.599184470769908e-07, "loss": -0.0191, "num_tokens": 48247623.0, "reward": 0.46875, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4969453811645508, "sampling/importance_sampling_ratio/mean": 0.9996618032455444, "sampling/importance_sampling_ratio/min": 0.6283012628555298, "sampling/sampling_logp_difference/max": 0.4647355079650879, "sampling/sampling_logp_difference/mean": 0.0157216377556324, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 398.3125, "completions/mean_terminated_length": 398.3125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.3560401201248169, "epoch": 1.375, "frac_reward_zero_std": 1.0, "grad_norm": 0.020075460188142026, "kl": 0.0204083900898695, "learning_rate": 6.592433251258422e-07, "loss": 0.0002, "num_tokens": 48295307.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.693841576576233, "sampling/importance_sampling_ratio/mean": 1.0000779628753662, "sampling/importance_sampling_ratio/min": 0.6292317509651184, "sampling/sampling_logp_difference/max": 0.5269989967346191, "sampling/sampling_logp_difference/mean": 0.012316283769905567, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1485.0, "completions/max_terminated_length": 1485.0, "completions/mean_length": 400.359375, "completions/mean_terminated_length": 400.359375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.4123362898826599, "epoch": 1.3762254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.01680511094744772, "kl": 0.02156393602490425, "learning_rate": 6.58567879920832e-07, "loss": 0.0002, "num_tokens": 48338642.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5467678308486938, "sampling/importance_sampling_ratio/mean": 1.000119686126709, "sampling/importance_sampling_ratio/min": 0.6176401972770691, "sampling/sampling_logp_difference/max": 0.48184919357299805, "sampling/sampling_logp_difference/mean": 0.013239812105894089, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 428.875, "completions/mean_terminated_length": 428.875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.39540719985961914, "epoch": 1.3774509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.01747039145360083, "kl": 0.021693535149097443, "learning_rate": 6.578921128330714e-07, "loss": 0.0002, "num_tokens": 48379962.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8992633819580078, "sampling/importance_sampling_ratio/mean": 0.9998611807823181, "sampling/importance_sampling_ratio/min": 0.5483981966972351, "sampling/sampling_logp_difference/max": 0.6414661407470703, "sampling/sampling_logp_difference/mean": 0.012689922004938126, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1208.0, "completions/max_terminated_length": 1208.0, "completions/mean_length": 417.546875, "completions/mean_terminated_length": 417.546875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.4589959979057312, "epoch": 1.3786764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.5488033675358436, "kl": 0.023006446659564972, "learning_rate": 6.572160252343242e-07, "loss": 0.0304, "num_tokens": 48427453.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.400309443473816, "sampling/importance_sampling_ratio/mean": 1.0006864070892334, "sampling/importance_sampling_ratio/min": 0.606393575668335, "sampling/sampling_logp_difference/max": 0.5002260208129883, "sampling/sampling_logp_difference/mean": 0.014714536257088184, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1461.0, "completions/max_terminated_length": 1461.0, "completions/mean_length": 430.34375, "completions/mean_terminated_length": 430.34375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.5502604842185974, "epoch": 1.3799019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 0.7351383420499503, "kl": 0.027934987097978592, "learning_rate": 6.565396184970059e-07, "loss": -0.0399, "num_tokens": 48475139.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6019448041915894, "sampling/importance_sampling_ratio/mean": 1.0000488758087158, "sampling/importance_sampling_ratio/min": 0.43682554364204407, "sampling/sampling_logp_difference/max": 0.8282214403152466, "sampling/sampling_logp_difference/mean": 0.016512054949998856, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 805.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 451.390625, "completions/mean_terminated_length": 451.390625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.4656364321708679, "epoch": 1.3811274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 0.424979005730951, "kl": 0.022666607052087784, "learning_rate": 6.558628939941791e-07, "loss": -0.0358, "num_tokens": 48526428.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.8315849304199219, "sampling/importance_sampling_ratio/mean": 1.0003211498260498, "sampling/importance_sampling_ratio/min": 0.73978590965271, "sampling/sampling_logp_difference/max": 0.6051816940307617, "sampling/sampling_logp_difference/mean": 0.013239732012152672, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2312.0, "completions/max_terminated_length": 2312.0, "completions/mean_length": 671.078125, "completions/mean_terminated_length": 671.078125, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "entropy": 0.6319130659103394, "epoch": 1.3823529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.3029657867420882, "kl": 0.023727266117930412, "learning_rate": 6.551858530995517e-07, "loss": 0.0236, "num_tokens": 48589921.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5797536373138428, "sampling/importance_sampling_ratio/mean": 1.0002167224884033, "sampling/importance_sampling_ratio/min": 0.22315461933612823, "sampling/sampling_logp_difference/max": 1.4998904466629028, "sampling/sampling_logp_difference/mean": 0.015634385868906975, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3594.0, "completions/max_terminated_length": 3594.0, "completions/mean_length": 687.0625, "completions/mean_terminated_length": 687.0625, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "entropy": 0.622481107711792, "epoch": 1.383578431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.3571633105349095, "kl": 0.02702293172478676, "learning_rate": 6.545084971874736e-07, "loss": 0.0249, "num_tokens": 48654581.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.45551335811615, "sampling/importance_sampling_ratio/mean": 0.9998813271522522, "sampling/importance_sampling_ratio/min": 0.606942355632782, "sampling/sampling_logp_difference/max": 0.49932146072387695, "sampling/sampling_logp_difference/mean": 0.016178341582417488, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2347.0, "completions/max_terminated_length": 2347.0, "completions/mean_length": 552.3125, "completions/mean_terminated_length": 552.3125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.379790723323822, "epoch": 1.3848039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.013311079527846341, "kl": 0.017920376732945442, "learning_rate": 6.538308276329349e-07, "loss": 0.0001, "num_tokens": 48709961.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4061660766601562, "sampling/importance_sampling_ratio/mean": 0.9999300241470337, "sampling/importance_sampling_ratio/min": 0.6176558136940002, "sampling/sampling_logp_difference/max": 0.4818239212036133, "sampling/sampling_logp_difference/mean": 0.011792091652750969, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 259.5, "completions/mean_terminated_length": 259.5, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.5043773651123047, "epoch": 1.3860294117647058, "frac_reward_zero_std": 0.75, "grad_norm": 0.7870225415743204, "kl": 0.039771657437086105, "learning_rate": 6.531528458115614e-07, "loss": -0.0478, "num_tokens": 48742153.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.3783780336380005, "sampling/importance_sampling_ratio/mean": 0.9999634027481079, "sampling/importance_sampling_ratio/min": 0.6166958212852478, "sampling/sampling_logp_difference/max": 0.4833793640136719, "sampling/sampling_logp_difference/mean": 0.01683327741920948, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1096.0, "completions/max_terminated_length": 1096.0, "completions/mean_length": 399.53125, "completions/mean_terminated_length": 399.53125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.5050068497657776, "epoch": 1.3872549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 0.6251064665437476, "kl": 0.028162578120827675, "learning_rate": 6.524745530996136e-07, "loss": -0.0674, "num_tokens": 48786331.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.5488721132278442, "sampling/importance_sampling_ratio/mean": 0.9999977350234985, "sampling/importance_sampling_ratio/min": 0.641350269317627, "sampling/sampling_logp_difference/max": 0.4441795349121094, "sampling/sampling_logp_difference/mean": 0.013770410791039467, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1027.0, "completions/max_terminated_length": 1027.0, "completions/mean_length": 401.953125, "completions/mean_terminated_length": 401.953125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.3899313807487488, "epoch": 1.3884803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 0.5446482145209902, "kl": 0.025088950991630554, "learning_rate": 6.517959508739825e-07, "loss": -0.0149, "num_tokens": 48830872.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.4754021167755127, "sampling/importance_sampling_ratio/mean": 0.9998937249183655, "sampling/importance_sampling_ratio/min": 0.3210548460483551, "sampling/sampling_logp_difference/max": 1.1361433267593384, "sampling/sampling_logp_difference/mean": 0.01189700048416853, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1777.0, "completions/max_terminated_length": 1777.0, "completions/mean_length": 634.796875, "completions/mean_terminated_length": 634.796875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.4049384593963623, "epoch": 1.3897058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.5593749480512695, "kl": 0.02683548629283905, "learning_rate": 6.511170405121877e-07, "loss": -0.0026, "num_tokens": 48888443.0, "reward": 0.4375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5277665853500366, "sampling/importance_sampling_ratio/mean": 1.0001497268676758, "sampling/importance_sampling_ratio/min": 0.5922386050224304, "sampling/sampling_logp_difference/max": 0.5238456726074219, "sampling/sampling_logp_difference/mean": 0.013242663815617561, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 348.421875, "completions/mean_terminated_length": 348.421875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.45097535848617554, "epoch": 1.3909313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.5748129149681577, "kl": 0.035865455865859985, "learning_rate": 6.504378233923742e-07, "loss": 0.0193, "num_tokens": 48925302.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4304996728897095, "sampling/importance_sampling_ratio/mean": 1.0000404119491577, "sampling/importance_sampling_ratio/min": 0.6457683444023132, "sampling/sampling_logp_difference/max": 0.4373144507408142, "sampling/sampling_logp_difference/mean": 0.0147782564163208, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1831.0, "completions/max_terminated_length": 1831.0, "completions/mean_length": 534.359375, "completions/mean_terminated_length": 534.359375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.5715688467025757, "epoch": 1.392156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.5700701269689047, "kl": 0.024340158328413963, "learning_rate": 6.497583008933097e-07, "loss": -0.0656, "num_tokens": 48977597.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.4093266725540161, "sampling/importance_sampling_ratio/mean": 0.999865710735321, "sampling/importance_sampling_ratio/min": 0.5308951139450073, "sampling/sampling_logp_difference/max": 0.6331908702850342, "sampling/sampling_logp_difference/mean": 0.015870768576860428, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 428.59375, "completions/mean_terminated_length": 428.59375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.33831915259361267, "epoch": 1.3933823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.013309469151434538, "kl": 0.014920201152563095, "learning_rate": 6.490784743943818e-07, "loss": 0.0002, "num_tokens": 49020355.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6052453517913818, "sampling/importance_sampling_ratio/mean": 1.0001347064971924, "sampling/importance_sampling_ratio/min": 0.6183459758758545, "sampling/sampling_logp_difference/max": 0.48070716857910156, "sampling/sampling_logp_difference/mean": 0.011547118425369263, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1261.0, "completions/max_terminated_length": 1261.0, "completions/mean_length": 502.0, "completions/mean_terminated_length": 502.0, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.5159387588500977, "epoch": 1.3946078431372548, "frac_reward_zero_std": 0.75, "grad_norm": 0.4219469503800046, "kl": 0.031102575361728668, "learning_rate": 6.483983452755952e-07, "loss": 0.0057, "num_tokens": 49074531.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6007364988327026, "sampling/importance_sampling_ratio/mean": 0.9999628067016602, "sampling/importance_sampling_ratio/min": 0.3345867395401001, "sampling/sampling_logp_difference/max": 1.0948591232299805, "sampling/sampling_logp_difference/mean": 0.014032615348696709, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1623.0, "completions/max_terminated_length": 1623.0, "completions/mean_length": 600.578125, "completions/mean_terminated_length": 600.578125, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "entropy": 0.4651230275630951, "epoch": 1.3958333333333333, "frac_reward_zero_std": 0.75, "grad_norm": 0.5252324888181313, "kl": 0.019738977774977684, "learning_rate": 6.477179149175692e-07, "loss": 0.0227, "num_tokens": 49135160.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997169375419617, "sampling/importance_sampling_ratio/min": 0.5096377730369568, "sampling/sampling_logp_difference/max": 0.7727789878845215, "sampling/sampling_logp_difference/mean": 0.013515777885913849, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 802.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 490.296875, "completions/mean_terminated_length": 490.296875, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "entropy": 0.5630381107330322, "epoch": 1.3970588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.6106481608402462, "kl": 0.030729882419109344, "learning_rate": 6.470371847015341e-07, "loss": -0.0273, "num_tokens": 49188427.0, "reward": -0.5625, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": -0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.3092069625854492, "sampling/importance_sampling_ratio/mean": 0.9997998476028442, "sampling/importance_sampling_ratio/min": 0.7050881385803223, "sampling/sampling_logp_difference/max": 0.34943246841430664, "sampling/sampling_logp_difference/mean": 0.015573747456073761, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1170.0, "completions/max_terminated_length": 1170.0, "completions/mean_length": 400.625, "completions/mean_terminated_length": 400.625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.5083440542221069, "epoch": 1.3982843137254901, "frac_reward_zero_std": 1.0, "grad_norm": 0.017684377749928044, "kl": 0.02596021629869938, "learning_rate": 6.463561560093292e-07, "loss": 0.0002, "num_tokens": 49233251.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.505231499671936, "sampling/importance_sampling_ratio/mean": 1.0001683235168457, "sampling/importance_sampling_ratio/min": 0.6971338391304016, "sampling/sampling_logp_difference/max": 0.4089467525482178, "sampling/sampling_logp_difference/mean": 0.01562635600566864, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1399.0, "completions/max_terminated_length": 1399.0, "completions/mean_length": 665.59375, "completions/mean_terminated_length": 665.59375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.43741387128829956, "epoch": 1.3995098039215685, "frac_reward_zero_std": 0.5, "grad_norm": 0.5495367887024398, "kl": 0.018884576857089996, "learning_rate": 6.456748302233994e-07, "loss": -0.0141, "num_tokens": 49293337.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2884211540222168, "sampling/importance_sampling_ratio/mean": 1.000001072883606, "sampling/importance_sampling_ratio/min": 0.48468708992004395, "sampling/sampling_logp_difference/max": 0.7242517471313477, "sampling/sampling_logp_difference/mean": 0.012087279930710793, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 897.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 420.59375, "completions/mean_terminated_length": 420.59375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.5223529934883118, "epoch": 1.4007352941176472, "frac_reward_zero_std": 0.25, "grad_norm": 0.7769805801783896, "kl": 0.034994468092918396, "learning_rate": 6.449932087267931e-07, "loss": 0.0172, "num_tokens": 49336015.0, "reward": 0.40625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4330562353134155, "sampling/importance_sampling_ratio/mean": 1.0000258684158325, "sampling/importance_sampling_ratio/min": 0.4626893103122711, "sampling/sampling_logp_difference/max": 0.7706995010375977, "sampling/sampling_logp_difference/mean": 0.0153665104880929, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1052.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 397.28125, "completions/mean_terminated_length": 397.28125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.3504667580127716, "epoch": 1.4019607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.016142459525262835, "kl": 0.02030666545033455, "learning_rate": 6.443112929031586e-07, "loss": 0.0002, "num_tokens": 49376241.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4352115392684937, "sampling/importance_sampling_ratio/mean": 0.9998257756233215, "sampling/importance_sampling_ratio/min": 0.6474764943122864, "sampling/sampling_logp_difference/max": 0.43467283248901367, "sampling/sampling_logp_difference/mean": 0.011693388223648071, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1809.0, "completions/max_terminated_length": 1809.0, "completions/mean_length": 642.671875, "completions/mean_terminated_length": 642.671875, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.4946010708808899, "epoch": 1.403186274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.508278415497633, "kl": 0.02438090555369854, "learning_rate": 6.43629084136742e-07, "loss": 0.0584, "num_tokens": 49434780.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.3955433368682861, "sampling/importance_sampling_ratio/mean": 0.9999412298202515, "sampling/importance_sampling_ratio/min": 0.6135085225105286, "sampling/sampling_logp_difference/max": 0.48856115341186523, "sampling/sampling_logp_difference/mean": 0.013337365351617336, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 709.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 398.234375, "completions/mean_terminated_length": 398.234375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.5258816480636597, "epoch": 1.4044117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.020799232489530865, "kl": 0.025813067331910133, "learning_rate": 6.429465838123838e-07, "loss": 0.0003, "num_tokens": 49477563.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3287229537963867, "sampling/importance_sampling_ratio/mean": 1.000058650970459, "sampling/importance_sampling_ratio/min": 0.612504780292511, "sampling/sampling_logp_difference/max": 0.49019861221313477, "sampling/sampling_logp_difference/mean": 0.015313014388084412, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1422.0, "completions/max_terminated_length": 1422.0, "completions/mean_length": 634.03125, "completions/mean_terminated_length": 634.03125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.526667594909668, "epoch": 1.405637254901961, "frac_reward_zero_std": 0.25, "grad_norm": 0.6645579544329833, "kl": 0.025799131020903587, "learning_rate": 6.422637933155162e-07, "loss": 0.0232, "num_tokens": 49536349.0, "reward": 0.5625, "reward_std": 0.6613117456436157, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.546738624572754, "sampling/importance_sampling_ratio/mean": 1.0000035762786865, "sampling/importance_sampling_ratio/min": 0.6079587936401367, "sampling/sampling_logp_difference/max": 0.4976482391357422, "sampling/sampling_logp_difference/mean": 0.014344925060868263, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 282.390625, "completions/mean_terminated_length": 282.390625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.3495868444442749, "epoch": 1.406862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.021569346500213984, "kl": 0.024074317887425423, "learning_rate": 6.41580714032161e-07, "loss": 0.0002, "num_tokens": 49567830.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7916730642318726, "sampling/importance_sampling_ratio/mean": 0.9998537302017212, "sampling/importance_sampling_ratio/min": 0.5724741816520691, "sampling/sampling_logp_difference/max": 0.5831499099731445, "sampling/sampling_logp_difference/mean": 0.011534319259226322, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 377.390625, "completions/mean_terminated_length": 377.390625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.42255449295043945, "epoch": 1.4080882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.02225167535854728, "kl": 0.02737264335155487, "learning_rate": 6.408973473489257e-07, "loss": 0.0003, "num_tokens": 49607695.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5514975786209106, "sampling/importance_sampling_ratio/mean": 0.9999551773071289, "sampling/importance_sampling_ratio/min": 0.6059669256210327, "sampling/sampling_logp_difference/max": 0.5009298324584961, "sampling/sampling_logp_difference/mean": 0.013471214100718498, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2420.0, "completions/max_terminated_length": 2420.0, "completions/mean_length": 564.40625, "completions/mean_terminated_length": 564.40625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.5184845924377441, "epoch": 1.409313725490196, "frac_reward_zero_std": 0.25, "grad_norm": 2.8913980503038053, "kl": 0.029544293880462646, "learning_rate": 6.402136946530014e-07, "loss": 0.0005, "num_tokens": 49663881.0, "reward": 0.46875, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6058188676834106, "sampling/importance_sampling_ratio/mean": 1.000511884689331, "sampling/importance_sampling_ratio/min": 0.6406955718994141, "sampling/sampling_logp_difference/max": 0.4736337661743164, "sampling/sampling_logp_difference/mean": 0.01638016849756241, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 421.5625, "completions/mean_terminated_length": 421.5625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.5606126189231873, "epoch": 1.4105392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 0.5331187790383101, "kl": 0.03442231938242912, "learning_rate": 6.395297573321597e-07, "loss": 0.0595, "num_tokens": 49706397.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.7520582675933838, "sampling/importance_sampling_ratio/mean": 1.000045895576477, "sampling/importance_sampling_ratio/min": 0.46420907974243164, "sampling/sampling_logp_difference/max": 0.7674202919006348, "sampling/sampling_logp_difference/mean": 0.016393832862377167, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1264.0, "completions/max_terminated_length": 1264.0, "completions/mean_length": 543.828125, "completions/mean_terminated_length": 543.828125, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.5799061059951782, "epoch": 1.4117647058823528, "frac_reward_zero_std": 0.75, "grad_norm": 0.40904168173655975, "kl": 0.03061947599053383, "learning_rate": 6.388455367747502e-07, "loss": 0.038, "num_tokens": 49760018.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.6429405212402344, "sampling/importance_sampling_ratio/mean": 0.9997661113739014, "sampling/importance_sampling_ratio/min": 0.6766353249549866, "sampling/sampling_logp_difference/max": 0.4964876174926758, "sampling/sampling_logp_difference/mean": 0.015471646562218666, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1647.0, "completions/max_terminated_length": 1647.0, "completions/mean_length": 514.25, "completions/mean_terminated_length": 514.25, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.5300447344779968, "epoch": 1.4129901960784315, "frac_reward_zero_std": 0.25, "grad_norm": 0.8471492490140847, "kl": 0.036937929689884186, "learning_rate": 6.38161034369697e-07, "loss": -0.0625, "num_tokens": 49810594.0, "reward": 0.09375, "reward_std": 0.6337460875511169, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.3737398386001587, "sampling/importance_sampling_ratio/mean": 0.9999651312828064, "sampling/importance_sampling_ratio/min": 0.6801987886428833, "sampling/sampling_logp_difference/max": 0.38537025451660156, "sampling/sampling_logp_difference/mean": 0.014528707601130009, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3057.0, "completions/max_terminated_length": 3057.0, "completions/mean_length": 573.5625, "completions/mean_terminated_length": 573.5625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.5074350833892822, "epoch": 1.4142156862745099, "frac_reward_zero_std": 0.5, "grad_norm": 0.6623247497940851, "kl": 0.02196347713470459, "learning_rate": 6.37476251506497e-07, "loss": -0.038, "num_tokens": 49863334.0, "reward": 0.125, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.400469422340393, "sampling/importance_sampling_ratio/mean": 0.9999778270721436, "sampling/importance_sampling_ratio/min": 0.7052949666976929, "sampling/sampling_logp_difference/max": 0.3491392135620117, "sampling/sampling_logp_difference/mean": 0.013471854850649834, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1255.0, "completions/max_terminated_length": 1255.0, "completions/mean_length": 462.296875, "completions/mean_terminated_length": 462.296875, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "entropy": 0.6178364753723145, "epoch": 1.4154411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 0.4592154538098635, "kl": 0.03183010220527649, "learning_rate": 6.367911895752158e-07, "loss": -0.0253, "num_tokens": 49914073.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5312392711639404, "sampling/importance_sampling_ratio/mean": 0.9998778700828552, "sampling/importance_sampling_ratio/min": 0.6533763408660889, "sampling/sampling_logp_difference/max": 0.42607736587524414, "sampling/sampling_logp_difference/mean": 0.016583478078246117, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1232.0, "completions/max_terminated_length": 1232.0, "completions/mean_length": 462.640625, "completions/mean_terminated_length": 462.640625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.49430492520332336, "epoch": 1.4166666666666667, "frac_reward_zero_std": 0.5, "grad_norm": 0.8579208942341476, "kl": 0.024691104888916016, "learning_rate": 6.361058499664855e-07, "loss": -0.1089, "num_tokens": 49964114.0, "reward": 0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003738403320312, "sampling/importance_sampling_ratio/min": 0.6180609464645386, "sampling/sampling_logp_difference/max": 0.7390444278717041, "sampling/sampling_logp_difference/mean": 0.014024931937456131, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 406.515625, "completions/mean_terminated_length": 406.515625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.5750739574432373, "epoch": 1.4178921568627452, "frac_reward_zero_std": 0.75, "grad_norm": 0.6233321695888618, "kl": 0.02939312905073166, "learning_rate": 6.354202340715026e-07, "loss": 0.057, "num_tokens": 50009043.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.3976637125015259, "sampling/importance_sampling_ratio/mean": 0.9998137950897217, "sampling/importance_sampling_ratio/min": 0.687768280506134, "sampling/sampling_logp_difference/max": 0.37430334091186523, "sampling/sampling_logp_difference/mean": 0.016652654856443405, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 454.59375, "completions/mean_terminated_length": 454.59375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.4465128779411316, "epoch": 1.4191176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.015656664343381522, "kl": 0.027095917612314224, "learning_rate": 6.347343432820234e-07, "loss": 0.0002, "num_tokens": 50058489.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4353653192520142, "sampling/importance_sampling_ratio/mean": 0.9996160864830017, "sampling/importance_sampling_ratio/min": 0.6550092101097107, "sampling/sampling_logp_difference/max": 0.42310595512390137, "sampling/sampling_logp_difference/mean": 0.013850221410393715, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2163.0, "completions/max_terminated_length": 2163.0, "completions/mean_length": 426.015625, "completions/mean_terminated_length": 426.015625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.4408278167247772, "epoch": 1.420343137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.6232921044931458, "kl": 0.02482795901596546, "learning_rate": 6.340481789903634e-07, "loss": 0.0454, "num_tokens": 50109482.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.7170393466949463, "sampling/importance_sampling_ratio/mean": 1.0000063180923462, "sampling/importance_sampling_ratio/min": 0.518310010433197, "sampling/sampling_logp_difference/max": 0.6571817398071289, "sampling/sampling_logp_difference/mean": 0.014278700575232506, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 425.390625, "completions/mean_terminated_length": 425.390625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.5410072803497314, "epoch": 1.4215686274509804, "frac_reward_zero_std": 0.5, "grad_norm": 0.7619135256781644, "kl": 0.030522331595420837, "learning_rate": 6.333617425893919e-07, "loss": 0.0499, "num_tokens": 50152707.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.3898175954818726, "sampling/importance_sampling_ratio/mean": 0.9996793866157532, "sampling/importance_sampling_ratio/min": 0.7105375528335571, "sampling/sampling_logp_difference/max": 0.341733455657959, "sampling/sampling_logp_difference/mean": 0.01517491601407528, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1066.0, "completions/max_terminated_length": 1066.0, "completions/mean_length": 411.34375, "completions/mean_terminated_length": 411.34375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.4682633578777313, "epoch": 1.4227941176470589, "frac_reward_zero_std": 0.75, "grad_norm": 0.49108939590925565, "kl": 0.02891628071665764, "learning_rate": 6.326750354725319e-07, "loss": -0.006, "num_tokens": 50196793.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997924566268921, "sampling/importance_sampling_ratio/min": 0.5964679718017578, "sampling/sampling_logp_difference/max": 0.7272229194641113, "sampling/sampling_logp_difference/mean": 0.014163859188556671, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1473.0, "completions/max_terminated_length": 1473.0, "completions/mean_length": 481.21875, "completions/mean_terminated_length": 481.21875, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "entropy": 0.5201012492179871, "epoch": 1.4240196078431373, "frac_reward_zero_std": 0.75, "grad_norm": 0.3977336448899917, "kl": 0.030357178300619125, "learning_rate": 6.319880590337548e-07, "loss": 0.0154, "num_tokens": 50246343.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6262215375900269, "sampling/importance_sampling_ratio/mean": 0.9999926090240479, "sampling/importance_sampling_ratio/min": 0.7197120785713196, "sampling/sampling_logp_difference/max": 0.48625922203063965, "sampling/sampling_logp_difference/mean": 0.014770915731787682, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 337.984375, "completions/mean_terminated_length": 337.984375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.47754186391830444, "epoch": 1.4252450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.599852671765985, "kl": 0.03373897448182106, "learning_rate": 6.313008146675799e-07, "loss": 0.035, "num_tokens": 50287206.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.3253237009048462, "sampling/importance_sampling_ratio/mean": 0.9996599555015564, "sampling/importance_sampling_ratio/min": 0.48981526494026184, "sampling/sampling_logp_difference/max": 0.7137269973754883, "sampling/sampling_logp_difference/mean": 0.015037433244287968, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2195.0, "completions/max_terminated_length": 2195.0, "completions/mean_length": 615.5625, "completions/mean_terminated_length": 615.5625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "entropy": 0.37451493740081787, "epoch": 1.4264705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.013547640188772098, "kl": 0.018160028383135796, "learning_rate": 6.306133037690692e-07, "loss": 0.0002, "num_tokens": 50344682.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.612839698791504, "sampling/importance_sampling_ratio/mean": 1.0000910758972168, "sampling/importance_sampling_ratio/min": 0.68998783826828, "sampling/sampling_logp_difference/max": 0.4779963493347168, "sampling/sampling_logp_difference/mean": 0.010432192124426365, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 716.0, "completions/max_terminated_length": 716.0, "completions/mean_length": 378.625, "completions/mean_terminated_length": 378.625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.3703806400299072, "epoch": 1.4276960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.01924986626536427, "kl": 0.024987293407320976, "learning_rate": 6.299255277338264e-07, "loss": 0.0002, "num_tokens": 50387922.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5071685314178467, "sampling/importance_sampling_ratio/mean": 1.000119686126709, "sampling/importance_sampling_ratio/min": 0.3723011016845703, "sampling/sampling_logp_difference/max": 0.9880523681640625, "sampling/sampling_logp_difference/mean": 0.01244286447763443, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1058.0, "completions/max_terminated_length": 1058.0, "completions/mean_length": 406.140625, "completions/mean_terminated_length": 406.140625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.53037029504776, "epoch": 1.428921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.6188491409344916, "kl": 0.030412282794713974, "learning_rate": 6.292374879579934e-07, "loss": 0.0206, "num_tokens": 50428123.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.412147879600525, "sampling/importance_sampling_ratio/mean": 0.999767541885376, "sampling/importance_sampling_ratio/min": 0.6416726112365723, "sampling/sampling_logp_difference/max": 0.44367706775665283, "sampling/sampling_logp_difference/mean": 0.015187774784862995, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 415.953125, "completions/mean_terminated_length": 415.953125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.5310487747192383, "epoch": 1.4301470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.5155737601698172, "kl": 0.03211888670921326, "learning_rate": 6.285491858382473e-07, "loss": -0.009, "num_tokens": 50474280.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.52013099193573, "sampling/importance_sampling_ratio/mean": 1.0000507831573486, "sampling/importance_sampling_ratio/min": 0.6245322823524475, "sampling/sampling_logp_difference/max": 0.4707522392272949, "sampling/sampling_logp_difference/mean": 0.015505343675613403, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1567.0, "completions/max_terminated_length": 1567.0, "completions/mean_length": 532.0, "completions/mean_terminated_length": 532.0, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "entropy": 0.551400899887085, "epoch": 1.4313725490196079, "frac_reward_zero_std": 0.5, "grad_norm": 0.7361870634827177, "kl": 0.033572301268577576, "learning_rate": 6.278606227717978e-07, "loss": 0.0702, "num_tokens": 50530360.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.4273009300231934, "sampling/importance_sampling_ratio/mean": 0.9998166561126709, "sampling/importance_sampling_ratio/min": 0.4982428550720215, "sampling/sampling_logp_difference/max": 0.6966676712036133, "sampling/sampling_logp_difference/mean": 0.014876754023134708, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 381.546875, "completions/mean_terminated_length": 381.546875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.6293909549713135, "epoch": 1.4325980392156863, "frac_reward_zero_std": 0.25, "grad_norm": 0.9945327149428661, "kl": 0.040086984634399414, "learning_rate": 6.271718001563843e-07, "loss": -0.0102, "num_tokens": 50571115.0, "reward": 0.0, "reward_std": 0.4973389506340027, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4226717948913574, "sampling/importance_sampling_ratio/mean": 0.9999020099639893, "sampling/importance_sampling_ratio/min": 0.7006127834320068, "sampling/sampling_logp_difference/max": 0.35579991340637207, "sampling/sampling_logp_difference/mean": 0.016847405582666397, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1398.0, "completions/max_terminated_length": 1398.0, "completions/mean_length": 564.796875, "completions/mean_terminated_length": 564.796875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.5432097911834717, "epoch": 1.4338235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.5523515203795141, "kl": 0.032453373074531555, "learning_rate": 6.264827193902731e-07, "loss": 0.0552, "num_tokens": 50629358.0, "reward": 0.25, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.6709322929382324, "sampling/importance_sampling_ratio/mean": 1.0000666379928589, "sampling/importance_sampling_ratio/min": 0.4904073476791382, "sampling/sampling_logp_difference/max": 0.7125189304351807, "sampling/sampling_logp_difference/mean": 0.015342243015766144, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 885.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 460.03125, "completions/mean_terminated_length": 460.03125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.6049646139144897, "epoch": 1.4350490196078431, "frac_reward_zero_std": 0.5, "grad_norm": 0.7042981958064115, "kl": 0.03611728549003601, "learning_rate": 6.257933818722542e-07, "loss": 0.052, "num_tokens": 50679056.0, "reward": 0.3125, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6089513301849365, "sampling/importance_sampling_ratio/mean": 1.0002251863479614, "sampling/importance_sampling_ratio/min": 0.6998952031135559, "sampling/sampling_logp_difference/max": 0.4755825996398926, "sampling/sampling_logp_difference/mean": 0.01717609539628029, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1789.0, "completions/max_terminated_length": 1789.0, "completions/mean_length": 614.4375, "completions/mean_terminated_length": 614.4375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.5584208369255066, "epoch": 1.4362745098039216, "frac_reward_zero_std": 0.5, "grad_norm": 0.6202391487740209, "kl": 0.023074641823768616, "learning_rate": 6.251037890016395e-07, "loss": 0.1076, "num_tokens": 50733788.0, "reward": 0.46875, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4413622617721558, "sampling/importance_sampling_ratio/mean": 0.9995719194412231, "sampling/importance_sampling_ratio/min": 0.5622560381889343, "sampling/sampling_logp_difference/max": 0.5757979154586792, "sampling/sampling_logp_difference/mean": 0.014859793707728386, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2040.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 450.578125, "completions/mean_terminated_length": 450.578125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.5694482326507568, "epoch": 1.4375, "frac_reward_zero_std": 0.5, "grad_norm": 0.659417564050271, "kl": 0.038201332092285156, "learning_rate": 6.244139421782587e-07, "loss": -0.0136, "num_tokens": 50776369.0, "reward": 0.125, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.546984314918518, "sampling/importance_sampling_ratio/mean": 0.9998511075973511, "sampling/importance_sampling_ratio/min": 0.6135115623474121, "sampling/sampling_logp_difference/max": 0.4885561466217041, "sampling/sampling_logp_difference/mean": 0.014758330769836903, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1045.0, "completions/max_terminated_length": 1045.0, "completions/mean_length": 519.40625, "completions/mean_terminated_length": 519.40625, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "entropy": 0.4003647267818451, "epoch": 1.4387254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 0.4576702757689522, "kl": 0.016652289777994156, "learning_rate": 6.237238428024571e-07, "loss": -0.0202, "num_tokens": 50828875.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4401297569274902, "sampling/importance_sampling_ratio/mean": 1.000327706336975, "sampling/importance_sampling_ratio/min": 0.6556806564331055, "sampling/sampling_logp_difference/max": 0.4220813512802124, "sampling/sampling_logp_difference/mean": 0.011952435597777367, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 392.453125, "completions/mean_terminated_length": 392.453125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.48704445362091064, "epoch": 1.4399509803921569, "frac_reward_zero_std": 0.5, "grad_norm": 0.8448350658076738, "kl": 0.030235296115279198, "learning_rate": 6.230334922750929e-07, "loss": 0.0073, "num_tokens": 50868136.0, "reward": 0.6875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.327230453491211, "sampling/importance_sampling_ratio/mean": 1.0003479719161987, "sampling/importance_sampling_ratio/min": 0.7363001108169556, "sampling/sampling_logp_difference/max": 0.30611753463745117, "sampling/sampling_logp_difference/mean": 0.013905182480812073, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 407.125, "completions/mean_terminated_length": 407.125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.429293692111969, "epoch": 1.4411764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.4157248810517595, "kl": 0.02356378361582756, "learning_rate": 6.223428919975338e-07, "loss": -0.0122, "num_tokens": 50913088.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.466173768043518, "sampling/importance_sampling_ratio/mean": 0.9998127222061157, "sampling/importance_sampling_ratio/min": 0.6693886518478394, "sampling/sampling_logp_difference/max": 0.4013904333114624, "sampling/sampling_logp_difference/mean": 0.012548428028821945, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 381.046875, "completions/mean_terminated_length": 381.046875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.6513147354125977, "epoch": 1.4424019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.022427025690847844, "kl": 0.04392850771546364, "learning_rate": 6.216520433716544e-07, "loss": 0.0004, "num_tokens": 50955187.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5250000953674316, "sampling/importance_sampling_ratio/mean": 1.000330924987793, "sampling/importance_sampling_ratio/min": 0.703342854976654, "sampling/sampling_logp_difference/max": 0.4219944477081299, "sampling/sampling_logp_difference/mean": 0.01756935566663742, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1403.0, "completions/max_terminated_length": 1403.0, "completions/mean_length": 504.484375, "completions/mean_terminated_length": 504.484375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.45238038897514343, "epoch": 1.4436274509803921, "frac_reward_zero_std": 0.5, "grad_norm": 0.6530129949226561, "kl": 0.0205577090382576, "learning_rate": 6.209609477998338e-07, "loss": 0.0679, "num_tokens": 51008322.0, "reward": 0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.8521831035614014, "sampling/importance_sampling_ratio/mean": 1.00010347366333, "sampling/importance_sampling_ratio/min": 0.6124925017356873, "sampling/sampling_logp_difference/max": 0.6163649559020996, "sampling/sampling_logp_difference/mean": 0.01342932227998972, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1039.0, "completions/max_terminated_length": 1039.0, "completions/mean_length": 379.1875, "completions/mean_terminated_length": 379.1875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.5409349799156189, "epoch": 1.4448529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.618845045321985, "kl": 0.04629231616854668, "learning_rate": 6.202696066849524e-07, "loss": 0.0128, "num_tokens": 51047086.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.291703462600708, "sampling/importance_sampling_ratio/mean": 0.9998644590377808, "sampling/importance_sampling_ratio/min": 0.6285853981971741, "sampling/sampling_logp_difference/max": 0.4642833471298218, "sampling/sampling_logp_difference/mean": 0.015438507311046124, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 480.171875, "completions/mean_terminated_length": 480.171875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.4399579167366028, "epoch": 1.446078431372549, "frac_reward_zero_std": 0.5, "grad_norm": 0.5457415061730835, "kl": 0.029252566397190094, "learning_rate": 6.195780214303887e-07, "loss": 0.0114, "num_tokens": 51100697.0, "reward": 0.78125, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5205997228622437, "sampling/importance_sampling_ratio/mean": 1.000070333480835, "sampling/importance_sampling_ratio/min": 0.6546991467475891, "sampling/sampling_logp_difference/max": 0.42357945442199707, "sampling/sampling_logp_difference/mean": 0.01269370224326849, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1325.0, "completions/max_terminated_length": 1325.0, "completions/mean_length": 484.609375, "completions/mean_terminated_length": 484.609375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.46973681449890137, "epoch": 1.4473039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.014901044623184506, "kl": 0.025085825473070145, "learning_rate": 6.188861934400171e-07, "loss": 0.0002, "num_tokens": 51155200.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002539157867432, "sampling/importance_sampling_ratio/min": 0.6461853981018066, "sampling/sampling_logp_difference/max": 0.9306650161743164, "sampling/sampling_logp_difference/mean": 0.01415589451789856, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 475.515625, "completions/mean_terminated_length": 475.515625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.5185382962226868, "epoch": 1.4485294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.6869048395275043, "kl": 0.024425484240055084, "learning_rate": 6.181941241182043e-07, "loss": -0.0563, "num_tokens": 51208385.0, "reward": 0.625, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.5536580085754395, "sampling/importance_sampling_ratio/mean": 1.0001342296600342, "sampling/importance_sampling_ratio/min": 0.6025477051734924, "sampling/sampling_logp_difference/max": 0.5065884590148926, "sampling/sampling_logp_difference/mean": 0.014569227583706379, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 372.296875, "completions/mean_terminated_length": 372.296875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.5652612447738647, "epoch": 1.4497549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 0.8512878547892672, "kl": 0.042586423456668854, "learning_rate": 6.175018148698076e-07, "loss": -0.002, "num_tokens": 51250644.0, "reward": -0.1875, "reward_std": 0.4787135720252991, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.3517013788223267, "sampling/importance_sampling_ratio/mean": 0.9998493790626526, "sampling/importance_sampling_ratio/min": 0.6984497904777527, "sampling/sampling_logp_difference/max": 0.35889196395874023, "sampling/sampling_logp_difference/mean": 0.015965577214956284, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1236.0, "completions/max_terminated_length": 1236.0, "completions/mean_length": 421.390625, "completions/mean_terminated_length": 421.390625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "entropy": 0.656187891960144, "epoch": 1.4509803921568627, "frac_reward_zero_std": 0.25, "grad_norm": 0.9414837948277957, "kl": 0.046463992446660995, "learning_rate": 6.168092671001705e-07, "loss": -0.0181, "num_tokens": 51298237.0, "reward": 0.4375, "reward_std": 0.6831300258636475, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.822308897972107, "sampling/importance_sampling_ratio/mean": 1.000313639640808, "sampling/importance_sampling_ratio/min": 0.6482823491096497, "sampling/sampling_logp_difference/max": 0.6001043319702148, "sampling/sampling_logp_difference/mean": 0.018264394253492355, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 394.21875, "completions/mean_terminated_length": 394.21875, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.4961603879928589, "epoch": 1.4522058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.01806727528330865, "kl": 0.021866336464881897, "learning_rate": 6.161164822151213e-07, "loss": 0.0002, "num_tokens": 51342011.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6598310470581055, "sampling/importance_sampling_ratio/mean": 0.999944806098938, "sampling/importance_sampling_ratio/min": 0.5513415336608887, "sampling/sampling_logp_difference/max": 0.5954008102416992, "sampling/sampling_logp_difference/mean": 0.01506449282169342, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 444.125, "completions/mean_terminated_length": 444.125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.40920332074165344, "epoch": 1.4534313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.5065296810083237, "kl": 0.024736281484365463, "learning_rate": 6.154234616209692e-07, "loss": -0.0047, "num_tokens": 51390003.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.6311161518096924, "sampling/importance_sampling_ratio/mean": 0.9999580383300781, "sampling/importance_sampling_ratio/min": 0.6596052646636963, "sampling/sampling_logp_difference/max": 0.4892646074295044, "sampling/sampling_logp_difference/mean": 0.011733708903193474, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1618.0, "completions/max_terminated_length": 1618.0, "completions/mean_length": 612.90625, "completions/mean_terminated_length": 612.90625, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "entropy": 0.5395448207855225, "epoch": 1.454656862745098, "frac_reward_zero_std": 0.25, "grad_norm": 0.7514134731327464, "kl": 0.024566709995269775, "learning_rate": 6.147302067245028e-07, "loss": 0.0409, "num_tokens": 51445501.0, "reward": 0.65625, "reward_std": 0.5539814233779907, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999076724052429, "sampling/importance_sampling_ratio/min": 0.5476773381233215, "sampling/sampling_logp_difference/max": 0.7688384056091309, "sampling/sampling_logp_difference/mean": 0.014941539615392685, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 454.046875, "completions/mean_terminated_length": 454.046875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.5687962770462036, "epoch": 1.4558823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.655232732858212, "kl": 0.024690965190529823, "learning_rate": 6.140367189329847e-07, "loss": 0.0161, "num_tokens": 51492288.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.4823106527328491, "sampling/importance_sampling_ratio/mean": 1.000169038772583, "sampling/importance_sampling_ratio/min": 0.6463026404380798, "sampling/sampling_logp_difference/max": 0.43648743629455566, "sampling/sampling_logp_difference/mean": 0.01610930636525154, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1495.0, "completions/max_terminated_length": 1495.0, "completions/mean_length": 419.0625, "completions/mean_terminated_length": 419.0625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.5566768050193787, "epoch": 1.4571078431372548, "frac_reward_zero_std": 0.75, "grad_norm": 0.6871222714663869, "kl": 0.026821233332157135, "learning_rate": 6.133429996541518e-07, "loss": 0.0372, "num_tokens": 51537572.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.3873220682144165, "sampling/importance_sampling_ratio/mean": 0.9996312856674194, "sampling/importance_sampling_ratio/min": 0.6666699647903442, "sampling/sampling_logp_difference/max": 0.4054602384567261, "sampling/sampling_logp_difference/mean": 0.01500630285590887, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 354.28125, "completions/mean_terminated_length": 354.28125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.5070624351501465, "epoch": 1.4583333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.02041732844445401, "kl": 0.024799419566988945, "learning_rate": 6.1264905029621e-07, "loss": 0.0002, "num_tokens": 51582214.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5744845867156982, "sampling/importance_sampling_ratio/mean": 0.9995591640472412, "sampling/importance_sampling_ratio/min": 0.5974079370498657, "sampling/sampling_logp_difference/max": 0.5151550769805908, "sampling/sampling_logp_difference/mean": 0.016240712255239487, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 956.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 398.8125, "completions/mean_terminated_length": 398.8125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.49642765522003174, "epoch": 1.4595588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 0.5960454607449738, "kl": 0.031967148184776306, "learning_rate": 6.119548722678327e-07, "loss": 0.0018, "num_tokens": 51629210.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.646484613418579, "sampling/importance_sampling_ratio/mean": 1.0000829696655273, "sampling/importance_sampling_ratio/min": 0.6407893300056458, "sampling/sampling_logp_difference/max": 0.4986424446105957, "sampling/sampling_logp_difference/mean": 0.0150093799456954, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 354.71875, "completions/mean_terminated_length": 354.71875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.38593152165412903, "epoch": 1.4607843137254901, "frac_reward_zero_std": 1.0, "grad_norm": 0.014426405993638, "kl": 0.019831988960504532, "learning_rate": 6.112604669781572e-07, "loss": 0.0002, "num_tokens": 51669352.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4058058261871338, "sampling/importance_sampling_ratio/mean": 1.0001893043518066, "sampling/importance_sampling_ratio/min": 0.7300211191177368, "sampling/sampling_logp_difference/max": 0.3406106233596802, "sampling/sampling_logp_difference/mean": 0.011708008125424385, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1276.0, "completions/max_terminated_length": 1276.0, "completions/mean_length": 455.0, "completions/mean_terminated_length": 455.0, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.600143551826477, "epoch": 1.4620098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.02131378378620538, "kl": 0.02796955779194832, "learning_rate": 6.105658358367822e-07, "loss": 0.0003, "num_tokens": 51717560.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.528290033340454, "sampling/importance_sampling_ratio/mean": 0.9996665716171265, "sampling/importance_sampling_ratio/min": 0.6633703112602234, "sampling/sampling_logp_difference/max": 0.4241495132446289, "sampling/sampling_logp_difference/mean": 0.015535151585936546, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 343.734375, "completions/mean_terminated_length": 343.734375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.37109270691871643, "epoch": 1.4632352941176472, "frac_reward_zero_std": 1.0, "grad_norm": 0.015653711440314338, "kl": 0.021914657205343246, "learning_rate": 6.098709802537653e-07, "loss": 0.0002, "num_tokens": 51752871.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5990926027297974, "sampling/importance_sampling_ratio/mean": 1.00034761428833, "sampling/importance_sampling_ratio/min": 0.6771636605262756, "sampling/sampling_logp_difference/max": 0.4694364070892334, "sampling/sampling_logp_difference/mean": 0.012475812807679176, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1033.0, "completions/max_terminated_length": 1033.0, "completions/mean_length": 373.40625, "completions/mean_terminated_length": 373.40625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.4696049392223358, "epoch": 1.4644607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.017635781383970327, "kl": 0.024216197431087494, "learning_rate": 6.091759016396188e-07, "loss": 0.0002, "num_tokens": 51794129.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5071556568145752, "sampling/importance_sampling_ratio/mean": 1.0009249448776245, "sampling/importance_sampling_ratio/min": 0.6119223833084106, "sampling/sampling_logp_difference/max": 0.49114990234375, "sampling/sampling_logp_difference/mean": 0.014795937575399876, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1163.0, "completions/max_terminated_length": 1163.0, "completions/mean_length": 393.171875, "completions/mean_terminated_length": 393.171875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.494595468044281, "epoch": 1.465686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.5934869435246309, "kl": 0.029739778488874435, "learning_rate": 6.084806014053086e-07, "loss": -0.0213, "num_tokens": 51835356.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5421230792999268, "sampling/importance_sampling_ratio/mean": 0.9999922513961792, "sampling/importance_sampling_ratio/min": 0.6719419956207275, "sampling/sampling_logp_difference/max": 0.43316006660461426, "sampling/sampling_logp_difference/mean": 0.015194709412753582, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 390.09375, "completions/mean_terminated_length": 390.09375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.2778139114379883, "epoch": 1.4669117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.012568813020825544, "kl": 0.019348476082086563, "learning_rate": 6.077850809622498e-07, "loss": 0.0002, "num_tokens": 51877858.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999648332595825, "sampling/importance_sampling_ratio/min": 0.5910114645957947, "sampling/sampling_logp_difference/max": 0.7514228820800781, "sampling/sampling_logp_difference/mean": 0.009245004504919052, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 345.828125, "completions/mean_terminated_length": 345.828125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.6207694411277771, "epoch": 1.468137254901961, "frac_reward_zero_std": 0.5, "grad_norm": 0.7546150961917895, "kl": 0.059916622936725616, "learning_rate": 6.070893417223052e-07, "loss": 0.0104, "num_tokens": 51913863.0, "reward": 0.4375, "reward_std": 0.5081988573074341, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.381687879562378, "sampling/importance_sampling_ratio/mean": 1.000241994857788, "sampling/importance_sampling_ratio/min": 0.6745367050170898, "sampling/sampling_logp_difference/max": 0.39372920989990234, "sampling/sampling_logp_difference/mean": 0.017135154455900192, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1490.0, "completions/max_terminated_length": 1490.0, "completions/mean_length": 535.125, "completions/mean_terminated_length": 535.125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.4233894348144531, "epoch": 1.469362745098039, "frac_reward_zero_std": 0.5, "grad_norm": 0.6952937389490423, "kl": 0.025438860058784485, "learning_rate": 6.06393385097781e-07, "loss": 0.0891, "num_tokens": 51968271.0, "reward": 0.53125, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4097667932510376, "sampling/importance_sampling_ratio/mean": 0.999982476234436, "sampling/importance_sampling_ratio/min": 0.6172911524772644, "sampling/sampling_logp_difference/max": 0.48241448402404785, "sampling/sampling_logp_difference/mean": 0.01219320297241211, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 409.1875, "completions/mean_terminated_length": 409.1875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.4163162112236023, "epoch": 1.4705882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 0.48334538892280104, "kl": 0.0272955521941185, "learning_rate": 6.056972125014254e-07, "loss": -0.0038, "num_tokens": 52011819.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5281147956848145, "sampling/importance_sampling_ratio/mean": 0.9997579455375671, "sampling/importance_sampling_ratio/min": 0.4982426166534424, "sampling/sampling_logp_difference/max": 0.6966681480407715, "sampling/sampling_logp_difference/mean": 0.012651880271732807, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 399.828125, "completions/mean_terminated_length": 399.828125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.5757938027381897, "epoch": 1.471813725490196, "frac_reward_zero_std": 0.5, "grad_norm": 0.8338269414129044, "kl": 0.046009600162506104, "learning_rate": 6.050008253464246e-07, "loss": 0.0064, "num_tokens": 52056096.0, "reward": 0.4375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.4271239042282104, "sampling/importance_sampling_ratio/mean": 0.9998281598091125, "sampling/importance_sampling_ratio/min": 0.6171547174453735, "sampling/sampling_logp_difference/max": 0.482635498046875, "sampling/sampling_logp_difference/mean": 0.01721961237490177, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 430.46875, "completions/mean_terminated_length": 430.46875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.5403873920440674, "epoch": 1.4730392156862746, "frac_reward_zero_std": 0.5, "grad_norm": 0.6665013882626817, "kl": 0.028006453067064285, "learning_rate": 6.043042250464004e-07, "loss": -0.0073, "num_tokens": 52106158.0, "reward": 0.3125, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.603816270828247, "sampling/importance_sampling_ratio/mean": 0.9999220371246338, "sampling/importance_sampling_ratio/min": 0.6005942225456238, "sampling/sampling_logp_difference/max": 0.5098357200622559, "sampling/sampling_logp_difference/mean": 0.015560070052742958, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 394.65625, "completions/mean_terminated_length": 394.65625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.5158013701438904, "epoch": 1.4742647058823528, "frac_reward_zero_std": 1.0, "grad_norm": 0.02724495370190254, "kl": 0.04686982184648514, "learning_rate": 6.036074130154071e-07, "loss": 0.0005, "num_tokens": 52148168.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.561400055885315, "sampling/importance_sampling_ratio/mean": 1.0004632472991943, "sampling/importance_sampling_ratio/min": 0.5495344996452332, "sampling/sampling_logp_difference/max": 0.5986837148666382, "sampling/sampling_logp_difference/mean": 0.014698922634124756, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1067.0, "completions/max_terminated_length": 1067.0, "completions/mean_length": 427.484375, "completions/mean_terminated_length": 427.484375, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.38033032417297363, "epoch": 1.4754901960784315, "frac_reward_zero_std": 1.0, "grad_norm": 0.014563494456149536, "kl": 0.022128473967313766, "learning_rate": 6.029103906679293e-07, "loss": 0.0002, "num_tokens": 52192167.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4563186168670654, "sampling/importance_sampling_ratio/mean": 0.9997975826263428, "sampling/importance_sampling_ratio/min": 0.6620621085166931, "sampling/sampling_logp_difference/max": 0.4123959541320801, "sampling/sampling_logp_difference/mean": 0.011722361668944359, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1810.0, "completions/max_terminated_length": 1810.0, "completions/mean_length": 565.984375, "completions/mean_terminated_length": 565.984375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.47316232323646545, "epoch": 1.4767156862745099, "frac_reward_zero_std": 1.0, "grad_norm": 0.013592924726218905, "kl": 0.021838298067450523, "learning_rate": 6.022131594188777e-07, "loss": 0.0002, "num_tokens": 52252822.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5765693187713623, "sampling/importance_sampling_ratio/mean": 1.0003819465637207, "sampling/importance_sampling_ratio/min": 0.22631359100341797, "sampling/sampling_logp_difference/max": 1.4858336448669434, "sampling/sampling_logp_difference/mean": 0.013530624099075794, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 845.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 434.0, "completions/mean_terminated_length": 434.0, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.5944957733154297, "epoch": 1.4779411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.020249944416700714, "kl": 0.028778500854969025, "learning_rate": 6.01515720683588e-07, "loss": 0.0003, "num_tokens": 52294214.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4531595706939697, "sampling/importance_sampling_ratio/mean": 0.999995768070221, "sampling/importance_sampling_ratio/min": 0.6598353981971741, "sampling/sampling_logp_difference/max": 0.4157649278640747, "sampling/sampling_logp_difference/mean": 0.016931481659412384, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1666.0, "completions/max_terminated_length": 1666.0, "completions/mean_length": 475.890625, "completions/mean_terminated_length": 475.890625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.5711390972137451, "epoch": 1.4791666666666667, "frac_reward_zero_std": 0.5, "grad_norm": 0.6658647087305107, "kl": 0.028563648462295532, "learning_rate": 6.008180758778166e-07, "loss": 0.0107, "num_tokens": 52342303.0, "reward": -0.125, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.4769821166992188, "sampling/importance_sampling_ratio/mean": 0.9998538494110107, "sampling/importance_sampling_ratio/min": 0.1384407877922058, "sampling/sampling_logp_difference/max": 1.9773125648498535, "sampling/sampling_logp_difference/mean": 0.01639428548514843, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1771.0, "completions/max_terminated_length": 1771.0, "completions/mean_length": 535.453125, "completions/mean_terminated_length": 535.453125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.604948878288269, "epoch": 1.4803921568627452, "frac_reward_zero_std": 0.5, "grad_norm": 0.6584062423754772, "kl": 0.030792122706770897, "learning_rate": 6.001202264177382e-07, "loss": -0.059, "num_tokens": 52395740.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999986469745636, "sampling/importance_sampling_ratio/min": 0.5521731972694397, "sampling/sampling_logp_difference/max": 0.9884486198425293, "sampling/sampling_logp_difference/mean": 0.015697356313467026, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 376.5, "completions/mean_terminated_length": 376.5, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.44362109899520874, "epoch": 1.4816176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.014237786436668384, "kl": 0.02218572050333023, "learning_rate": 5.99422173719943e-07, "loss": 0.0002, "num_tokens": 52436572.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6549978256225586, "sampling/importance_sampling_ratio/mean": 1.0000085830688477, "sampling/importance_sampling_ratio/min": 0.3292819857597351, "sampling/sampling_logp_difference/max": 1.1108407974243164, "sampling/sampling_logp_difference/mean": 0.012789356522262096, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 902.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 366.609375, "completions/mean_terminated_length": 366.609375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.6291374564170837, "epoch": 1.482843137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.03085771309604077, "kl": 0.0374436154961586, "learning_rate": 5.987239192014335e-07, "loss": 0.0003, "num_tokens": 52482019.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999542236328125, "sampling/importance_sampling_ratio/min": 0.6107114553451538, "sampling/sampling_logp_difference/max": 1.0031967163085938, "sampling/sampling_logp_difference/mean": 0.017536725848913193, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1045.0, "completions/max_terminated_length": 1045.0, "completions/mean_length": 330.546875, "completions/mean_terminated_length": 330.546875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.5885103940963745, "epoch": 1.4840686274509804, "frac_reward_zero_std": 0.5, "grad_norm": 1.0017024785756747, "kl": 0.044531576335430145, "learning_rate": 5.980254642796226e-07, "loss": 0.1278, "num_tokens": 52520134.0, "reward": 0.8125, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.6364585161209106, "sampling/importance_sampling_ratio/mean": 0.9999128580093384, "sampling/importance_sampling_ratio/min": 0.6090309023857117, "sampling/sampling_logp_difference/max": 0.49588632583618164, "sampling/sampling_logp_difference/mean": 0.015765581279993057, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 319.734375, "completions/mean_terminated_length": 319.734375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.5783239603042603, "epoch": 1.4852941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.958899253191887, "kl": 0.0346609391272068, "learning_rate": 5.973268103723293e-07, "loss": -0.0364, "num_tokens": 52555621.0, "reward": 0.71875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.6379132270812988, "sampling/importance_sampling_ratio/mean": 1.0004684925079346, "sampling/importance_sampling_ratio/min": 0.6622576713562012, "sampling/sampling_logp_difference/max": 0.4934229850769043, "sampling/sampling_logp_difference/mean": 0.016750652343034744, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1692.0, "completions/max_terminated_length": 1692.0, "completions/mean_length": 585.96875, "completions/mean_terminated_length": 585.96875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.5747024416923523, "epoch": 1.4865196078431373, "frac_reward_zero_std": 1.0, "grad_norm": 0.013965465850331954, "kl": 0.020558426156640053, "learning_rate": 5.966279588977766e-07, "loss": 0.0002, "num_tokens": 52613587.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4542256593704224, "sampling/importance_sampling_ratio/mean": 1.0000832080841064, "sampling/importance_sampling_ratio/min": 0.5362950563430786, "sampling/sampling_logp_difference/max": 0.6230708360671997, "sampling/sampling_logp_difference/mean": 0.01493415329605341, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 439.28125, "completions/mean_terminated_length": 439.28125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.5008476972579956, "epoch": 1.4877450980392157, "frac_reward_zero_std": 0.25, "grad_norm": 0.9055437006888781, "kl": 0.027895614504814148, "learning_rate": 5.959289112745891e-07, "loss": -0.0346, "num_tokens": 52659829.0, "reward": -0.09375, "reward_std": 0.519389271736145, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.8048404455184937, "sampling/importance_sampling_ratio/mean": 0.9998764991760254, "sampling/importance_sampling_ratio/min": 0.6841003894805908, "sampling/sampling_logp_difference/max": 0.5904722213745117, "sampling/sampling_logp_difference/mean": 0.013850938528776169, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 422.421875, "completions/mean_terminated_length": 422.421875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.5481004118919373, "epoch": 1.4889705882352942, "frac_reward_zero_std": 0.75, "grad_norm": 0.4447730324591705, "kl": 0.03023402951657772, "learning_rate": 5.952296689217889e-07, "loss": 0.0276, "num_tokens": 52702720.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002691745758057, "sampling/importance_sampling_ratio/min": 0.6735841631889343, "sampling/sampling_logp_difference/max": 1.1260194778442383, "sampling/sampling_logp_difference/mean": 0.01574394851922989, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1616.0, "completions/max_terminated_length": 1616.0, "completions/mean_length": 375.328125, "completions/mean_terminated_length": 375.328125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.41246601939201355, "epoch": 1.4901960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 2.661693370643514, "kl": 0.023715132847428322, "learning_rate": 5.945302332587938e-07, "loss": 0.1512, "num_tokens": 52743333.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5467777252197266, "sampling/importance_sampling_ratio/mean": 1.0001940727233887, "sampling/importance_sampling_ratio/min": 0.6829707622528076, "sampling/sampling_logp_difference/max": 0.4361739158630371, "sampling/sampling_logp_difference/mean": 0.013123776763677597, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 817.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 355.65625, "completions/mean_terminated_length": 355.65625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.4654247462749481, "epoch": 1.491421568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.01609559991906069, "kl": 0.028806384652853012, "learning_rate": 5.938306057054138e-07, "loss": 0.0003, "num_tokens": 52782447.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4136556386947632, "sampling/importance_sampling_ratio/mean": 0.9997189044952393, "sampling/importance_sampling_ratio/min": 0.7300189137458801, "sampling/sampling_logp_difference/max": 0.3461790084838867, "sampling/sampling_logp_difference/mean": 0.01323087140917778, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 384.28125, "completions/mean_terminated_length": 384.28125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.38054656982421875, "epoch": 1.4926470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.011840282960027248, "kl": 0.016814030706882477, "learning_rate": 5.931307876818487e-07, "loss": 0.0002, "num_tokens": 52825953.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5584927797317505, "sampling/importance_sampling_ratio/mean": 1.0002799034118652, "sampling/importance_sampling_ratio/min": 0.686096727848053, "sampling/sampling_logp_difference/max": 0.44371914863586426, "sampling/sampling_logp_difference/mean": 0.011886363849043846, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 882.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 350.390625, "completions/mean_terminated_length": 350.390625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.552463948726654, "epoch": 1.4938725490196079, "frac_reward_zero_std": 0.5, "grad_norm": 0.7932911072274587, "kl": 0.03970145061612129, "learning_rate": 5.924307806086843e-07, "loss": 0.0199, "num_tokens": 52864650.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5096708536148071, "sampling/importance_sampling_ratio/mean": 0.9994312524795532, "sampling/importance_sampling_ratio/min": 0.6082014441490173, "sampling/sampling_logp_difference/max": 0.49724912643432617, "sampling/sampling_logp_difference/mean": 0.016974708065390587, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 320.21875, "completions/mean_terminated_length": 320.21875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.44685542583465576, "epoch": 1.4950980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.7508588646282508, "kl": 0.022271428257226944, "learning_rate": 5.917305859068911e-07, "loss": -0.0069, "num_tokens": 52902280.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4885724782943726, "sampling/importance_sampling_ratio/mean": 0.9999949932098389, "sampling/importance_sampling_ratio/min": 0.6488879323005676, "sampling/sampling_logp_difference/max": 0.43249523639678955, "sampling/sampling_logp_difference/mean": 0.014417397789657116, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 384.71875, "completions/mean_terminated_length": 384.71875, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.36060237884521484, "epoch": 1.4963235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.5352750480685737, "kl": 0.01775163784623146, "learning_rate": 5.910302049978199e-07, "loss": 0.0367, "num_tokens": 52944198.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5537320375442505, "sampling/importance_sampling_ratio/mean": 0.9998181462287903, "sampling/importance_sampling_ratio/min": 0.7274041175842285, "sampling/sampling_logp_difference/max": 0.440659761428833, "sampling/sampling_logp_difference/mean": 0.01135160680860281, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 382.1875, "completions/mean_terminated_length": 382.1875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.5645700693130493, "epoch": 1.4975490196078431, "frac_reward_zero_std": 0.75, "grad_norm": 0.531896837467146, "kl": 0.02695106528699398, "learning_rate": 5.903296393031995e-07, "loss": 0.0047, "num_tokens": 52986834.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.275146484375, "sampling/importance_sampling_ratio/mean": 0.9994782209396362, "sampling/importance_sampling_ratio/min": 0.6535549163818359, "sampling/sampling_logp_difference/max": 0.42532873153686523, "sampling/sampling_logp_difference/mean": 0.015823733061552048, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 458.078125, "completions/mean_terminated_length": 458.078125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "entropy": 0.4545527398586273, "epoch": 1.4987745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 0.45275214859872975, "kl": 0.01809149608016014, "learning_rate": 5.896288902451338e-07, "loss": 0.0118, "num_tokens": 53038871.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.8841400146484375, "sampling/importance_sampling_ratio/mean": 1.0000050067901611, "sampling/importance_sampling_ratio/min": 0.5480577349662781, "sampling/sampling_logp_difference/max": 0.6334714889526367, "sampling/sampling_logp_difference/mean": 0.013124614022672176, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 260.453125, "completions/mean_terminated_length": 260.453125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.37737131118774414, "epoch": 1.5, "frac_reward_zero_std": 0.75, "grad_norm": 0.8785688309165153, "kl": 0.02622716873884201, "learning_rate": 5.88927959246099e-07, "loss": -0.0775, "num_tokens": 53071572.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5344412326812744, "sampling/importance_sampling_ratio/mean": 0.9998807907104492, "sampling/importance_sampling_ratio/min": 0.6768407225608826, "sampling/sampling_logp_difference/max": 0.4281662702560425, "sampling/sampling_logp_difference/mean": 0.012583919800817966, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 394.578125, "completions/mean_terminated_length": 394.578125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.5095852017402649, "epoch": 1.5012254901960784, "frac_reward_zero_std": 0.5, "grad_norm": 0.789671930883544, "kl": 0.024531448259949684, "learning_rate": 5.882268477289408e-07, "loss": -0.0172, "num_tokens": 53118105.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8032361268997192, "sampling/importance_sampling_ratio/mean": 0.9999364614486694, "sampling/importance_sampling_ratio/min": 0.6262656450271606, "sampling/sampling_logp_difference/max": 0.5895829200744629, "sampling/sampling_logp_difference/mean": 0.015175054781138897, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1324.0, "completions/max_terminated_length": 1324.0, "completions/mean_length": 404.875, "completions/mean_terminated_length": 404.875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.5304100513458252, "epoch": 1.5024509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 0.6996874432729274, "kl": 0.01914961263537407, "learning_rate": 5.875255571168709e-07, "loss": 0.0899, "num_tokens": 53159985.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5632814168930054, "sampling/importance_sampling_ratio/mean": 1.0001071691513062, "sampling/importance_sampling_ratio/min": 0.7726065516471863, "sampling/sampling_logp_difference/max": 0.44678711891174316, "sampling/sampling_logp_difference/mean": 0.015443658456206322, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 375.015625, "completions/mean_terminated_length": 375.015625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.39420026540756226, "epoch": 1.5036764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.012136236686997292, "kl": 0.01725343056023121, "learning_rate": 5.868240888334652e-07, "loss": 0.0002, "num_tokens": 53200258.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6444485187530518, "sampling/importance_sampling_ratio/mean": 0.9999780058860779, "sampling/importance_sampling_ratio/min": 0.5019737482070923, "sampling/sampling_logp_difference/max": 0.6892074346542358, "sampling/sampling_logp_difference/mean": 0.013112051412463188, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 385.765625, "completions/mean_terminated_length": 385.765625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.5383455157279968, "epoch": 1.5049019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.02322106139400914, "kl": 0.03493364527821541, "learning_rate": 5.861224443026595e-07, "loss": 0.0003, "num_tokens": 53245171.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4851622581481934, "sampling/importance_sampling_ratio/mean": 1.0003526210784912, "sampling/importance_sampling_ratio/min": 0.5566736459732056, "sampling/sampling_logp_difference/max": 0.585776150226593, "sampling/sampling_logp_difference/mean": 0.014934931881725788, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 363.953125, "completions/mean_terminated_length": 363.953125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.5384191274642944, "epoch": 1.5061274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 0.5132142310252358, "kl": 0.02944401279091835, "learning_rate": 5.854206249487478e-07, "loss": -0.0087, "num_tokens": 53283408.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.3022958040237427, "sampling/importance_sampling_ratio/mean": 0.9997113943099976, "sampling/importance_sampling_ratio/min": 0.4309251308441162, "sampling/sampling_logp_difference/max": 0.8418209552764893, "sampling/sampling_logp_difference/mean": 0.015233570709824562, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 349.171875, "completions/mean_terminated_length": 349.171875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.35951751470565796, "epoch": 1.5073529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.010835901994919872, "kl": 0.01974605768918991, "learning_rate": 5.847186321963792e-07, "loss": 0.0002, "num_tokens": 53324347.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4730799198150635, "sampling/importance_sampling_ratio/mean": 0.9995729327201843, "sampling/importance_sampling_ratio/min": 0.5757108330726624, "sampling/sampling_logp_difference/max": 0.552149772644043, "sampling/sampling_logp_difference/mean": 0.011699637398123741, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 883.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 389.03125, "completions/mean_terminated_length": 389.03125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.4742935299873352, "epoch": 1.508578431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.6367886017807545, "kl": 0.022771939635276794, "learning_rate": 5.840164674705542e-07, "loss": 0.0305, "num_tokens": 53367933.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.7187939882278442, "sampling/importance_sampling_ratio/mean": 0.9999854564666748, "sampling/importance_sampling_ratio/min": 0.7318440079689026, "sampling/sampling_logp_difference/max": 0.5416228771209717, "sampling/sampling_logp_difference/mean": 0.01460953801870346, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 466.03125, "completions/mean_terminated_length": 466.03125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.3894349932670593, "epoch": 1.5098039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.009250307562962435, "kl": 0.013244688510894775, "learning_rate": 5.833141321966228e-07, "loss": 0.0001, "num_tokens": 53418527.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5640572309494019, "sampling/importance_sampling_ratio/mean": 1.0002397298812866, "sampling/importance_sampling_ratio/min": 0.49609941244125366, "sampling/sampling_logp_difference/max": 0.7009789943695068, "sampling/sampling_logp_difference/mean": 0.012192562222480774, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 370.34375, "completions/mean_terminated_length": 370.34375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.33082956075668335, "epoch": 1.5110294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.010267647491270147, "kl": 0.015763510018587112, "learning_rate": 5.826116278002813e-07, "loss": 0.0002, "num_tokens": 53456773.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.540906548500061, "sampling/importance_sampling_ratio/mean": 1.0001130104064941, "sampling/importance_sampling_ratio/min": 0.5374171733856201, "sampling/sampling_logp_difference/max": 0.6209806203842163, "sampling/sampling_logp_difference/mean": 0.011225579306483269, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 402.96875, "completions/mean_terminated_length": 402.96875, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "entropy": 0.5502533912658691, "epoch": 1.5122549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.016010271360306666, "kl": 0.028002284467220306, "learning_rate": 5.819089557075688e-07, "loss": 0.0003, "num_tokens": 53500531.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5849493741989136, "sampling/importance_sampling_ratio/mean": 1.0000953674316406, "sampling/importance_sampling_ratio/min": 0.5693737864494324, "sampling/sampling_logp_difference/max": 0.5632181167602539, "sampling/sampling_logp_difference/mean": 0.01531283650547266, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 371.71875, "completions/mean_terminated_length": 371.71875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.4450437128543854, "epoch": 1.5134803921568627, "frac_reward_zero_std": 0.5, "grad_norm": 0.7548796266324405, "kl": 0.028558284044265747, "learning_rate": 5.812061173448654e-07, "loss": 0.0183, "num_tokens": 53544961.0, "reward": 0.46875, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.7520533800125122, "sampling/importance_sampling_ratio/mean": 1.0003105401992798, "sampling/importance_sampling_ratio/min": 0.6283126473426819, "sampling/sampling_logp_difference/max": 0.5607883930206299, "sampling/sampling_logp_difference/mean": 0.014127160422503948, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1094.0, "completions/max_terminated_length": 1094.0, "completions/mean_length": 456.46875, "completions/mean_terminated_length": 456.46875, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.5178282260894775, "epoch": 1.5147058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.7297667838095948, "kl": 0.02084452286362648, "learning_rate": 5.805031141388883e-07, "loss": 0.0615, "num_tokens": 53595343.0, "reward": 0.5, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5417104959487915, "sampling/importance_sampling_ratio/mean": 1.0000627040863037, "sampling/importance_sampling_ratio/min": 0.616231381893158, "sampling/sampling_logp_difference/max": 0.4841327667236328, "sampling/sampling_logp_difference/mean": 0.013662491925060749, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 461.859375, "completions/mean_terminated_length": 461.859375, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "entropy": 0.4259073734283447, "epoch": 1.5159313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 0.43628947768930926, "kl": 0.02235550433397293, "learning_rate": 5.797999475166896e-07, "loss": 0.0124, "num_tokens": 53656566.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.4478929042816162, "sampling/importance_sampling_ratio/mean": 1.0002455711364746, "sampling/importance_sampling_ratio/min": 0.7694786787033081, "sampling/sampling_logp_difference/max": 0.37010931968688965, "sampling/sampling_logp_difference/mean": 0.012528982944786549, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 427.078125, "completions/mean_terminated_length": 427.078125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.5931280255317688, "epoch": 1.517156862745098, "frac_reward_zero_std": 0.25, "grad_norm": 0.8011467594779683, "kl": 0.03711611032485962, "learning_rate": 5.790966189056529e-07, "loss": -0.0652, "num_tokens": 53703675.0, "reward": -0.21875, "reward_std": 0.7297805547714233, "rewards/decision_reward_func/mean": -0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.5339051485061646, "sampling/importance_sampling_ratio/mean": 0.9994949698448181, "sampling/importance_sampling_ratio/min": 0.6470662951469421, "sampling/sampling_logp_difference/max": 0.4353065490722656, "sampling/sampling_logp_difference/mean": 0.016457125544548035, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 943.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 442.03125, "completions/mean_terminated_length": 442.03125, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "entropy": 0.5814867615699768, "epoch": 1.5183823529411766, "frac_reward_zero_std": 0.25, "grad_norm": 0.8394738878569395, "kl": 0.03457724675536156, "learning_rate": 5.783931297334907e-07, "loss": -0.0334, "num_tokens": 53754397.0, "reward": -0.34375, "reward_std": 0.6223389506340027, "rewards/decision_reward_func/mean": -0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.3693273067474365, "sampling/importance_sampling_ratio/mean": 0.9999021291732788, "sampling/importance_sampling_ratio/min": 0.629837691783905, "sampling/sampling_logp_difference/max": 0.4622931480407715, "sampling/sampling_logp_difference/mean": 0.01567952334880829, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1081.0, "completions/max_terminated_length": 1081.0, "completions/mean_length": 431.21875, "completions/mean_terminated_length": 431.21875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.5343937873840332, "epoch": 1.5196078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.012921474371393054, "kl": 0.037964750081300735, "learning_rate": 5.776894814282415e-07, "loss": 0.0002, "num_tokens": 53799259.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6553616523742676, "sampling/importance_sampling_ratio/mean": 1.000335693359375, "sampling/importance_sampling_ratio/min": 0.5483723282814026, "sampling/sampling_logp_difference/max": 0.6008007526397705, "sampling/sampling_logp_difference/mean": 0.01605311781167984, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 376.203125, "completions/mean_terminated_length": 376.203125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.547390341758728, "epoch": 1.5208333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 0.5036048504825699, "kl": 0.02275344543159008, "learning_rate": 5.769856754182667e-07, "loss": 0.0172, "num_tokens": 53842152.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.40597403049469, "sampling/importance_sampling_ratio/mean": 0.9996819496154785, "sampling/importance_sampling_ratio/min": 0.6407023668289185, "sampling/sampling_logp_difference/max": 0.44519031047821045, "sampling/sampling_logp_difference/mean": 0.016155727207660675, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 748.0, "completions/max_terminated_length": 748.0, "completions/mean_length": 411.546875, "completions/mean_terminated_length": 411.546875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.529614269733429, "epoch": 1.5220588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 0.5244102912257762, "kl": 0.021531354635953903, "learning_rate": 5.762817131322481e-07, "loss": -0.0147, "num_tokens": 53885627.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.4969340562820435, "sampling/importance_sampling_ratio/mean": 0.9999495148658752, "sampling/importance_sampling_ratio/min": 0.6927025318145752, "sampling/sampling_logp_difference/max": 0.40341901779174805, "sampling/sampling_logp_difference/mean": 0.015577960759401321, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 339.09375, "completions/mean_terminated_length": 339.09375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.5070111751556396, "epoch": 1.5232843137254903, "frac_reward_zero_std": 0.5, "grad_norm": 0.6307394548655779, "kl": 0.028307627886533737, "learning_rate": 5.755775959991844e-07, "loss": -0.0311, "num_tokens": 53925073.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.2975199222564697, "sampling/importance_sampling_ratio/mean": 1.0000696182250977, "sampling/importance_sampling_ratio/min": 0.3292369246482849, "sampling/sampling_logp_difference/max": 1.1109776496887207, "sampling/sampling_logp_difference/mean": 0.01489480584859848, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 783.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 312.609375, "completions/mean_terminated_length": 312.609375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.6272789239883423, "epoch": 1.5245098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 0.628385658638869, "kl": 0.02861296758055687, "learning_rate": 5.74873325448389e-07, "loss": 0.0172, "num_tokens": 53960120.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.3624465465545654, "sampling/importance_sampling_ratio/mean": 1.0000855922698975, "sampling/importance_sampling_ratio/min": 0.617550790309906, "sampling/sampling_logp_difference/max": 0.4819939136505127, "sampling/sampling_logp_difference/mean": 0.017871703952550888, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 709.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 346.03125, "completions/mean_terminated_length": 346.03125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.37722277641296387, "epoch": 1.5257352941176472, "frac_reward_zero_std": 0.75, "grad_norm": 0.4768617715848921, "kl": 0.01730375550687313, "learning_rate": 5.741689029094861e-07, "loss": 0.0075, "num_tokens": 53998698.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.299555778503418, "sampling/importance_sampling_ratio/mean": 0.9998328685760498, "sampling/importance_sampling_ratio/min": 0.6280912756919861, "sampling/sampling_logp_difference/max": 0.4650697708129883, "sampling/sampling_logp_difference/mean": 0.011571005918085575, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 874.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 327.546875, "completions/mean_terminated_length": 327.546875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.5031652450561523, "epoch": 1.5269607843137254, "frac_reward_zero_std": 0.5, "grad_norm": 0.8260926982484541, "kl": 0.034432388842105865, "learning_rate": 5.73464329812409e-07, "loss": -0.0178, "num_tokens": 54034381.0, "reward": 0.6875, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.5467315912246704, "sampling/importance_sampling_ratio/mean": 1.000455379486084, "sampling/importance_sampling_ratio/min": 0.639574408531189, "sampling/sampling_logp_difference/max": 0.44695234298706055, "sampling/sampling_logp_difference/mean": 0.014496665447950363, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 425.859375, "completions/mean_terminated_length": 425.859375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.41088563203811646, "epoch": 1.528186274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.009881934138493805, "kl": 0.019506972283124924, "learning_rate": 5.727596075873965e-07, "loss": 0.0002, "num_tokens": 54076484.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5075654983520508, "sampling/importance_sampling_ratio/mean": 1.0004677772521973, "sampling/importance_sampling_ratio/min": 0.7220172882080078, "sampling/sampling_logp_difference/max": 0.4104961156845093, "sampling/sampling_logp_difference/mean": 0.012386942282319069, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 342.328125, "completions/mean_terminated_length": 342.328125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.4479956030845642, "epoch": 1.5294117647058822, "frac_reward_zero_std": 0.75, "grad_norm": 0.5749072223900837, "kl": 0.025183815509080887, "learning_rate": 5.7205473766499e-07, "loss": 0.011, "num_tokens": 54117993.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.4754114151000977, "sampling/importance_sampling_ratio/mean": 0.9996820688247681, "sampling/importance_sampling_ratio/min": 0.5382962822914124, "sampling/sampling_logp_difference/max": 0.6193461418151855, "sampling/sampling_logp_difference/mean": 0.014818201772868633, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1078.0, "completions/max_terminated_length": 1078.0, "completions/mean_length": 309.40625, "completions/mean_terminated_length": 309.40625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.43672311305999756, "epoch": 1.530637254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.5246818928895554, "kl": 0.03689942881464958, "learning_rate": 5.71349721476031e-07, "loss": -0.0026, "num_tokens": 54157891.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6494462490081787, "sampling/importance_sampling_ratio/mean": 1.0001641511917114, "sampling/importance_sampling_ratio/min": 0.6536652445793152, "sampling/sampling_logp_difference/max": 0.5004396438598633, "sampling/sampling_logp_difference/mean": 0.014175605028867722, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1159.0, "completions/max_terminated_length": 1159.0, "completions/mean_length": 429.9375, "completions/mean_terminated_length": 429.9375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.5640647411346436, "epoch": 1.531862745098039, "frac_reward_zero_std": 0.5, "grad_norm": 0.989146215455603, "kl": 0.020887240767478943, "learning_rate": 5.706445604516574e-07, "loss": -0.0481, "num_tokens": 54213887.0, "reward": 0.625, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998482465744019, "sampling/importance_sampling_ratio/min": 0.5748000144958496, "sampling/sampling_logp_difference/max": 1.5259888172149658, "sampling/sampling_logp_difference/mean": 0.016372408717870712, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 783.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 358.828125, "completions/mean_terminated_length": 358.828125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.4974798560142517, "epoch": 1.5330882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 1.5336212110058998, "kl": 0.02554183080792427, "learning_rate": 5.699392560233017e-07, "loss": -0.0162, "num_tokens": 54254164.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.624358057975769, "sampling/importance_sampling_ratio/mean": 0.9996934533119202, "sampling/importance_sampling_ratio/min": 0.6553473472595215, "sampling/sampling_logp_difference/max": 0.48511266708374023, "sampling/sampling_logp_difference/mean": 0.015980985015630722, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 853.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 318.359375, "completions/mean_terminated_length": 318.359375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.3860938251018524, "epoch": 1.534313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.0115327035344808, "kl": 0.019919615238904953, "learning_rate": 5.69233809622687e-07, "loss": 0.0002, "num_tokens": 54291899.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4245604276657104, "sampling/importance_sampling_ratio/mean": 0.999795138835907, "sampling/importance_sampling_ratio/min": 0.6779323220252991, "sampling/sampling_logp_difference/max": 0.3887077569961548, "sampling/sampling_logp_difference/mean": 0.012770789675414562, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1035.0, "completions/max_terminated_length": 1035.0, "completions/mean_length": 377.296875, "completions/mean_terminated_length": 377.296875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.35473471879959106, "epoch": 1.5355392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.010938130614275461, "kl": 0.016565797850489616, "learning_rate": 5.685282226818249e-07, "loss": 0.0002, "num_tokens": 54337038.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.561396837234497, "sampling/importance_sampling_ratio/mean": 1.0004351139068604, "sampling/importance_sampling_ratio/min": 0.6549174189567566, "sampling/sampling_logp_difference/max": 0.4455808401107788, "sampling/sampling_logp_difference/mean": 0.01247341651469469, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 330.921875, "completions/mean_terminated_length": 330.921875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.5934406518936157, "epoch": 1.5367647058823528, "frac_reward_zero_std": 0.75, "grad_norm": 0.6080254547633512, "kl": 0.030591657385230064, "learning_rate": 5.678224966330119e-07, "loss": -0.0198, "num_tokens": 54379289.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.4418613910675049, "sampling/importance_sampling_ratio/mean": 1.0001007318496704, "sampling/importance_sampling_ratio/min": 0.6824089884757996, "sampling/sampling_logp_difference/max": 0.3821260929107666, "sampling/sampling_logp_difference/mean": 0.016299622133374214, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 356.765625, "completions/mean_terminated_length": 356.765625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.4855304956436157, "epoch": 1.5379901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.7700403258427826, "kl": 0.022024529054760933, "learning_rate": 5.671166329088277e-07, "loss": 0.0144, "num_tokens": 54417962.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.8628408908843994, "sampling/importance_sampling_ratio/mean": 0.9998144507408142, "sampling/importance_sampling_ratio/min": 0.7622833847999573, "sampling/sampling_logp_difference/max": 0.6221027374267578, "sampling/sampling_logp_difference/mean": 0.015431909821927547, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 918.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 367.625, "completions/mean_terminated_length": 367.625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.39985209703445435, "epoch": 1.5392156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.010374400415998344, "kl": 0.01721111685037613, "learning_rate": 5.664106329421305e-07, "loss": 0.0002, "num_tokens": 54459746.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4918559789657593, "sampling/importance_sampling_ratio/mean": 0.9996533989906311, "sampling/importance_sampling_ratio/min": 0.5882353186607361, "sampling/sampling_logp_difference/max": 0.5306282043457031, "sampling/sampling_logp_difference/mean": 0.012200879864394665, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1051.0, "completions/max_terminated_length": 1051.0, "completions/mean_length": 431.5625, "completions/mean_terminated_length": 431.5625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.4754295349121094, "epoch": 1.5404411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.013949414787973171, "kl": 0.02088199183344841, "learning_rate": 5.657044981660559e-07, "loss": 0.0002, "num_tokens": 54507494.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3744484186172485, "sampling/importance_sampling_ratio/mean": 0.9994297027587891, "sampling/importance_sampling_ratio/min": 0.6221892833709717, "sampling/sampling_logp_difference/max": 0.47451090812683105, "sampling/sampling_logp_difference/mean": 0.014232205227017403, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 310.125, "completions/mean_terminated_length": 310.125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.5082278251647949, "epoch": 1.5416666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 0.6516631592440605, "kl": 0.029323454946279526, "learning_rate": 5.649982300140123e-07, "loss": -0.0039, "num_tokens": 54544238.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.4645951986312866, "sampling/importance_sampling_ratio/mean": 0.9999390244483948, "sampling/importance_sampling_ratio/min": 0.6056239008903503, "sampling/sampling_logp_difference/max": 0.5014960765838623, "sampling/sampling_logp_difference/mean": 0.016438694670796394, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 346.765625, "completions/mean_terminated_length": 346.765625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.3687049150466919, "epoch": 1.5428921568627452, "frac_reward_zero_std": 0.75, "grad_norm": 0.534910734779343, "kl": 0.018950633704662323, "learning_rate": 5.642918299196796e-07, "loss": 0.0286, "num_tokens": 54581295.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5816667079925537, "sampling/importance_sampling_ratio/mean": 1.0000579357147217, "sampling/importance_sampling_ratio/min": 0.606658399105072, "sampling/sampling_logp_difference/max": 0.49978941679000854, "sampling/sampling_logp_difference/mean": 0.011555110104382038, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1132.0, "completions/max_terminated_length": 1132.0, "completions/mean_length": 374.828125, "completions/mean_terminated_length": 374.828125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.3943699896335602, "epoch": 1.5441176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.015606090455787695, "kl": 0.021698858588933945, "learning_rate": 5.635852993170052e-07, "loss": 0.0002, "num_tokens": 54619012.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.913936972618103, "sampling/importance_sampling_ratio/mean": 1.000141978263855, "sampling/importance_sampling_ratio/min": 0.6750997304916382, "sampling/sampling_logp_difference/max": 0.6491622924804688, "sampling/sampling_logp_difference/mean": 0.013291261158883572, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1208.0, "completions/max_terminated_length": 1208.0, "completions/mean_length": 454.109375, "completions/mean_terminated_length": 454.109375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "entropy": 0.440851628780365, "epoch": 1.545343137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.45483343628917705, "kl": 0.01951705664396286, "learning_rate": 5.628786396402013e-07, "loss": 0.006, "num_tokens": 54666715.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.4189964532852173, "sampling/importance_sampling_ratio/mean": 1.000301480293274, "sampling/importance_sampling_ratio/min": 0.674616813659668, "sampling/sampling_logp_difference/max": 0.39361047744750977, "sampling/sampling_logp_difference/mean": 0.013009500689804554, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 288.875, "completions/mean_terminated_length": 288.875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.31896597146987915, "epoch": 1.5465686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.01155417918659599, "kl": 0.0154807697981596, "learning_rate": 5.621718523237426e-07, "loss": 0.0002, "num_tokens": 54701251.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6354477405548096, "sampling/importance_sampling_ratio/mean": 1.0000834465026855, "sampling/importance_sampling_ratio/min": 0.6390791535377502, "sampling/sampling_logp_difference/max": 0.4919166564941406, "sampling/sampling_logp_difference/mean": 0.011269871145486832, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1684.0, "completions/max_terminated_length": 1684.0, "completions/mean_length": 456.765625, "completions/mean_terminated_length": 456.765625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.5132890939712524, "epoch": 1.5477941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 0.6188029576625331, "kl": 0.0242614708840847, "learning_rate": 5.614649388023622e-07, "loss": 0.0248, "num_tokens": 54747428.0, "reward": 0.65625, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.5482476949691772, "sampling/importance_sampling_ratio/mean": 1.0001018047332764, "sampling/importance_sampling_ratio/min": 0.6056374907493591, "sampling/sampling_logp_difference/max": 0.5014736652374268, "sampling/sampling_logp_difference/mean": 0.014607463032007217, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 885.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 353.71875, "completions/mean_terminated_length": 353.71875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.3678520917892456, "epoch": 1.5490196078431373, "frac_reward_zero_std": 1.0, "grad_norm": 0.009874416857728049, "kl": 0.015252862125635147, "learning_rate": 5.607579005110502e-07, "loss": 0.0001, "num_tokens": 54785394.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5870492458343506, "sampling/importance_sampling_ratio/mean": 0.9999110698699951, "sampling/importance_sampling_ratio/min": 0.6978976130485535, "sampling/sampling_logp_difference/max": 0.46187639236450195, "sampling/sampling_logp_difference/mean": 0.011973822489380836, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 342.3125, "completions/mean_terminated_length": 342.3125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.49694663286209106, "epoch": 1.5502450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.5955302844379037, "kl": 0.02632983587682247, "learning_rate": 5.60050738885049e-07, "loss": 0.0081, "num_tokens": 54824662.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.3975616693496704, "sampling/importance_sampling_ratio/mean": 0.9997671842575073, "sampling/importance_sampling_ratio/min": 0.355101615190506, "sampling/sampling_logp_difference/max": 1.035351276397705, "sampling/sampling_logp_difference/mean": 0.015573793090879917, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 385.4375, "completions/mean_terminated_length": 385.4375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.5691608190536499, "epoch": 1.5514705882352942, "frac_reward_zero_std": 0.75, "grad_norm": 0.660104925601583, "kl": 0.022786684334278107, "learning_rate": 5.593434553598525e-07, "loss": -0.0077, "num_tokens": 54868994.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.5829187631607056, "sampling/importance_sampling_ratio/mean": 0.9999390840530396, "sampling/importance_sampling_ratio/min": 0.6065770983695984, "sampling/sampling_logp_difference/max": 0.4999234676361084, "sampling/sampling_logp_difference/mean": 0.016600921750068665, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1141.0, "completions/max_terminated_length": 1141.0, "completions/mean_length": 452.984375, "completions/mean_terminated_length": 452.984375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.5256895422935486, "epoch": 1.5526960784313726, "frac_reward_zero_std": 0.5, "grad_norm": 0.7035240630157051, "kl": 0.01974230818450451, "learning_rate": 5.586360513712009e-07, "loss": -0.0166, "num_tokens": 54913313.0, "reward": 0.09375, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.721610426902771, "sampling/importance_sampling_ratio/mean": 1.0001120567321777, "sampling/importance_sampling_ratio/min": 0.5144166350364685, "sampling/sampling_logp_difference/max": 0.6647217273712158, "sampling/sampling_logp_difference/mean": 0.016143659129738808, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 484.765625, "completions/mean_terminated_length": 484.765625, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "entropy": 0.3827643394470215, "epoch": 1.553921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.6350663306392631, "kl": 0.01417109277099371, "learning_rate": 5.579285283550797e-07, "loss": -0.0269, "num_tokens": 54964146.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5291856527328491, "sampling/importance_sampling_ratio/mean": 0.9998300671577454, "sampling/importance_sampling_ratio/min": 0.5621426105499268, "sampling/sampling_logp_difference/max": 0.5759997367858887, "sampling/sampling_logp_difference/mean": 0.011216996237635612, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 959.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 350.921875, "completions/mean_terminated_length": 350.921875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.4045935571193695, "epoch": 1.5551470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.013240968123356567, "kl": 0.019431820139288902, "learning_rate": 5.572208877477159e-07, "loss": 0.0002, "num_tokens": 55006381.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5113599300384521, "sampling/importance_sampling_ratio/mean": 0.9998459815979004, "sampling/importance_sampling_ratio/min": 0.45550814270973206, "sampling/sampling_logp_difference/max": 0.786341667175293, "sampling/sampling_logp_difference/mean": 0.012766829691827297, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 278.40625, "completions/mean_terminated_length": 278.40625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.39910733699798584, "epoch": 1.5563725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.013073835299833515, "kl": 0.018521498888731003, "learning_rate": 5.565131309855752e-07, "loss": 0.0002, "num_tokens": 55042199.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5306742191314697, "sampling/importance_sampling_ratio/mean": 1.000312089920044, "sampling/importance_sampling_ratio/min": 0.6738145351409912, "sampling/sampling_logp_difference/max": 0.4257082939147949, "sampling/sampling_logp_difference/mean": 0.014526212587952614, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 445.28125, "completions/mean_terminated_length": 445.28125, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "entropy": 0.43154990673065186, "epoch": 1.5575980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.4701578816085793, "kl": 0.019541777670383453, "learning_rate": 5.558052595053586e-07, "loss": -0.0064, "num_tokens": 55094713.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.4237104654312134, "sampling/importance_sampling_ratio/mean": 1.0001124143600464, "sampling/importance_sampling_ratio/min": 0.6969876289367676, "sampling/sampling_logp_difference/max": 0.36098766326904297, "sampling/sampling_logp_difference/mean": 0.012226764112710953, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 366.25, "completions/mean_terminated_length": 366.25, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.4666220247745514, "epoch": 1.5588235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.011637590761087726, "kl": 0.01788802444934845, "learning_rate": 5.550972747440005e-07, "loss": 0.0002, "num_tokens": 55132393.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.528370976448059, "sampling/importance_sampling_ratio/mean": 1.0002332925796509, "sampling/importance_sampling_ratio/min": 0.6772529482841492, "sampling/sampling_logp_difference/max": 0.42420244216918945, "sampling/sampling_logp_difference/mean": 0.01456377375870943, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 474.0, "completions/mean_terminated_length": 474.0, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.5375174283981323, "epoch": 1.5600490196078431, "frac_reward_zero_std": 0.5, "grad_norm": 0.7530059807407995, "kl": 0.03122366964817047, "learning_rate": 5.543891781386655e-07, "loss": 0.0273, "num_tokens": 55188265.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3997794389724731, "sampling/importance_sampling_ratio/mean": 1.0001744031906128, "sampling/importance_sampling_ratio/min": 0.708476185798645, "sampling/sampling_logp_difference/max": 0.3446388244628906, "sampling/sampling_logp_difference/mean": 0.013696935027837753, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 368.546875, "completions/mean_terminated_length": 368.546875, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "entropy": 0.4151255488395691, "epoch": 1.5612745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 0.539377829217953, "kl": 0.016467614099383354, "learning_rate": 5.536809711267443e-07, "loss": 0.0149, "num_tokens": 55227692.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.622643232345581, "sampling/importance_sampling_ratio/mean": 0.9998278021812439, "sampling/importance_sampling_ratio/min": 0.6607893109321594, "sampling/sampling_logp_difference/max": 0.4840564727783203, "sampling/sampling_logp_difference/mean": 0.012999924831092358, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 326.6875, "completions/mean_terminated_length": 326.6875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.47287437319755554, "epoch": 1.5625, "frac_reward_zero_std": 1.0, "grad_norm": 0.017602289985702787, "kl": 0.021029632538557053, "learning_rate": 5.529726551458526e-07, "loss": 0.0002, "num_tokens": 55269192.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.50780189037323, "sampling/importance_sampling_ratio/mean": 1.000274419784546, "sampling/importance_sampling_ratio/min": 0.5797584056854248, "sampling/sampling_logp_difference/max": 0.5451438426971436, "sampling/sampling_logp_difference/mean": 0.01538415253162384, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 421.515625, "completions/mean_terminated_length": 421.515625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.4994185268878937, "epoch": 1.5637254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 0.45431725453505967, "kl": 0.024154948070645332, "learning_rate": 5.522642316338268e-07, "loss": 0.0226, "num_tokens": 55314105.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001702308654785, "sampling/importance_sampling_ratio/min": 0.1356269121170044, "sampling/sampling_logp_difference/max": 1.997847557067871, "sampling/sampling_logp_difference/mean": 0.0146049614995718, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 328.046875, "completions/mean_terminated_length": 328.046875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.4673018753528595, "epoch": 1.5649509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.026094732024043225, "kl": 0.037622567266225815, "learning_rate": 5.515557020287218e-07, "loss": 0.0003, "num_tokens": 55350652.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4625567197799683, "sampling/importance_sampling_ratio/mean": 1.0003015995025635, "sampling/importance_sampling_ratio/min": 0.6426177620887756, "sampling/sampling_logp_difference/max": 0.44220519065856934, "sampling/sampling_logp_difference/mean": 0.015523226000368595, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 408.40625, "completions/mean_terminated_length": 408.40625, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.4454231858253479, "epoch": 1.5661764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.6153375329436466, "kl": 0.026309551671147346, "learning_rate": 5.508470677688078e-07, "loss": 0.0418, "num_tokens": 55395382.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5278886556625366, "sampling/importance_sampling_ratio/mean": 0.9997844696044922, "sampling/importance_sampling_ratio/min": 0.5576876401901245, "sampling/sampling_logp_difference/max": 0.583956241607666, "sampling/sampling_logp_difference/mean": 0.013475478626787663, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1164.0, "completions/max_terminated_length": 1164.0, "completions/mean_length": 392.234375, "completions/mean_terminated_length": 392.234375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.5486420392990112, "epoch": 1.5674019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.5953188851477434, "kl": 0.02522438019514084, "learning_rate": 5.501383302925677e-07, "loss": -0.0406, "num_tokens": 55443189.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.7042748928070068, "sampling/importance_sampling_ratio/mean": 1.0000051259994507, "sampling/importance_sampling_ratio/min": 0.6857565641403198, "sampling/sampling_logp_difference/max": 0.533139705657959, "sampling/sampling_logp_difference/mean": 0.01687958464026451, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1172.0, "completions/max_terminated_length": 1172.0, "completions/mean_length": 423.109375, "completions/mean_terminated_length": 423.109375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.49776017665863037, "epoch": 1.5686274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 0.3808971061102051, "kl": 0.025515900924801826, "learning_rate": 5.494294910386933e-07, "loss": -0.0011, "num_tokens": 55493324.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.627318024635315, "sampling/importance_sampling_ratio/mean": 0.999750554561615, "sampling/importance_sampling_ratio/min": 0.620280385017395, "sampling/sampling_logp_difference/max": 0.48693323135375977, "sampling/sampling_logp_difference/mean": 0.015981296077370644, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 247.0, "completions/mean_terminated_length": 247.0, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.4774455726146698, "epoch": 1.5698529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.9881313792000355, "kl": 0.03172894939780235, "learning_rate": 5.487205514460835e-07, "loss": 0.01, "num_tokens": 55525596.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5048335790634155, "sampling/importance_sampling_ratio/mean": 0.9999541640281677, "sampling/importance_sampling_ratio/min": 0.7221868634223938, "sampling/sampling_logp_difference/max": 0.40868234634399414, "sampling/sampling_logp_difference/mean": 0.015762941911816597, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 320.3125, "completions/mean_terminated_length": 320.3125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.4024724066257477, "epoch": 1.571078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.013813713658280248, "kl": 0.020267993211746216, "learning_rate": 5.480115129538409e-07, "loss": 0.0002, "num_tokens": 55564992.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4371734857559204, "sampling/importance_sampling_ratio/mean": 0.9999697804450989, "sampling/importance_sampling_ratio/min": 0.6132841110229492, "sampling/sampling_logp_difference/max": 0.4889270067214966, "sampling/sampling_logp_difference/mean": 0.014587713405489922, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 332.515625, "completions/mean_terminated_length": 332.515625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.5171206593513489, "epoch": 1.5723039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 0.6808783337558836, "kl": 0.02792694792151451, "learning_rate": 5.473023770012686e-07, "loss": 0.0305, "num_tokens": 55602129.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4682782888412476, "sampling/importance_sampling_ratio/mean": 1.0005104541778564, "sampling/importance_sampling_ratio/min": 0.616150975227356, "sampling/sampling_logp_difference/max": 0.48426318168640137, "sampling/sampling_logp_difference/mean": 0.017134325578808784, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 323.9375, "completions/mean_terminated_length": 323.9375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.5269834399223328, "epoch": 1.5735294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 0.9003660103464647, "kl": 0.031370747834444046, "learning_rate": 5.465931450278676e-07, "loss": -0.0279, "num_tokens": 55642349.0, "reward": 0.65625, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.5674543380737305, "sampling/importance_sampling_ratio/mean": 1.0000296831130981, "sampling/importance_sampling_ratio/min": 0.6164764761924744, "sampling/sampling_logp_difference/max": 0.4837350845336914, "sampling/sampling_logp_difference/mean": 0.015482288785278797, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 427.0625, "completions/mean_terminated_length": 427.0625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.5146269798278809, "epoch": 1.5747549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 0.459607993235506, "kl": 0.02759518288075924, "learning_rate": 5.458838184733341e-07, "loss": -0.0247, "num_tokens": 55685857.0, "reward": -0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": -0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.3244428634643555, "sampling/importance_sampling_ratio/mean": 1.0000243186950684, "sampling/importance_sampling_ratio/min": 0.6132948398590088, "sampling/sampling_logp_difference/max": 0.4889094829559326, "sampling/sampling_logp_difference/mean": 0.015087972395122051, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1154.0, "completions/max_terminated_length": 1154.0, "completions/mean_length": 366.625, "completions/mean_terminated_length": 366.625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.4764830768108368, "epoch": 1.5759803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 0.5923301594833283, "kl": 0.018237467855215073, "learning_rate": 5.451743987775559e-07, "loss": 0.0113, "num_tokens": 55728617.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.6645445823669434, "sampling/importance_sampling_ratio/mean": 0.9999784231185913, "sampling/importance_sampling_ratio/min": 0.4187556505203247, "sampling/sampling_logp_difference/max": 0.8704676628112793, "sampling/sampling_logp_difference/mean": 0.016151685267686844, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 451.34375, "completions/mean_terminated_length": 451.34375, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.4622216820716858, "epoch": 1.5772058823529411, "frac_reward_zero_std": 0.75, "grad_norm": 0.4972541830812956, "kl": 0.021944884210824966, "learning_rate": 5.444648873806101e-07, "loss": 0.0217, "num_tokens": 55772543.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4383859634399414, "sampling/importance_sampling_ratio/mean": 1.0002816915512085, "sampling/importance_sampling_ratio/min": 0.38124769926071167, "sampling/sampling_logp_difference/max": 0.9643059968948364, "sampling/sampling_logp_difference/mean": 0.0146951237693429, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 360.234375, "completions/mean_terminated_length": 360.234375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.49088963866233826, "epoch": 1.5784313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 0.5493150291177846, "kl": 0.023581478744745255, "learning_rate": 5.437552857227597e-07, "loss": 0.0032, "num_tokens": 55812158.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.487601637840271, "sampling/importance_sampling_ratio/mean": 0.9995017647743225, "sampling/importance_sampling_ratio/min": 0.6230655908584595, "sampling/sampling_logp_difference/max": 0.47310352325439453, "sampling/sampling_logp_difference/mean": 0.014142753556370735, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 365.546875, "completions/mean_terminated_length": 365.546875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.5482465624809265, "epoch": 1.579656862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.6065268482838134, "kl": 0.029394783079624176, "learning_rate": 5.430455952444512e-07, "loss": 0.0095, "num_tokens": 55847649.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3079403638839722, "sampling/importance_sampling_ratio/mean": 1.000194787979126, "sampling/importance_sampling_ratio/min": 0.730168879032135, "sampling/sampling_logp_difference/max": 0.31447941064834595, "sampling/sampling_logp_difference/mean": 0.016144227236509323, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 963.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 411.8125, "completions/mean_terminated_length": 411.8125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.5281437635421753, "epoch": 1.5808823529411766, "frac_reward_zero_std": 0.75, "grad_norm": 0.5410055854090258, "kl": 0.020951658487319946, "learning_rate": 5.423358173863116e-07, "loss": 0.0054, "num_tokens": 55891781.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.315250039100647, "sampling/importance_sampling_ratio/mean": 0.9999505281448364, "sampling/importance_sampling_ratio/min": 0.6293045282363892, "sampling/sampling_logp_difference/max": 0.46314001083374023, "sampling/sampling_logp_difference/mean": 0.014089229516685009, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 821.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 298.875, "completions/mean_terminated_length": 298.875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.4672483205795288, "epoch": 1.5821078431372548, "frac_reward_zero_std": 0.75, "grad_norm": 0.7263830820862327, "kl": 0.021340522915124893, "learning_rate": 5.416259535891446e-07, "loss": 0.0141, "num_tokens": 55928029.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.4753994941711426, "sampling/importance_sampling_ratio/mean": 1.0000263452529907, "sampling/importance_sampling_ratio/min": 0.6961392760276794, "sampling/sampling_logp_difference/max": 0.38892877101898193, "sampling/sampling_logp_difference/mean": 0.013984077610075474, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 410.953125, "completions/mean_terminated_length": 410.953125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.41841572523117065, "epoch": 1.5833333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 0.4749368880963745, "kl": 0.02137928083539009, "learning_rate": 5.409160052939291e-07, "loss": -0.0379, "num_tokens": 55976234.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6225508451461792, "sampling/importance_sampling_ratio/mean": 1.000110149383545, "sampling/importance_sampling_ratio/min": 0.5593251585960388, "sampling/sampling_logp_difference/max": 0.5810242891311646, "sampling/sampling_logp_difference/mean": 0.014665389433503151, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1462.0, "completions/max_terminated_length": 1462.0, "completions/mean_length": 349.6875, "completions/mean_terminated_length": 349.6875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.4750930070877075, "epoch": 1.5845588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.013839359739895519, "kl": 0.02026500552892685, "learning_rate": 5.402059739418148e-07, "loss": 0.0002, "num_tokens": 56015334.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4664666652679443, "sampling/importance_sampling_ratio/mean": 0.9999005794525146, "sampling/importance_sampling_ratio/min": 0.5715345740318298, "sampling/sampling_logp_difference/max": 0.5594303607940674, "sampling/sampling_logp_difference/mean": 0.015614586882293224, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1427.0, "completions/max_terminated_length": 1427.0, "completions/mean_length": 457.171875, "completions/mean_terminated_length": 457.171875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.4155139923095703, "epoch": 1.5857843137254903, "frac_reward_zero_std": 0.5, "grad_norm": 0.6591841063454098, "kl": 0.017143631353974342, "learning_rate": 5.394958609741206e-07, "loss": -0.0901, "num_tokens": 56060401.0, "reward": -0.65625, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": -0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.605602502822876, "sampling/importance_sampling_ratio/mean": 0.9997199773788452, "sampling/importance_sampling_ratio/min": 0.6721695065498352, "sampling/sampling_logp_difference/max": 0.473499059677124, "sampling/sampling_logp_difference/mean": 0.011616621166467667, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1174.0, "completions/max_terminated_length": 1174.0, "completions/mean_length": 406.453125, "completions/mean_terminated_length": 406.453125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.6582151055335999, "epoch": 1.5870098039215685, "frac_reward_zero_std": 0.25, "grad_norm": 0.8934412535704405, "kl": 0.04738523066043854, "learning_rate": 5.387856678323307e-07, "loss": 0.0191, "num_tokens": 56104446.0, "reward": 0.0625, "reward_std": 0.7077301740646362, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4368641376495361, "sampling/importance_sampling_ratio/mean": 1.000166416168213, "sampling/importance_sampling_ratio/min": 0.7723841667175293, "sampling/sampling_logp_difference/max": 0.36246299743652344, "sampling/sampling_logp_difference/mean": 0.017398390918970108, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 302.546875, "completions/mean_terminated_length": 302.546875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.42760348320007324, "epoch": 1.5882352941176472, "frac_reward_zero_std": 1.0, "grad_norm": 0.012379030914501665, "kl": 0.021550066769123077, "learning_rate": 5.380753959580922e-07, "loss": 0.0002, "num_tokens": 56153425.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.532147765159607, "sampling/importance_sampling_ratio/mean": 0.9999161958694458, "sampling/importance_sampling_ratio/min": 0.5073919296264648, "sampling/sampling_logp_difference/max": 0.678471565246582, "sampling/sampling_logp_difference/mean": 0.014053035527467728, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 443.546875, "completions/mean_terminated_length": 443.546875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.6087660789489746, "epoch": 1.5894607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 0.4355575791344984, "kl": 0.034827474504709244, "learning_rate": 5.373650467932121e-07, "loss": -0.0101, "num_tokens": 56199700.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.4574787616729736, "sampling/importance_sampling_ratio/mean": 1.0000529289245605, "sampling/importance_sampling_ratio/min": 0.6563722491264343, "sampling/sampling_logp_difference/max": 0.42102718353271484, "sampling/sampling_logp_difference/mean": 0.01606791652739048, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 382.5, "completions/mean_terminated_length": 382.5, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.40245237946510315, "epoch": 1.590686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.010878093674787651, "kl": 0.014623953029513359, "learning_rate": 5.366546217796541e-07, "loss": 0.0001, "num_tokens": 56245092.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3702584505081177, "sampling/importance_sampling_ratio/mean": 0.9999938607215881, "sampling/importance_sampling_ratio/min": 0.6942373514175415, "sampling/sampling_logp_difference/max": 0.3649413585662842, "sampling/sampling_logp_difference/mean": 0.012307525612413883, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 378.0, "completions/mean_terminated_length": 378.0, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.4603008031845093, "epoch": 1.5919117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.01539013587336859, "kl": 0.025854622945189476, "learning_rate": 5.359441223595363e-07, "loss": 0.0003, "num_tokens": 56289460.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3739004135131836, "sampling/importance_sampling_ratio/mean": 1.0002024173736572, "sampling/importance_sampling_ratio/min": 0.6584315896034241, "sampling/sampling_logp_difference/max": 0.4178946018218994, "sampling/sampling_logp_difference/mean": 0.012922714464366436, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 360.71875, "completions/mean_terminated_length": 360.71875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.43712109327316284, "epoch": 1.593137254901961, "frac_reward_zero_std": 0.5, "grad_norm": 0.8823216902529211, "kl": 0.026895154267549515, "learning_rate": 5.352335499751269e-07, "loss": 0.0567, "num_tokens": 56328818.0, "reward": 0.625, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.4548677206039429, "sampling/importance_sampling_ratio/mean": 1.0004888772964478, "sampling/importance_sampling_ratio/min": 0.6656554341316223, "sampling/sampling_logp_difference/max": 0.4069831371307373, "sampling/sampling_logp_difference/mean": 0.012863085605204105, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 423.671875, "completions/mean_terminated_length": 423.671875, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "entropy": 0.41874366998672485, "epoch": 1.594362745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.011534859777973734, "kl": 0.015069096349179745, "learning_rate": 5.345229060688433e-07, "loss": 0.0001, "num_tokens": 56375469.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999421834945679, "sampling/importance_sampling_ratio/min": 0.6195028424263, "sampling/sampling_logp_difference/max": 0.813469409942627, "sampling/sampling_logp_difference/mean": 0.01251782476902008, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1263.0, "completions/max_terminated_length": 1263.0, "completions/mean_length": 419.59375, "completions/mean_terminated_length": 419.59375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.4864674210548401, "epoch": 1.5955882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 0.5698955094592916, "kl": 0.020600754767656326, "learning_rate": 5.338121920832475e-07, "loss": 0.0663, "num_tokens": 56419971.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.519298791885376, "sampling/importance_sampling_ratio/mean": 1.0003634691238403, "sampling/importance_sampling_ratio/min": 0.6997677683830261, "sampling/sampling_logp_difference/max": 0.41824889183044434, "sampling/sampling_logp_difference/mean": 0.015540880151093006, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 375.796875, "completions/mean_terminated_length": 375.796875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.5340962409973145, "epoch": 1.596813725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.5655344690415828, "kl": 0.033334799110889435, "learning_rate": 5.331014094610438e-07, "loss": 0.0153, "num_tokens": 56459270.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.6130930185317993, "sampling/importance_sampling_ratio/mean": 1.000133752822876, "sampling/importance_sampling_ratio/min": 0.5570548176765442, "sampling/sampling_logp_difference/max": 0.5850915908813477, "sampling/sampling_logp_difference/mean": 0.01598905771970749, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 907.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 385.515625, "completions/mean_terminated_length": 385.515625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.3967931568622589, "epoch": 1.5980392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 0.6200903384842308, "kl": 0.018682673573493958, "learning_rate": 5.323905596450759e-07, "loss": -0.003, "num_tokens": 56501735.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.7056113481521606, "sampling/importance_sampling_ratio/mean": 0.9999558329582214, "sampling/importance_sampling_ratio/min": 0.6288620233535767, "sampling/sampling_logp_difference/max": 0.5339236259460449, "sampling/sampling_logp_difference/mean": 0.013176694512367249, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 377.65625, "completions/mean_terminated_length": 377.65625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.4250771999359131, "epoch": 1.5992647058823528, "frac_reward_zero_std": 0.75, "grad_norm": 0.695561917590013, "kl": 0.02337605506181717, "learning_rate": 5.31679644078324e-07, "loss": -0.0359, "num_tokens": 56541553.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6634883880615234, "sampling/importance_sampling_ratio/mean": 0.9996857643127441, "sampling/importance_sampling_ratio/min": 0.48544517159461975, "sampling/sampling_logp_difference/max": 0.7226889133453369, "sampling/sampling_logp_difference/mean": 0.013553105294704437, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 300.625, "completions/mean_terminated_length": 300.625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.4562414884567261, "epoch": 1.6004901960784315, "frac_reward_zero_std": 0.5, "grad_norm": 0.9835137492478616, "kl": 0.03557881712913513, "learning_rate": 5.309686642039015e-07, "loss": 0.0042, "num_tokens": 56576361.0, "reward": 0.59375, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5064080953598022, "sampling/importance_sampling_ratio/mean": 1.0003340244293213, "sampling/importance_sampling_ratio/min": 0.5685411095619202, "sampling/sampling_logp_difference/max": 0.5646815896034241, "sampling/sampling_logp_difference/mean": 0.01501871831715107, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 345.65625, "completions/mean_terminated_length": 345.65625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.38053521513938904, "epoch": 1.6017156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.01537749025420589, "kl": 0.018637768924236298, "learning_rate": 5.302576214650527e-07, "loss": 0.0002, "num_tokens": 56617491.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5307300090789795, "sampling/importance_sampling_ratio/mean": 0.9998552799224854, "sampling/importance_sampling_ratio/min": 0.6657708287239075, "sampling/sampling_logp_difference/max": 0.42574477195739746, "sampling/sampling_logp_difference/mean": 0.012895086780190468, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 392.734375, "completions/mean_terminated_length": 392.734375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.36318787932395935, "epoch": 1.6029411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.01912357883198562, "kl": 0.01762184500694275, "learning_rate": 5.295465173051491e-07, "loss": 0.0002, "num_tokens": 56661026.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5175888538360596, "sampling/importance_sampling_ratio/mean": 1.0001590251922607, "sampling/importance_sampling_ratio/min": 0.6270712614059448, "sampling/sampling_logp_difference/max": 0.46669507026672363, "sampling/sampling_logp_difference/mean": 0.0119274677708745, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 377.6875, "completions/mean_terminated_length": 377.6875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.5918853282928467, "epoch": 1.6041666666666665, "frac_reward_zero_std": 0.25, "grad_norm": 0.8877034898248776, "kl": 0.04266280680894852, "learning_rate": 5.288353531676873e-07, "loss": 0.0823, "num_tokens": 56701406.0, "reward": -0.25, "reward_std": 0.5879635810852051, "rewards/decision_reward_func/mean": -0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.4952038526535034, "sampling/importance_sampling_ratio/mean": 0.9998062252998352, "sampling/importance_sampling_ratio/min": 0.679018497467041, "sampling/sampling_logp_difference/max": 0.4022625684738159, "sampling/sampling_logp_difference/mean": 0.01688825711607933, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 299.484375, "completions/mean_terminated_length": 299.484375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.2933143377304077, "epoch": 1.6053921568627452, "frac_reward_zero_std": 1.0, "grad_norm": 0.013616374631349947, "kl": 0.01714586466550827, "learning_rate": 5.281241304962852e-07, "loss": 0.0002, "num_tokens": 56736861.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6034196615219116, "sampling/importance_sampling_ratio/mean": 0.9997900724411011, "sampling/importance_sampling_ratio/min": 0.6295931339263916, "sampling/sampling_logp_difference/max": 0.4721386432647705, "sampling/sampling_logp_difference/mean": 0.010896679945290089, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 280.046875, "completions/mean_terminated_length": 280.046875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.4352789521217346, "epoch": 1.6066176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 0.6016554128364243, "kl": 0.030850928276777267, "learning_rate": 5.2741285073468e-07, "loss": -0.0141, "num_tokens": 56780976.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.507885456085205, "sampling/importance_sampling_ratio/mean": 0.9997565746307373, "sampling/importance_sampling_ratio/min": 0.6715707182884216, "sampling/sampling_logp_difference/max": 0.41070830821990967, "sampling/sampling_logp_difference/mean": 0.013864089734852314, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 362.171875, "completions/mean_terminated_length": 362.171875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.5661037564277649, "epoch": 1.607843137254902, "frac_reward_zero_std": 0.5, "grad_norm": 1.8316241907141286, "kl": 0.03530064970254898, "learning_rate": 5.267015153267245e-07, "loss": -0.0009, "num_tokens": 56823979.0, "reward": 0.46875, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.580573558807373, "sampling/importance_sampling_ratio/mean": 1.0002721548080444, "sampling/importance_sampling_ratio/min": 0.6916414499282837, "sampling/sampling_logp_difference/max": 0.45778775215148926, "sampling/sampling_logp_difference/mean": 0.017244597896933556, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 938.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 339.078125, "completions/mean_terminated_length": 339.078125, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.3894118666648865, "epoch": 1.6090686274509802, "frac_reward_zero_std": 0.75, "grad_norm": 0.686601979010763, "kl": 0.020519252866506577, "learning_rate": 5.259901257163844e-07, "loss": -0.0433, "num_tokens": 56862224.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5235482454299927, "sampling/importance_sampling_ratio/mean": 0.9999945759773254, "sampling/importance_sampling_ratio/min": 0.605646550655365, "sampling/sampling_logp_difference/max": 0.5014586448669434, "sampling/sampling_logp_difference/mean": 0.011995394714176655, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 848.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 328.609375, "completions/mean_terminated_length": 328.609375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.4247559905052185, "epoch": 1.6102941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.013998303601757755, "kl": 0.023011667653918266, "learning_rate": 5.252786833477358e-07, "loss": 0.0002, "num_tokens": 56902887.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6388980150222778, "sampling/importance_sampling_ratio/mean": 1.0003724098205566, "sampling/importance_sampling_ratio/min": 0.7028102874755859, "sampling/sampling_logp_difference/max": 0.49402403831481934, "sampling/sampling_logp_difference/mean": 0.01402522623538971, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 430.125, "completions/mean_terminated_length": 430.125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.41761744022369385, "epoch": 1.6115196078431373, "frac_reward_zero_std": 0.75, "grad_norm": 0.516371384711874, "kl": 0.01752132922410965, "learning_rate": 5.245671896649612e-07, "loss": 0.017, "num_tokens": 56949439.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.5189265012741089, "sampling/importance_sampling_ratio/mean": 1.0001829862594604, "sampling/importance_sampling_ratio/min": 0.6771732568740845, "sampling/sampling_logp_difference/max": 0.41800379753112793, "sampling/sampling_logp_difference/mean": 0.012386979535222054, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 389.328125, "completions/mean_terminated_length": 389.328125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.43409156799316406, "epoch": 1.6127450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.013790507936794023, "kl": 0.02091209404170513, "learning_rate": 5.23855646112348e-07, "loss": 0.0002, "num_tokens": 56989652.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.383296012878418, "sampling/importance_sampling_ratio/mean": 0.9998171925544739, "sampling/importance_sampling_ratio/min": 0.6395378112792969, "sampling/sampling_logp_difference/max": 0.4470095634460449, "sampling/sampling_logp_difference/mean": 0.013360124081373215, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1096.0, "completions/max_terminated_length": 1096.0, "completions/mean_length": 453.359375, "completions/mean_terminated_length": 453.359375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.48839354515075684, "epoch": 1.6139705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.01437786787772195, "kl": 0.019227419048547745, "learning_rate": 5.231440541342845e-07, "loss": 0.0002, "num_tokens": 57034171.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3660587072372437, "sampling/importance_sampling_ratio/mean": 0.9995009303092957, "sampling/importance_sampling_ratio/min": 0.6725849509239197, "sampling/sampling_logp_difference/max": 0.3966268301010132, "sampling/sampling_logp_difference/mean": 0.01420808769762516, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1141.0, "completions/max_terminated_length": 1141.0, "completions/mean_length": 371.96875, "completions/mean_terminated_length": 371.96875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.3580455183982849, "epoch": 1.6151960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.014610992321402675, "kl": 0.018459849059581757, "learning_rate": 5.224324151752575e-07, "loss": 0.0002, "num_tokens": 57078361.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4565290212631226, "sampling/importance_sampling_ratio/mean": 0.9999150633811951, "sampling/importance_sampling_ratio/min": 0.42266350984573364, "sampling/sampling_logp_difference/max": 0.8611788749694824, "sampling/sampling_logp_difference/mean": 0.012079011648893356, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1072.0, "completions/max_terminated_length": 1072.0, "completions/mean_length": 427.65625, "completions/mean_terminated_length": 427.65625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "entropy": 0.5759284496307373, "epoch": 1.616421568627451, "frac_reward_zero_std": 0.5, "grad_norm": 0.6888465760755285, "kl": 0.02887854352593422, "learning_rate": 5.217207306798487e-07, "loss": -0.0273, "num_tokens": 57122851.0, "reward": 0.1875, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.4297244548797607, "sampling/importance_sampling_ratio/mean": 1.0004889965057373, "sampling/importance_sampling_ratio/min": 0.6294035911560059, "sampling/sampling_logp_difference/max": 0.4629826545715332, "sampling/sampling_logp_difference/mean": 0.016665194183588028, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 917.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 422.1875, "completions/mean_terminated_length": 422.1875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.4340866208076477, "epoch": 1.6176470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.47389631486290595, "kl": 0.01924830675125122, "learning_rate": 5.210090020927326e-07, "loss": 0.0015, "num_tokens": 57168031.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.7520631551742554, "sampling/importance_sampling_ratio/mean": 1.000364065170288, "sampling/importance_sampling_ratio/min": 0.5411010384559631, "sampling/sampling_logp_difference/max": 0.6141493320465088, "sampling/sampling_logp_difference/mean": 0.013051658868789673, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1043.0, "completions/max_terminated_length": 1043.0, "completions/mean_length": 397.0625, "completions/mean_terminated_length": 397.0625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.46444281935691833, "epoch": 1.6188725490196079, "frac_reward_zero_std": 0.75, "grad_norm": 0.5145075060856645, "kl": 0.02183469571173191, "learning_rate": 5.202972308586735e-07, "loss": 0.0198, "num_tokens": 57216211.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.3998280763626099, "sampling/importance_sampling_ratio/mean": 0.9997694492340088, "sampling/importance_sampling_ratio/min": 0.7172219157218933, "sampling/sampling_logp_difference/max": 0.3363494873046875, "sampling/sampling_logp_difference/mean": 0.014103527180850506, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 263.515625, "completions/mean_terminated_length": 263.515625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.47052711248397827, "epoch": 1.6200980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.018023672905613825, "kl": 0.022840339690446854, "learning_rate": 5.195854184225213e-07, "loss": 0.0002, "num_tokens": 57251348.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3638145923614502, "sampling/importance_sampling_ratio/mean": 1.0000064373016357, "sampling/importance_sampling_ratio/min": 0.7321192622184753, "sampling/sampling_logp_difference/max": 0.3118119239807129, "sampling/sampling_logp_difference/mean": 0.01498958095908165, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 340.4375, "completions/mean_terminated_length": 340.4375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.39750367403030396, "epoch": 1.6213235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.009906634073114621, "kl": 0.018839262425899506, "learning_rate": 5.188735662292107e-07, "loss": 0.0002, "num_tokens": 57288976.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4694010019302368, "sampling/importance_sampling_ratio/mean": 1.0001397132873535, "sampling/importance_sampling_ratio/min": 0.6994335055351257, "sampling/sampling_logp_difference/max": 0.384854793548584, "sampling/sampling_logp_difference/mean": 0.013698600232601166, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 285.34375, "completions/mean_terminated_length": 285.34375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.5426774024963379, "epoch": 1.6225490196078431, "frac_reward_zero_std": 0.75, "grad_norm": 0.674672851238747, "kl": 0.03588826581835747, "learning_rate": 5.181616757237561e-07, "loss": 0.0293, "num_tokens": 57322406.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3881616592407227, "sampling/importance_sampling_ratio/mean": 0.9996458888053894, "sampling/importance_sampling_ratio/min": 0.696521520614624, "sampling/sampling_logp_difference/max": 0.3616565465927124, "sampling/sampling_logp_difference/mean": 0.017123229801654816, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 306.96875, "completions/mean_terminated_length": 306.96875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.5529435873031616, "epoch": 1.6237745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0253821861228549, "kl": 0.03208804875612259, "learning_rate": 5.174497483512505e-07, "loss": 0.0003, "num_tokens": 57359524.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5008279085159302, "sampling/importance_sampling_ratio/mean": 0.9998080730438232, "sampling/importance_sampling_ratio/min": 0.6869498491287231, "sampling/sampling_logp_difference/max": 0.4060169458389282, "sampling/sampling_logp_difference/mean": 0.016780907288193703, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 775.0, "completions/max_terminated_length": 775.0, "completions/mean_length": 372.796875, "completions/mean_terminated_length": 372.796875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.41577965021133423, "epoch": 1.625, "frac_reward_zero_std": 0.75, "grad_norm": 0.634599441567809, "kl": 0.021819457411766052, "learning_rate": 5.167377855568612e-07, "loss": -0.0499, "num_tokens": 57404343.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.4235868453979492, "sampling/importance_sampling_ratio/mean": 1.0000929832458496, "sampling/importance_sampling_ratio/min": 0.6482874155044556, "sampling/sampling_logp_difference/max": 0.43342113494873047, "sampling/sampling_logp_difference/mean": 0.013490250334143639, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1135.0, "completions/max_terminated_length": 1135.0, "completions/mean_length": 385.515625, "completions/mean_terminated_length": 385.515625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.4982171058654785, "epoch": 1.6262254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.017217851302999125, "kl": 0.02795340120792389, "learning_rate": 5.160257887858277e-07, "loss": 0.0003, "num_tokens": 57454824.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996830224990845, "sampling/importance_sampling_ratio/min": 0.447765052318573, "sampling/sampling_logp_difference/max": 0.8034866452217102, "sampling/sampling_logp_difference/mean": 0.015436938963830471, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 303.828125, "completions/mean_terminated_length": 303.828125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.4264777898788452, "epoch": 1.6274509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 0.6280948258727408, "kl": 0.017718281596899033, "learning_rate": 5.15313759483458e-07, "loss": -0.0027, "num_tokens": 57489501.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.3204236030578613, "sampling/importance_sampling_ratio/mean": 1.0000232458114624, "sampling/importance_sampling_ratio/min": 0.6298591494560242, "sampling/sampling_logp_difference/max": 0.46225905418395996, "sampling/sampling_logp_difference/mean": 0.01338912919163704, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 319.921875, "completions/mean_terminated_length": 319.921875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.6912826895713806, "epoch": 1.6286764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9426504088620756, "kl": 0.04457981139421463, "learning_rate": 5.146016990951268e-07, "loss": -0.0222, "num_tokens": 57528904.0, "reward": 0.625, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994935989379883, "sampling/importance_sampling_ratio/min": 0.5255391597747803, "sampling/sampling_logp_difference/max": 0.7760670185089111, "sampling/sampling_logp_difference/mean": 0.01928030140697956, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 316.21875, "completions/mean_terminated_length": 316.21875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.44513198733329773, "epoch": 1.6299019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.9886624058441099, "kl": 0.025066902860999107, "learning_rate": 5.138896090662714e-07, "loss": -0.0431, "num_tokens": 57569286.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.489701509475708, "sampling/importance_sampling_ratio/mean": 1.0002708435058594, "sampling/importance_sampling_ratio/min": 0.34855493903160095, "sampling/sampling_logp_difference/max": 1.0539593696594238, "sampling/sampling_logp_difference/mean": 0.014408689923584461, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1188.0, "completions/max_terminated_length": 1188.0, "completions/mean_length": 382.734375, "completions/mean_terminated_length": 382.734375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.6488862633705139, "epoch": 1.6311274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 0.6408435041303193, "kl": 0.034781016409397125, "learning_rate": 5.131774908423898e-07, "loss": 0.0491, "num_tokens": 57609237.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.4022067785263062, "sampling/importance_sampling_ratio/mean": 1.0005584955215454, "sampling/importance_sampling_ratio/min": 0.6043157577514648, "sampling/sampling_logp_difference/max": 0.5036584138870239, "sampling/sampling_logp_difference/mean": 0.017911020666360855, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 299.21875, "completions/mean_terminated_length": 299.21875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.4060841202735901, "epoch": 1.6323529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.010524248217524936, "kl": 0.01664718985557556, "learning_rate": 5.124653458690365e-07, "loss": 0.0002, "num_tokens": 57646707.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.463536024093628, "sampling/importance_sampling_ratio/mean": 1.0003422498703003, "sampling/importance_sampling_ratio/min": 0.5967661738395691, "sampling/sampling_logp_difference/max": 0.5162298679351807, "sampling/sampling_logp_difference/mean": 0.013655497692525387, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 350.953125, "completions/mean_terminated_length": 350.953125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.5313653349876404, "epoch": 1.633578431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.48395912817708414, "kl": 0.02626187354326248, "learning_rate": 5.117531755918207e-07, "loss": -0.0119, "num_tokens": 57685264.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001341104507446, "sampling/importance_sampling_ratio/min": 0.620541512966156, "sampling/sampling_logp_difference/max": 0.7057394981384277, "sampling/sampling_logp_difference/mean": 0.015271037817001343, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 839.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 346.078125, "completions/mean_terminated_length": 346.078125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.4409234821796417, "epoch": 1.6348039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.010312653581784091, "kl": 0.02112944982945919, "learning_rate": 5.110409814564031e-07, "loss": 0.0002, "num_tokens": 57730149.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5417319536209106, "sampling/importance_sampling_ratio/mean": 1.0003923177719116, "sampling/importance_sampling_ratio/min": 0.6558611989021301, "sampling/sampling_logp_difference/max": 0.4329063892364502, "sampling/sampling_logp_difference/mean": 0.013928340747952461, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 397.46875, "completions/mean_terminated_length": 397.46875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.6736617684364319, "epoch": 1.6360294117647058, "frac_reward_zero_std": 0.25, "grad_norm": 0.9827653278593027, "kl": 0.03820805624127388, "learning_rate": 5.103287649084926e-07, "loss": -0.0902, "num_tokens": 57772659.0, "reward": 0.1875, "reward_std": 0.5915650129318237, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.5753419399261475, "sampling/importance_sampling_ratio/mean": 1.0000438690185547, "sampling/importance_sampling_ratio/min": 0.4084920883178711, "sampling/sampling_logp_difference/max": 0.8952827453613281, "sampling/sampling_logp_difference/mean": 0.01734151504933834, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 343.0625, "completions/mean_terminated_length": 343.0625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.3775976300239563, "epoch": 1.6372549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.009859689565519712, "kl": 0.016228042542934418, "learning_rate": 5.096165273938435e-07, "loss": 0.0002, "num_tokens": 57811495.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3311017751693726, "sampling/importance_sampling_ratio/mean": 1.0001236200332642, "sampling/importance_sampling_ratio/min": 0.6891391277313232, "sampling/sampling_logp_difference/max": 0.372312068939209, "sampling/sampling_logp_difference/mean": 0.011993644759058952, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 342.625, "completions/mean_terminated_length": 342.625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.5663101077079773, "epoch": 1.6384803921568627, "frac_reward_zero_std": 0.5, "grad_norm": 0.8371012621325898, "kl": 0.03884133696556091, "learning_rate": 5.089042703582533e-07, "loss": 0.0004, "num_tokens": 57850767.0, "reward": 0.34375, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000510215759277, "sampling/importance_sampling_ratio/min": 0.09631389379501343, "sampling/sampling_logp_difference/max": 2.3401427268981934, "sampling/sampling_logp_difference/mean": 0.016509518027305603, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1387.0, "completions/max_terminated_length": 1387.0, "completions/mean_length": 477.78125, "completions/mean_terminated_length": 477.78125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.5440219640731812, "epoch": 1.6397058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.6491978866344292, "kl": 0.023156650364398956, "learning_rate": 5.081919952475583e-07, "loss": -0.0013, "num_tokens": 57904737.0, "reward": 0.1875, "reward_std": 0.4787135720252991, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.00016188621521, "sampling/importance_sampling_ratio/min": 0.7343925833702087, "sampling/sampling_logp_difference/max": 0.7371196746826172, "sampling/sampling_logp_difference/mean": 0.015118763782083988, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 335.5625, "completions/mean_terminated_length": 335.5625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.5403843522071838, "epoch": 1.6409313725490198, "frac_reward_zero_std": 0.5, "grad_norm": 0.9274223677690238, "kl": 0.02991124615073204, "learning_rate": 5.074797035076318e-07, "loss": 0.0666, "num_tokens": 57940885.0, "reward": 0.65625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.4768396615982056, "sampling/importance_sampling_ratio/mean": 0.9997713565826416, "sampling/importance_sampling_ratio/min": 0.6259216070175171, "sampling/sampling_logp_difference/max": 0.46853017807006836, "sampling/sampling_logp_difference/mean": 0.01671626977622509, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 520.828125, "completions/mean_terminated_length": 520.828125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.5139927864074707, "epoch": 1.642156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.47497428421109683, "kl": 0.019954916089773178, "learning_rate": 5.067673965843812e-07, "loss": -0.0274, "num_tokens": 57990922.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.522385597229004, "sampling/importance_sampling_ratio/mean": 0.9996585845947266, "sampling/importance_sampling_ratio/min": 0.6211332678794861, "sampling/sampling_logp_difference/max": 0.4762096405029297, "sampling/sampling_logp_difference/mean": 0.014839756302535534, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1436.0, "completions/max_terminated_length": 1436.0, "completions/mean_length": 412.78125, "completions/mean_terminated_length": 412.78125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.43742215633392334, "epoch": 1.6433823529411766, "frac_reward_zero_std": 0.75, "grad_norm": 0.4574620880353474, "kl": 0.018622832372784615, "learning_rate": 5.060550759237441e-07, "loss": -0.0379, "num_tokens": 58033820.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.5002503395080566, "sampling/importance_sampling_ratio/mean": 1.0000674724578857, "sampling/importance_sampling_ratio/min": 0.6603621244430542, "sampling/sampling_logp_difference/max": 0.41496700048446655, "sampling/sampling_logp_difference/mean": 0.01258531678467989, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 315.296875, "completions/mean_terminated_length": 315.296875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.43639785051345825, "epoch": 1.6446078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.013975648313702365, "kl": 0.022092022001743317, "learning_rate": 5.053427429716866e-07, "loss": 0.0002, "num_tokens": 58073599.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4530924558639526, "sampling/importance_sampling_ratio/mean": 0.9995454549789429, "sampling/importance_sampling_ratio/min": 0.7335337996482849, "sampling/sampling_logp_difference/max": 0.37369394302368164, "sampling/sampling_logp_difference/mean": 0.013373750261962414, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 356.46875, "completions/mean_terminated_length": 356.46875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.4959559142589569, "epoch": 1.6458333333333335, "frac_reward_zero_std": 0.5, "grad_norm": 0.7997742181352666, "kl": 0.024809401482343674, "learning_rate": 5.046303991741993e-07, "loss": -0.0435, "num_tokens": 58113885.0, "reward": 0.75, "reward_std": 0.4472135901451111, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.7512223720550537, "sampling/importance_sampling_ratio/mean": 1.0001559257507324, "sampling/importance_sampling_ratio/min": 0.7180253863334656, "sampling/sampling_logp_difference/max": 0.5603140592575073, "sampling/sampling_logp_difference/mean": 0.014193592593073845, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1512.0, "completions/max_terminated_length": 1512.0, "completions/mean_length": 504.921875, "completions/mean_terminated_length": 504.921875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.5968375205993652, "epoch": 1.6470588235294117, "frac_reward_zero_std": 0.25, "grad_norm": 0.8124629594872771, "kl": 0.025826914235949516, "learning_rate": 5.039180459772949e-07, "loss": -0.0411, "num_tokens": 58164024.0, "reward": 0.40625, "reward_std": 0.6373475193977356, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6233336925506592, "sampling/importance_sampling_ratio/mean": 0.9999634027481079, "sampling/importance_sampling_ratio/min": 0.5543688535690308, "sampling/sampling_logp_difference/max": 0.5899250507354736, "sampling/sampling_logp_difference/mean": 0.01691528782248497, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1309.0, "completions/max_terminated_length": 1309.0, "completions/mean_length": 325.5625, "completions/mean_terminated_length": 325.5625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.4416135549545288, "epoch": 1.6482843137254903, "frac_reward_zero_std": 1.0, "grad_norm": 0.01415742105857661, "kl": 0.020850250497460365, "learning_rate": 5.032056848270056e-07, "loss": 0.0002, "num_tokens": 58199484.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3674914836883545, "sampling/importance_sampling_ratio/mean": 0.999557375907898, "sampling/importance_sampling_ratio/min": 0.6206791400909424, "sampling/sampling_logp_difference/max": 0.4769411087036133, "sampling/sampling_logp_difference/mean": 0.015572583302855492, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 331.0, "completions/mean_terminated_length": 331.0, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.41537681221961975, "epoch": 1.6495098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 0.6774363064962958, "kl": 0.021967682987451553, "learning_rate": 5.02493317169379e-07, "loss": 0.0309, "num_tokens": 58235356.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000340938568115, "sampling/importance_sampling_ratio/min": 0.6334532499313354, "sampling/sampling_logp_difference/max": 2.330294609069824, "sampling/sampling_logp_difference/mean": 0.013498147018253803, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 423.578125, "completions/mean_terminated_length": 423.578125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.48262256383895874, "epoch": 1.6507352941176472, "frac_reward_zero_std": 0.75, "grad_norm": 0.5856121636207079, "kl": 0.027152761816978455, "learning_rate": 5.017809444504767e-07, "loss": 0.0316, "num_tokens": 58281681.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.3321735858917236, "sampling/importance_sampling_ratio/mean": 0.9998631477355957, "sampling/importance_sampling_ratio/min": 0.6559626460075378, "sampling/sampling_logp_difference/max": 0.4216514825820923, "sampling/sampling_logp_difference/mean": 0.013837556354701519, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1417.0, "completions/max_terminated_length": 1417.0, "completions/mean_length": 458.984375, "completions/mean_terminated_length": 458.984375, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.4227965474128723, "epoch": 1.6519607843137254, "frac_reward_zero_std": 0.5, "grad_norm": 0.6374947639811238, "kl": 0.01989714801311493, "learning_rate": 5.010685681163698e-07, "loss": -0.0104, "num_tokens": 58331088.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.5496714115142822, "sampling/importance_sampling_ratio/mean": 1.000199317932129, "sampling/importance_sampling_ratio/min": 0.657691478729248, "sampling/sampling_logp_difference/max": 0.43804287910461426, "sampling/sampling_logp_difference/mean": 0.011917680501937866, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1244.0, "completions/max_terminated_length": 1244.0, "completions/mean_length": 388.84375, "completions/mean_terminated_length": 388.84375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.5720001459121704, "epoch": 1.653186274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.6812947945088246, "kl": 0.024591978639364243, "learning_rate": 5.003561896131374e-07, "loss": 0.073, "num_tokens": 58377030.0, "reward": -0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.6245834827423096, "sampling/importance_sampling_ratio/mean": 0.9998483061790466, "sampling/importance_sampling_ratio/min": 0.63031405210495, "sampling/sampling_logp_difference/max": 0.48525142669677734, "sampling/sampling_logp_difference/mean": 0.016770392656326294, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 371.390625, "completions/mean_terminated_length": 371.390625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.5570262670516968, "epoch": 1.6544117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 0.957136732383167, "kl": 0.025612296536564827, "learning_rate": 4.996438103868625e-07, "loss": -0.057, "num_tokens": 58419647.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4730937480926514, "sampling/importance_sampling_ratio/mean": 0.9999185800552368, "sampling/importance_sampling_ratio/min": 0.677151083946228, "sampling/sampling_logp_difference/max": 0.3898608684539795, "sampling/sampling_logp_difference/mean": 0.015323314815759659, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1036.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 443.25, "completions/mean_terminated_length": 443.25, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.5292783975601196, "epoch": 1.655637254901961, "frac_reward_zero_std": 0.5, "grad_norm": 0.7047312491154096, "kl": 0.02643897384405136, "learning_rate": 4.989314318836302e-07, "loss": -0.0011, "num_tokens": 58464575.0, "reward": 0.625, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.5051921606063843, "sampling/importance_sampling_ratio/mean": 1.0001921653747559, "sampling/importance_sampling_ratio/min": 0.6819129586219788, "sampling/sampling_logp_difference/max": 0.4089205265045166, "sampling/sampling_logp_difference/mean": 0.015219951048493385, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 231.8125, "completions/mean_terminated_length": 231.8125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.4789193272590637, "epoch": 1.656862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.020990108782955832, "kl": 0.03399135172367096, "learning_rate": 4.982190555495235e-07, "loss": 0.0003, "num_tokens": 58492947.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3742963075637817, "sampling/importance_sampling_ratio/mean": 1.000495433807373, "sampling/importance_sampling_ratio/min": 0.6464587450027466, "sampling/sampling_logp_difference/max": 0.4362459182739258, "sampling/sampling_logp_difference/mean": 0.0157027468085289, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1486.0, "completions/max_terminated_length": 1486.0, "completions/mean_length": 530.03125, "completions/mean_terminated_length": 530.03125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.41928917169570923, "epoch": 1.6580882352941178, "frac_reward_zero_std": 0.5, "grad_norm": 0.6021534107072857, "kl": 0.018277877941727638, "learning_rate": 4.975066828306209e-07, "loss": -0.0065, "num_tokens": 58542453.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4435739517211914, "sampling/importance_sampling_ratio/mean": 1.0000178813934326, "sampling/importance_sampling_ratio/min": 0.5392635464668274, "sampling/sampling_logp_difference/max": 0.6175508499145508, "sampling/sampling_logp_difference/mean": 0.011779162101447582, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 380.625, "completions/mean_terminated_length": 380.625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.5336524844169617, "epoch": 1.659313725490196, "frac_reward_zero_std": 0.5, "grad_norm": 0.8112952845750276, "kl": 0.029784411191940308, "learning_rate": 4.967943151729944e-07, "loss": 0.048, "num_tokens": 58581229.0, "reward": 0.53125, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4829119443893433, "sampling/importance_sampling_ratio/mean": 1.000385046005249, "sampling/importance_sampling_ratio/min": 0.6097726225852966, "sampling/sampling_logp_difference/max": 0.49466919898986816, "sampling/sampling_logp_difference/mean": 0.015564781613647938, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 361.265625, "completions/mean_terminated_length": 361.265625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3786468505859375, "epoch": 1.6605392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 0.49449937502596836, "kl": 0.023506734520196915, "learning_rate": 4.96081954022705e-07, "loss": -0.0274, "num_tokens": 58618990.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.491737723350525, "sampling/importance_sampling_ratio/mean": 0.9999800324440002, "sampling/importance_sampling_ratio/min": 0.6301649808883667, "sampling/sampling_logp_difference/max": 0.4617736339569092, "sampling/sampling_logp_difference/mean": 0.01205186266452074, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 362.609375, "completions/mean_terminated_length": 362.609375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.5420134663581848, "epoch": 1.6617647058823528, "frac_reward_zero_std": 0.75, "grad_norm": 0.593849065483842, "kl": 0.03071119263768196, "learning_rate": 4.953696008258008e-07, "loss": 0.0057, "num_tokens": 58657253.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.576406717300415, "sampling/importance_sampling_ratio/mean": 0.9996557235717773, "sampling/importance_sampling_ratio/min": 0.6839156150817871, "sampling/sampling_logp_difference/max": 0.45514798164367676, "sampling/sampling_logp_difference/mean": 0.015151049941778183, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 337.234375, "completions/mean_terminated_length": 337.234375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.5783308148384094, "epoch": 1.6629901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.5531591007088242, "kl": 0.03614196553826332, "learning_rate": 4.946572570283134e-07, "loss": -0.0306, "num_tokens": 58695220.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.401644229888916, "sampling/importance_sampling_ratio/mean": 0.9996453523635864, "sampling/importance_sampling_ratio/min": 0.7121204137802124, "sampling/sampling_logp_difference/max": 0.3395082950592041, "sampling/sampling_logp_difference/mean": 0.01715715415775776, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1189.0, "completions/max_terminated_length": 1189.0, "completions/mean_length": 509.671875, "completions/mean_terminated_length": 509.671875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.7143102288246155, "epoch": 1.6642156862745097, "frac_reward_zero_std": 0.25, "grad_norm": 0.7948255420644211, "kl": 0.04424666613340378, "learning_rate": 4.939449240762558e-07, "loss": -0.0085, "num_tokens": 58744591.0, "reward": 0.1875, "reward_std": 0.7191373109817505, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.5388120412826538, "sampling/importance_sampling_ratio/mean": 0.9998378753662109, "sampling/importance_sampling_ratio/min": 0.6414307951927185, "sampling/sampling_logp_difference/max": 0.4440540075302124, "sampling/sampling_logp_difference/mean": 0.017422858625650406, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 410.0, "completions/mean_terminated_length": 410.0, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.3461485803127289, "epoch": 1.6654411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.013579026390598152, "kl": 0.01853764057159424, "learning_rate": 4.932326034156189e-07, "loss": 0.0002, "num_tokens": 58789935.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4483983516693115, "sampling/importance_sampling_ratio/mean": 1.0000975131988525, "sampling/importance_sampling_ratio/min": 0.704343855381012, "sampling/sampling_logp_difference/max": 0.37045836448669434, "sampling/sampling_logp_difference/mean": 0.010500459931790829, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 314.140625, "completions/mean_terminated_length": 314.140625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.4333236813545227, "epoch": 1.6666666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 0.7189245863182316, "kl": 0.0245039165019989, "learning_rate": 4.925202964923683e-07, "loss": -0.0005, "num_tokens": 58824600.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5423049926757812, "sampling/importance_sampling_ratio/mean": 1.000321626663208, "sampling/importance_sampling_ratio/min": 0.4841820299625397, "sampling/sampling_logp_difference/max": 0.7252943515777588, "sampling/sampling_logp_difference/mean": 0.01334725134074688, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 366.46875, "completions/mean_terminated_length": 366.46875, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.4794554114341736, "epoch": 1.6678921568627452, "frac_reward_zero_std": 1.0, "grad_norm": 0.014027453634871472, "kl": 0.02458389848470688, "learning_rate": 4.918080047524417e-07, "loss": 0.0002, "num_tokens": 58863094.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.453819990158081, "sampling/importance_sampling_ratio/mean": 1.0003160238265991, "sampling/importance_sampling_ratio/min": 0.7513923048973083, "sampling/sampling_logp_difference/max": 0.37419450283050537, "sampling/sampling_logp_difference/mean": 0.014004962518811226, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 349.296875, "completions/mean_terminated_length": 349.296875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.5333895683288574, "epoch": 1.6691176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 0.5596858806796827, "kl": 0.04080923646688461, "learning_rate": 4.910957296417467e-07, "loss": -0.0046, "num_tokens": 58897721.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.4812757968902588, "sampling/importance_sampling_ratio/mean": 0.9999156594276428, "sampling/importance_sampling_ratio/min": 0.6171553134918213, "sampling/sampling_logp_difference/max": 0.4826345443725586, "sampling/sampling_logp_difference/mean": 0.015695620328187943, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 406.65625, "completions/mean_terminated_length": 406.65625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.5694663524627686, "epoch": 1.670343137254902, "frac_reward_zero_std": 0.5, "grad_norm": 0.7467535677571522, "kl": 0.0357922799885273, "learning_rate": 4.903834726061564e-07, "loss": -0.009, "num_tokens": 58945059.0, "reward": 0.46875, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6152846813201904, "sampling/importance_sampling_ratio/mean": 1.000563621520996, "sampling/importance_sampling_ratio/min": 0.6961621642112732, "sampling/sampling_logp_difference/max": 0.4795112609863281, "sampling/sampling_logp_difference/mean": 0.015204238705337048, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 950.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 432.078125, "completions/mean_terminated_length": 432.078125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.5879849195480347, "epoch": 1.6715686274509802, "frac_reward_zero_std": 0.5, "grad_norm": 0.7102684608385832, "kl": 0.028558911755681038, "learning_rate": 4.896712350915074e-07, "loss": -0.016, "num_tokens": 58998424.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.817140817642212, "sampling/importance_sampling_ratio/mean": 1.0001492500305176, "sampling/importance_sampling_ratio/min": 0.6137153506278992, "sampling/sampling_logp_difference/max": 0.597264289855957, "sampling/sampling_logp_difference/mean": 0.01614808663725853, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 301.046875, "completions/mean_terminated_length": 301.046875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.4201350212097168, "epoch": 1.6727941176470589, "frac_reward_zero_std": 0.75, "grad_norm": 0.5025472889545297, "kl": 0.030188269913196564, "learning_rate": 4.889590185435969e-07, "loss": -0.017, "num_tokens": 59036795.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.9056017398834229, "sampling/importance_sampling_ratio/mean": 0.9996373653411865, "sampling/importance_sampling_ratio/min": 0.5912477970123291, "sampling/sampling_logp_difference/max": 0.6447978019714355, "sampling/sampling_logp_difference/mean": 0.013029379770159721, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 837.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 385.140625, "completions/mean_terminated_length": 385.140625, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.8429678082466125, "epoch": 1.6740196078431373, "frac_reward_zero_std": 0.0, "grad_norm": 1.1422143866706955, "kl": 0.05727671831846237, "learning_rate": 4.882468244081792e-07, "loss": 0.0918, "num_tokens": 59085092.0, "reward": 0.53125, "reward_std": 0.7165650129318237, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.3756165504455566, "sampling/importance_sampling_ratio/mean": 1.0002497434616089, "sampling/importance_sampling_ratio/min": 0.6801486611366272, "sampling/sampling_logp_difference/max": 0.3854438066482544, "sampling/sampling_logp_difference/mean": 0.020658748224377632, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1122.0, "completions/max_terminated_length": 1122.0, "completions/mean_length": 420.296875, "completions/mean_terminated_length": 420.296875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.446441113948822, "epoch": 1.6752450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.5780351368118738, "kl": 0.026798605918884277, "learning_rate": 4.875346541309636e-07, "loss": 0.0143, "num_tokens": 59130439.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6478437185287476, "sampling/importance_sampling_ratio/mean": 0.9999286532402039, "sampling/importance_sampling_ratio/min": 0.6106590628623962, "sampling/sampling_logp_difference/max": 0.4994676113128662, "sampling/sampling_logp_difference/mean": 0.014183169230818748, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 415.03125, "completions/mean_terminated_length": 415.03125, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "entropy": 0.5041353702545166, "epoch": 1.6764705882352942, "frac_reward_zero_std": 0.75, "grad_norm": 0.42205524317924314, "kl": 0.029931802302598953, "learning_rate": 4.868225091576102e-07, "loss": -0.0352, "num_tokens": 59175209.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.4897260665893555, "sampling/importance_sampling_ratio/mean": 1.0003068447113037, "sampling/importance_sampling_ratio/min": 0.656509518623352, "sampling/sampling_logp_difference/max": 0.4208180904388428, "sampling/sampling_logp_difference/mean": 0.013385031372308731, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 347.484375, "completions/mean_terminated_length": 347.484375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.524351179599762, "epoch": 1.6776960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 0.7006827684600181, "kl": 0.024509914219379425, "learning_rate": 4.861103909337285e-07, "loss": 0.0738, "num_tokens": 59215976.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5076663494110107, "sampling/importance_sampling_ratio/mean": 1.0000810623168945, "sampling/importance_sampling_ratio/min": 0.6926467418670654, "sampling/sampling_logp_difference/max": 0.41056299209594727, "sampling/sampling_logp_difference/mean": 0.014268601313233376, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3313.0, "completions/max_terminated_length": 3313.0, "completions/mean_length": 633.84375, "completions/mean_terminated_length": 633.84375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.5733725428581238, "epoch": 1.678921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 0.5547349064869637, "kl": 0.027716079726815224, "learning_rate": 4.853983009048732e-07, "loss": -0.0023, "num_tokens": 59276990.0, "reward": 0.25, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.6598318815231323, "sampling/importance_sampling_ratio/mean": 1.000124216079712, "sampling/importance_sampling_ratio/min": 0.6994336843490601, "sampling/sampling_logp_difference/max": 0.506716251373291, "sampling/sampling_logp_difference/mean": 0.015067193657159805, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1054.0, "completions/max_terminated_length": 1054.0, "completions/mean_length": 450.0625, "completions/mean_terminated_length": 450.0625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.6266497373580933, "epoch": 1.6801470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8485901310780453, "kl": 0.030410034582018852, "learning_rate": 4.84686240516542e-07, "loss": -0.0283, "num_tokens": 59322514.0, "reward": 0.4375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.8577935695648193, "sampling/importance_sampling_ratio/mean": 1.000200867652893, "sampling/importance_sampling_ratio/min": 0.7085660696029663, "sampling/sampling_logp_difference/max": 0.619389533996582, "sampling/sampling_logp_difference/mean": 0.016702093183994293, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1071.0, "completions/max_terminated_length": 1071.0, "completions/mean_length": 374.765625, "completions/mean_terminated_length": 374.765625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.340493381023407, "epoch": 1.6813725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.011017276294275812, "kl": 0.01507528405636549, "learning_rate": 4.839742112141724e-07, "loss": 0.0001, "num_tokens": 59364387.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3715609312057495, "sampling/importance_sampling_ratio/mean": 0.9999875426292419, "sampling/importance_sampling_ratio/min": 0.6725244522094727, "sampling/sampling_logp_difference/max": 0.396716833114624, "sampling/sampling_logp_difference/mean": 0.010611426085233688, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1174.0, "completions/max_terminated_length": 1174.0, "completions/mean_length": 415.625, "completions/mean_terminated_length": 415.625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.5272084474563599, "epoch": 1.6825980392156863, "frac_reward_zero_std": 0.5, "grad_norm": 0.8005761188729728, "kl": 0.024490002542734146, "learning_rate": 4.832622144431388e-07, "loss": -0.0832, "num_tokens": 59409755.0, "reward": -0.3125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": -0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.8719145059585571, "sampling/importance_sampling_ratio/mean": 0.9999858140945435, "sampling/importance_sampling_ratio/min": 0.6042047739028931, "sampling/sampling_logp_difference/max": 0.6269617080688477, "sampling/sampling_logp_difference/mean": 0.014425510540604591, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 377.0, "completions/mean_terminated_length": 377.0, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.5714730024337769, "epoch": 1.6838235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.8970513449602454, "kl": 0.02688544988632202, "learning_rate": 4.825502516487496e-07, "loss": 0.0048, "num_tokens": 59453851.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5078890323638916, "sampling/importance_sampling_ratio/mean": 1.0000542402267456, "sampling/importance_sampling_ratio/min": 0.6154473423957825, "sampling/sampling_logp_difference/max": 0.48540592193603516, "sampling/sampling_logp_difference/mean": 0.015192912891507149, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1183.0, "completions/max_terminated_length": 1183.0, "completions/mean_length": 467.09375, "completions/mean_terminated_length": 467.09375, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.6460728049278259, "epoch": 1.6850490196078431, "frac_reward_zero_std": 0.5, "grad_norm": 0.7779860880305599, "kl": 0.03087739273905754, "learning_rate": 4.818383242762439e-07, "loss": 0.0336, "num_tokens": 59508561.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.507400631904602, "sampling/importance_sampling_ratio/mean": 0.9999992251396179, "sampling/importance_sampling_ratio/min": 0.5632230639457703, "sampling/sampling_logp_difference/max": 0.5740795135498047, "sampling/sampling_logp_difference/mean": 0.016230221837759018, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 378.375, "completions/mean_terminated_length": 378.375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.5749035477638245, "epoch": 1.6862745098039216, "frac_reward_zero_std": 0.5, "grad_norm": 0.8307793511956973, "kl": 0.04477039724588394, "learning_rate": 4.811264337707894e-07, "loss": -0.0376, "num_tokens": 59547961.0, "reward": 0.15625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.5869113206863403, "sampling/importance_sampling_ratio/mean": 0.9995193481445312, "sampling/importance_sampling_ratio/min": 0.48859500885009766, "sampling/sampling_logp_difference/max": 0.7162213325500488, "sampling/sampling_logp_difference/mean": 0.015549402683973312, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 308.6875, "completions/mean_terminated_length": 308.6875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.4991961717605591, "epoch": 1.6875, "frac_reward_zero_std": 1.0, "grad_norm": 0.01682401155068973, "kl": 0.029461301863193512, "learning_rate": 4.804145815774786e-07, "loss": 0.0003, "num_tokens": 59587717.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4982579946517944, "sampling/importance_sampling_ratio/mean": 1.0000228881835938, "sampling/importance_sampling_ratio/min": 0.6147016286849976, "sampling/sampling_logp_difference/max": 0.4866182804107666, "sampling/sampling_logp_difference/mean": 0.0160943865776062, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1193.0, "completions/max_terminated_length": 1193.0, "completions/mean_length": 358.90625, "completions/mean_terminated_length": 358.90625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.5455276966094971, "epoch": 1.6887254901960784, "frac_reward_zero_std": 0.5, "grad_norm": 0.8362110718653366, "kl": 0.033447206020355225, "learning_rate": 4.797027691413267e-07, "loss": -0.0845, "num_tokens": 59625151.0, "reward": 0.25, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.9823559522628784, "sampling/importance_sampling_ratio/mean": 1.0002342462539673, "sampling/importance_sampling_ratio/min": 0.6171622276306152, "sampling/sampling_logp_difference/max": 0.6842859983444214, "sampling/sampling_logp_difference/mean": 0.015623294748365879, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 465.453125, "completions/mean_terminated_length": 465.453125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.515173077583313, "epoch": 1.6899509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 0.4758536326923105, "kl": 0.022913526743650436, "learning_rate": 4.789909979072673e-07, "loss": -0.0042, "num_tokens": 59676412.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.9346567392349243, "sampling/importance_sampling_ratio/mean": 1.000035047531128, "sampling/importance_sampling_ratio/min": 0.4871019124984741, "sampling/sampling_logp_difference/max": 0.7192819118499756, "sampling/sampling_logp_difference/mean": 0.013937775045633316, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1719.0, "completions/max_terminated_length": 1719.0, "completions/mean_length": 382.359375, "completions/mean_terminated_length": 382.359375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.5538427829742432, "epoch": 1.6911764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.015986194911633698, "kl": 0.03016209602355957, "learning_rate": 4.782792693201513e-07, "loss": 0.0003, "num_tokens": 59718307.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.420745611190796, "sampling/importance_sampling_ratio/mean": 1.000272274017334, "sampling/importance_sampling_ratio/min": 0.6954985857009888, "sampling/sampling_logp_difference/max": 0.363126277923584, "sampling/sampling_logp_difference/mean": 0.015957122668623924, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 381.28125, "completions/mean_terminated_length": 381.28125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.522920548915863, "epoch": 1.6924019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.6456484836529416, "kl": 0.029715491458773613, "learning_rate": 4.775675848247427e-07, "loss": 0.0074, "num_tokens": 59761173.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.3671391010284424, "sampling/importance_sampling_ratio/mean": 0.9999483227729797, "sampling/importance_sampling_ratio/min": 0.7405766844749451, "sampling/sampling_logp_difference/max": 0.31272029876708984, "sampling/sampling_logp_difference/mean": 0.01535993255674839, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 310.40625, "completions/mean_terminated_length": 310.40625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.5051084756851196, "epoch": 1.6936274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 0.6951060704985536, "kl": 0.03599838539958, "learning_rate": 4.768559458657155e-07, "loss": 0.0242, "num_tokens": 59796047.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3670469522476196, "sampling/importance_sampling_ratio/mean": 1.0000723600387573, "sampling/importance_sampling_ratio/min": 0.662371039390564, "sampling/sampling_logp_difference/max": 0.4119293689727783, "sampling/sampling_logp_difference/mean": 0.014995885081589222, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 403.8125, "completions/mean_terminated_length": 403.8125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.43025535345077515, "epoch": 1.6948529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.6283002383282397, "kl": 0.020327363163232803, "learning_rate": 4.7614435388765203e-07, "loss": 0.0095, "num_tokens": 59847379.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.475846529006958, "sampling/importance_sampling_ratio/mean": 1.0001144409179688, "sampling/importance_sampling_ratio/min": 0.6543276309967041, "sampling/sampling_logp_difference/max": 0.4241471290588379, "sampling/sampling_logp_difference/mean": 0.012347444891929626, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 396.1875, "completions/mean_terminated_length": 396.1875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.558464527130127, "epoch": 1.696078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.463478413831952, "kl": 0.026855889707803726, "learning_rate": 4.7543281033503885e-07, "loss": 0.0001, "num_tokens": 59891695.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.3381022214889526, "sampling/importance_sampling_ratio/mean": 0.9997965693473816, "sampling/importance_sampling_ratio/min": 0.6176419258117676, "sampling/sampling_logp_difference/max": 0.48184633255004883, "sampling/sampling_logp_difference/mean": 0.015164588578045368, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1508.0, "completions/max_terminated_length": 1508.0, "completions/mean_length": 577.15625, "completions/mean_terminated_length": 577.15625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.4701322019100189, "epoch": 1.6973039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 0.5351961468124572, "kl": 0.019043393433094025, "learning_rate": 4.747213166522644e-07, "loss": 0.0196, "num_tokens": 59946697.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6481446027755737, "sampling/importance_sampling_ratio/mean": 0.9995334148406982, "sampling/importance_sampling_ratio/min": 0.6415576338768005, "sampling/sampling_logp_difference/max": 0.499650239944458, "sampling/sampling_logp_difference/mean": 0.012550273910164833, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 359.203125, "completions/mean_terminated_length": 359.203125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.4772397577762604, "epoch": 1.6985294117647058, "frac_reward_zero_std": 0.75, "grad_norm": 0.676268011138134, "kl": 0.020526200532913208, "learning_rate": 4.740098742836156e-07, "loss": -0.0128, "num_tokens": 59983766.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.40131676197052, "sampling/importance_sampling_ratio/mean": 0.9997522830963135, "sampling/importance_sampling_ratio/min": 0.02865932136774063, "sampling/sampling_logp_difference/max": 3.552276611328125, "sampling/sampling_logp_difference/mean": 0.013200536370277405, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 797.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 384.375, "completions/mean_terminated_length": 384.375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.5951744318008423, "epoch": 1.6997549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 0.9550219913398915, "kl": 0.03319135308265686, "learning_rate": 4.732984846732755e-07, "loss": -0.007, "num_tokens": 60026590.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.7108122110366821, "sampling/importance_sampling_ratio/mean": 1.000368595123291, "sampling/importance_sampling_ratio/min": 0.45969974994659424, "sampling/sampling_logp_difference/max": 0.7771817445755005, "sampling/sampling_logp_difference/mean": 0.016438623890280724, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 384.890625, "completions/mean_terminated_length": 384.890625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.3364757299423218, "epoch": 1.7009803921568627, "frac_reward_zero_std": 1.0, "grad_norm": 0.011722039211019912, "kl": 0.017027191817760468, "learning_rate": 4.725871492653199e-07, "loss": 0.0002, "num_tokens": 60068919.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3627750873565674, "sampling/importance_sampling_ratio/mean": 1.000518560409546, "sampling/importance_sampling_ratio/min": 0.7180386185646057, "sampling/sampling_logp_difference/max": 0.331231951713562, "sampling/sampling_logp_difference/mean": 0.010395506396889687, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 352.734375, "completions/mean_terminated_length": 352.734375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.4312295913696289, "epoch": 1.7022058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.015249312024058282, "kl": 0.021868973970413208, "learning_rate": 4.718758695037149e-07, "loss": 0.0002, "num_tokens": 60107350.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5290780067443848, "sampling/importance_sampling_ratio/mean": 1.0002734661102295, "sampling/importance_sampling_ratio/min": 0.5883421301841736, "sampling/sampling_logp_difference/max": 0.5304466485977173, "sampling/sampling_logp_difference/mean": 0.01356316264718771, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1173.0, "completions/max_terminated_length": 1173.0, "completions/mean_length": 436.828125, "completions/mean_terminated_length": 436.828125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.5775150060653687, "epoch": 1.7034313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 0.5463214822728255, "kl": 0.02232363447546959, "learning_rate": 4.7116464683231285e-07, "loss": -0.0028, "num_tokens": 60157947.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4251651763916016, "sampling/importance_sampling_ratio/mean": 0.9997456669807434, "sampling/importance_sampling_ratio/min": 0.5255572199821472, "sampling/sampling_logp_difference/max": 0.6432962417602539, "sampling/sampling_logp_difference/mean": 0.015583600848913193, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1051.0, "completions/max_terminated_length": 1051.0, "completions/mean_length": 491.875, "completions/mean_terminated_length": 491.875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.5091962814331055, "epoch": 1.704656862745098, "frac_reward_zero_std": 0.25, "grad_norm": 0.8356926303419379, "kl": 0.02988806739449501, "learning_rate": 4.704534826948509e-07, "loss": 0.0139, "num_tokens": 60210675.0, "reward": 0.09375, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.504958152770996, "sampling/importance_sampling_ratio/mean": 1.0002710819244385, "sampling/importance_sampling_ratio/min": 0.542429506778717, "sampling/sampling_logp_difference/max": 0.6116971969604492, "sampling/sampling_logp_difference/mean": 0.014230703935027122, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1191.0, "completions/max_terminated_length": 1191.0, "completions/mean_length": 341.484375, "completions/mean_terminated_length": 341.484375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.4007101356983185, "epoch": 1.7058823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.011428335549277304, "kl": 0.02520425245165825, "learning_rate": 4.6974237853494744e-07, "loss": 0.0002, "num_tokens": 60251602.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5746550559997559, "sampling/importance_sampling_ratio/mean": 0.9998218417167664, "sampling/importance_sampling_ratio/min": 0.7527239918708801, "sampling/sampling_logp_difference/max": 0.45403623580932617, "sampling/sampling_logp_difference/mean": 0.013760387897491455, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 353.1875, "completions/mean_terminated_length": 353.1875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.6021209955215454, "epoch": 1.7071078431372548, "frac_reward_zero_std": 0.5, "grad_norm": 0.9083126244237899, "kl": 0.03629348427057266, "learning_rate": 4.690313357960985e-07, "loss": 0.019, "num_tokens": 60296382.0, "reward": -0.40625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": -0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.3768171072006226, "sampling/importance_sampling_ratio/mean": 0.9999872446060181, "sampling/importance_sampling_ratio/min": 0.39856407046318054, "sampling/sampling_logp_difference/max": 0.9198870658874512, "sampling/sampling_logp_difference/mean": 0.01679941639304161, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1902.0, "completions/max_terminated_length": 1902.0, "completions/mean_length": 463.09375, "completions/mean_terminated_length": 463.09375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.5053274631500244, "epoch": 1.7083333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 0.6310555166370054, "kl": 0.018979713320732117, "learning_rate": 4.68320355921676e-07, "loss": -0.0831, "num_tokens": 60343892.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.3073242902755737, "sampling/importance_sampling_ratio/mean": 0.9997541308403015, "sampling/importance_sampling_ratio/min": 0.6485984325408936, "sampling/sampling_logp_difference/max": 0.4329414367675781, "sampling/sampling_logp_difference/mean": 0.013809582218527794, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 894.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 469.21875, "completions/mean_terminated_length": 469.21875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.4405631124973297, "epoch": 1.7095588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.01716192129913421, "kl": 0.027116959914565086, "learning_rate": 4.67609440354924e-07, "loss": 0.0003, "num_tokens": 60394866.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6088883876800537, "sampling/importance_sampling_ratio/mean": 1.0000524520874023, "sampling/importance_sampling_ratio/min": 0.6964912414550781, "sampling/sampling_logp_difference/max": 0.4755434989929199, "sampling/sampling_logp_difference/mean": 0.013535741716623306, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1107.0, "completions/max_terminated_length": 1107.0, "completions/mean_length": 506.28125, "completions/mean_terminated_length": 506.28125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.7243881225585938, "epoch": 1.7107843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 0.5947748270809331, "kl": 0.037422239780426025, "learning_rate": 4.668985905389563e-07, "loss": 0.0161, "num_tokens": 60449988.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5548114776611328, "sampling/importance_sampling_ratio/mean": 1.00002121925354, "sampling/importance_sampling_ratio/min": 0.6771607398986816, "sampling/sampling_logp_difference/max": 0.44135427474975586, "sampling/sampling_logp_difference/mean": 0.017734896391630173, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 447.53125, "completions/mean_terminated_length": 447.53125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.5219602584838867, "epoch": 1.7120098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 0.5896016278942978, "kl": 0.01887696236371994, "learning_rate": 4.661878079167526e-07, "loss": 0.0371, "num_tokens": 60501462.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4986566305160522, "sampling/importance_sampling_ratio/mean": 0.999770998954773, "sampling/importance_sampling_ratio/min": 0.613283634185791, "sampling/sampling_logp_difference/max": 0.48892784118652344, "sampling/sampling_logp_difference/mean": 0.015722088515758514, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 932.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 350.6875, "completions/mean_terminated_length": 350.6875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.4232078790664673, "epoch": 1.7132352941176472, "frac_reward_zero_std": 1.0, "grad_norm": 0.012814527310615607, "kl": 0.02294398844242096, "learning_rate": 4.6547709393115677e-07, "loss": 0.0002, "num_tokens": 60539058.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3983029127120972, "sampling/importance_sampling_ratio/mean": 1.0001277923583984, "sampling/importance_sampling_ratio/min": 0.6097384095191956, "sampling/sampling_logp_difference/max": 0.49472522735595703, "sampling/sampling_logp_difference/mean": 0.014455164782702923, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 947.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 403.171875, "completions/mean_terminated_length": 403.171875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.47014403343200684, "epoch": 1.7144607843137254, "frac_reward_zero_std": 0.5, "grad_norm": 0.7623283092767664, "kl": 0.021225709468126297, "learning_rate": 4.6476645002487295e-07, "loss": -0.0142, "num_tokens": 60587213.0, "reward": 0.4375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.3067526817321777, "sampling/importance_sampling_ratio/mean": 0.9999449253082275, "sampling/importance_sampling_ratio/min": 0.6684949994087219, "sampling/sampling_logp_difference/max": 0.402726411819458, "sampling/sampling_logp_difference/mean": 0.01336597464978695, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 401.515625, "completions/mean_terminated_length": 401.515625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.4511163532733917, "epoch": 1.715686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.012349604075924604, "kl": 0.020265234634280205, "learning_rate": 4.640558776404639e-07, "loss": 0.0002, "num_tokens": 60635742.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4285809993743896, "sampling/importance_sampling_ratio/mean": 0.9998980760574341, "sampling/importance_sampling_ratio/min": 0.669140100479126, "sampling/sampling_logp_difference/max": 0.40176188945770264, "sampling/sampling_logp_difference/mean": 0.01360153779387474, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 349.375, "completions/mean_terminated_length": 349.375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.5163445472717285, "epoch": 1.7169117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.017607600394010364, "kl": 0.022327817976474762, "learning_rate": 4.633453782203458e-07, "loss": 0.0002, "num_tokens": 60672662.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5119978189468384, "sampling/importance_sampling_ratio/mean": 0.999786913394928, "sampling/importance_sampling_ratio/min": 0.6418190002441406, "sampling/sampling_logp_difference/max": 0.4434490203857422, "sampling/sampling_logp_difference/mean": 0.015258736908435822, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 314.09375, "completions/mean_terminated_length": 314.09375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.663071870803833, "epoch": 1.718137254901961, "frac_reward_zero_std": 0.5, "grad_norm": 0.8304903246583996, "kl": 0.04236898943781853, "learning_rate": 4.626349532067879e-07, "loss": -0.0115, "num_tokens": 60711132.0, "reward": -0.03125, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5467315912246704, "sampling/importance_sampling_ratio/mean": 0.9998723268508911, "sampling/importance_sampling_ratio/min": 0.6839174628257751, "sampling/sampling_logp_difference/max": 0.4361441135406494, "sampling/sampling_logp_difference/mean": 0.01830805093050003, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 276.015625, "completions/mean_terminated_length": 276.015625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.38174697756767273, "epoch": 1.719362745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.014371635190537338, "kl": 0.02614191733300686, "learning_rate": 4.6192460404190793e-07, "loss": 0.0002, "num_tokens": 60746061.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.30144202709198, "sampling/importance_sampling_ratio/mean": 1.0004346370697021, "sampling/importance_sampling_ratio/min": 0.645760715007782, "sampling/sampling_logp_difference/max": 0.4373262822628021, "sampling/sampling_logp_difference/mean": 0.012667704373598099, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1557.0, "completions/max_terminated_length": 1557.0, "completions/mean_length": 366.578125, "completions/mean_terminated_length": 366.578125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.5505074262619019, "epoch": 1.7205882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 0.5975776130979529, "kl": 0.030293773859739304, "learning_rate": 4.6121433216766935e-07, "loss": 0.0006, "num_tokens": 60788722.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.814648151397705, "sampling/importance_sampling_ratio/mean": 1.0003122091293335, "sampling/importance_sampling_ratio/min": 0.52201908826828, "sampling/sampling_logp_difference/max": 0.6500511169433594, "sampling/sampling_logp_difference/mean": 0.01671779155731201, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1113.0, "completions/max_terminated_length": 1113.0, "completions/mean_length": 391.21875, "completions/mean_terminated_length": 391.21875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.36218374967575073, "epoch": 1.721813725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.009760394207790733, "kl": 0.014501956291496754, "learning_rate": 4.605041390258794e-07, "loss": 0.0001, "num_tokens": 60831664.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5571976900100708, "sampling/importance_sampling_ratio/mean": 1.0002107620239258, "sampling/importance_sampling_ratio/min": 0.5575301051139832, "sampling/sampling_logp_difference/max": 0.5842387676239014, "sampling/sampling_logp_difference/mean": 0.011337365955114365, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1201.0, "completions/max_terminated_length": 1201.0, "completions/mean_length": 386.140625, "completions/mean_terminated_length": 386.140625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.5114694833755493, "epoch": 1.7230392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 0.6281722121822745, "kl": 0.025203991681337357, "learning_rate": 4.5979402605818514e-07, "loss": 0.0412, "num_tokens": 60875529.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.3666759729385376, "sampling/importance_sampling_ratio/mean": 1.0000169277191162, "sampling/importance_sampling_ratio/min": 0.6207950115203857, "sampling/sampling_logp_difference/max": 0.47675442695617676, "sampling/sampling_logp_difference/mean": 0.01406181138008833, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 261.40625, "completions/mean_terminated_length": 261.40625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.5692524313926697, "epoch": 1.7242647058823528, "frac_reward_zero_std": 0.75, "grad_norm": 0.9206097273784791, "kl": 0.028854407370090485, "learning_rate": 4.5908399470607104e-07, "loss": -0.0072, "num_tokens": 60908723.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.65715491771698, "sampling/importance_sampling_ratio/mean": 1.0000444650650024, "sampling/importance_sampling_ratio/min": 0.7599767446517944, "sampling/sampling_logp_difference/max": 0.5051021575927734, "sampling/sampling_logp_difference/mean": 0.01717287302017212, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 490.203125, "completions/mean_terminated_length": 490.203125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.5821753144264221, "epoch": 1.7254901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.4796605162006866, "kl": 0.022998251020908356, "learning_rate": 4.5837404641085535e-07, "loss": -0.0073, "num_tokens": 60966576.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.3650848865509033, "sampling/importance_sampling_ratio/mean": 0.9997649192810059, "sampling/importance_sampling_ratio/min": 0.6619628071784973, "sampling/sampling_logp_difference/max": 0.41254591941833496, "sampling/sampling_logp_difference/mean": 0.016484282910823822, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1267.0, "completions/max_terminated_length": 1267.0, "completions/mean_length": 444.625, "completions/mean_terminated_length": 444.625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.330674409866333, "epoch": 1.7267156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.00902299858880049, "kl": 0.011907960288226604, "learning_rate": 4.576641826136884e-07, "loss": 0.0001, "num_tokens": 61013544.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4064304828643799, "sampling/importance_sampling_ratio/mean": 1.0003035068511963, "sampling/importance_sampling_ratio/min": 0.6629406213760376, "sampling/sampling_logp_difference/max": 0.4110698699951172, "sampling/sampling_logp_difference/mean": 0.010982787236571312, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 367.859375, "completions/mean_terminated_length": 367.859375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.46461552381515503, "epoch": 1.7279411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 0.6165563999527335, "kl": 0.021965527907013893, "learning_rate": 4.5695440475554864e-07, "loss": -0.049, "num_tokens": 61054143.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.8925371170043945, "sampling/importance_sampling_ratio/mean": 1.000130295753479, "sampling/importance_sampling_ratio/min": 0.6254145503044128, "sampling/sampling_logp_difference/max": 0.6379183530807495, "sampling/sampling_logp_difference/mean": 0.013991720974445343, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 892.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 387.0, "completions/mean_terminated_length": 387.0, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.560430109500885, "epoch": 1.7291666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 0.6374905294356991, "kl": 0.028867237269878387, "learning_rate": 4.5624471427724036e-07, "loss": -0.0099, "num_tokens": 61092895.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6237764358520508, "sampling/importance_sampling_ratio/mean": 1.000072717666626, "sampling/importance_sampling_ratio/min": 0.6137487292289734, "sampling/sampling_logp_difference/max": 0.48816967010498047, "sampling/sampling_logp_difference/mean": 0.01615874469280243, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 420.84375, "completions/mean_terminated_length": 420.84375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.5032528042793274, "epoch": 1.7303921568627452, "frac_reward_zero_std": 0.75, "grad_norm": 0.49935831312281886, "kl": 0.024506868794560432, "learning_rate": 4.5553511261939e-07, "loss": -0.0124, "num_tokens": 61139157.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.730507493019104, "sampling/importance_sampling_ratio/mean": 0.9999330043792725, "sampling/importance_sampling_ratio/min": 0.5000596642494202, "sampling/sampling_logp_difference/max": 0.6930278539657593, "sampling/sampling_logp_difference/mean": 0.013983811251819134, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 292.15625, "completions/mean_terminated_length": 292.15625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.4151991307735443, "epoch": 1.7316176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.014311570696893832, "kl": 0.02069142460823059, "learning_rate": 4.5482560122244407e-07, "loss": 0.0002, "num_tokens": 61172271.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4350132942199707, "sampling/importance_sampling_ratio/mean": 1.000031590461731, "sampling/importance_sampling_ratio/min": 0.6566081047058105, "sampling/sampling_logp_difference/max": 0.4206678867340088, "sampling/sampling_logp_difference/mean": 0.014357825741171837, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1276.0, "completions/max_terminated_length": 1276.0, "completions/mean_length": 561.21875, "completions/mean_terminated_length": 561.21875, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "entropy": 0.444418728351593, "epoch": 1.732843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.4168514868707038, "kl": 0.015179133974015713, "learning_rate": 4.541161815266658e-07, "loss": 0.0461, "num_tokens": 61226205.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.952035903930664, "sampling/importance_sampling_ratio/mean": 1.0002570152282715, "sampling/importance_sampling_ratio/min": 0.3940466642379761, "sampling/sampling_logp_difference/max": 0.9312859773635864, "sampling/sampling_logp_difference/mean": 0.012415735982358456, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 352.015625, "completions/mean_terminated_length": 352.015625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.4078255891799927, "epoch": 1.7340686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.012056578133926705, "kl": 0.018420029431581497, "learning_rate": 4.534068549721324e-07, "loss": 0.0002, "num_tokens": 61263582.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5769739151000977, "sampling/importance_sampling_ratio/mean": 1.0001169443130493, "sampling/importance_sampling_ratio/min": 0.6919344067573547, "sampling/sampling_logp_difference/max": 0.455507755279541, "sampling/sampling_logp_difference/mean": 0.013405097648501396, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 691.0, "completions/max_terminated_length": 691.0, "completions/mean_length": 322.140625, "completions/mean_terminated_length": 322.140625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.5349186658859253, "epoch": 1.7352941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.01753894720198703, "kl": 0.023845473304390907, "learning_rate": 4.5269762299873144e-07, "loss": 0.0003, "num_tokens": 61305607.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6288418769836426, "sampling/importance_sampling_ratio/mean": 0.9996519684791565, "sampling/importance_sampling_ratio/min": 0.6512447595596313, "sampling/sampling_logp_difference/max": 0.4878692626953125, "sampling/sampling_logp_difference/mean": 0.015781627967953682, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 443.015625, "completions/mean_terminated_length": 443.015625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.5705926418304443, "epoch": 1.7365196078431373, "frac_reward_zero_std": 0.5, "grad_norm": 0.6267243647623533, "kl": 0.036997079849243164, "learning_rate": 4.519884870461591e-07, "loss": 0.0058, "num_tokens": 61353560.0, "reward": 0.1875, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.6008520126342773, "sampling/importance_sampling_ratio/mean": 1.0000805854797363, "sampling/importance_sampling_ratio/min": 0.5320678949356079, "sampling/sampling_logp_difference/max": 0.6309841871261597, "sampling/sampling_logp_difference/mean": 0.015715232118964195, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 325.15625, "completions/mean_terminated_length": 325.15625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.43567946553230286, "epoch": 1.7377450980392157, "frac_reward_zero_std": 0.5, "grad_norm": 0.8770073526733293, "kl": 0.028793416917324066, "learning_rate": 4.512794485539165e-07, "loss": -0.0775, "num_tokens": 61388050.0, "reward": 0.4375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.4851877689361572, "sampling/importance_sampling_ratio/mean": 1.0001757144927979, "sampling/importance_sampling_ratio/min": 0.7065754532814026, "sampling/sampling_logp_difference/max": 0.3955411911010742, "sampling/sampling_logp_difference/mean": 0.01360471174120903, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1156.0, "completions/max_terminated_length": 1156.0, "completions/mean_length": 435.6875, "completions/mean_terminated_length": 435.6875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.363567978143692, "epoch": 1.7389705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.009882571834708023, "kl": 0.012376422062516212, "learning_rate": 4.505705089613068e-07, "loss": 0.0001, "num_tokens": 61432670.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.556091070175171, "sampling/importance_sampling_ratio/mean": 1.0001575946807861, "sampling/importance_sampling_ratio/min": 0.6340370774269104, "sampling/sampling_logp_difference/max": 0.45564794540405273, "sampling/sampling_logp_difference/mean": 0.011644650250673294, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 864.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 345.15625, "completions/mean_terminated_length": 345.15625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.48325589299201965, "epoch": 1.7401960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 0.6212400270975448, "kl": 0.031444378197193146, "learning_rate": 4.4986166970743233e-07, "loss": -0.0069, "num_tokens": 61468968.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.5036461353302002, "sampling/importance_sampling_ratio/mean": 0.9999182224273682, "sampling/importance_sampling_ratio/min": 0.6256378293037415, "sampling/sampling_logp_difference/max": 0.46898365020751953, "sampling/sampling_logp_difference/mean": 0.014654017984867096, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1477.0, "completions/max_terminated_length": 1477.0, "completions/mean_length": 452.34375, "completions/mean_terminated_length": 452.34375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.5893973112106323, "epoch": 1.741421568627451, "frac_reward_zero_std": 0.5, "grad_norm": 0.8300675694897262, "kl": 0.024165090173482895, "learning_rate": 4.4915293223119205e-07, "loss": 0.0781, "num_tokens": 61513950.0, "reward": 0.53125, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5790122747421265, "sampling/importance_sampling_ratio/mean": 0.9997885823249817, "sampling/importance_sampling_ratio/min": 0.6754451990127563, "sampling/sampling_logp_difference/max": 0.4567995071411133, "sampling/sampling_logp_difference/mean": 0.01645284704864025, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1071.0, "completions/max_terminated_length": 1071.0, "completions/mean_length": 413.53125, "completions/mean_terminated_length": 413.53125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.5912594795227051, "epoch": 1.7426470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.5451076239916255, "kl": 0.028784409165382385, "learning_rate": 4.484442979712783e-07, "loss": -0.0331, "num_tokens": 61563360.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.611443042755127, "sampling/importance_sampling_ratio/mean": 0.9999720454216003, "sampling/importance_sampling_ratio/min": 0.6247479915618896, "sampling/sampling_logp_difference/max": 0.47713005542755127, "sampling/sampling_logp_difference/mean": 0.016936011612415314, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 238.984375, "completions/mean_terminated_length": 238.984375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.457419753074646, "epoch": 1.7438725490196079, "frac_reward_zero_std": 0.75, "grad_norm": 0.7229292729351489, "kl": 0.03824680298566818, "learning_rate": 4.477357683661733e-07, "loss": -0.0123, "num_tokens": 61592927.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.4546467065811157, "sampling/importance_sampling_ratio/mean": 1.0003113746643066, "sampling/importance_sampling_ratio/min": 0.7020077109336853, "sampling/sampling_logp_difference/max": 0.37476301193237305, "sampling/sampling_logp_difference/mean": 0.014509645290672779, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1132.0, "completions/max_terminated_length": 1132.0, "completions/mean_length": 354.03125, "completions/mean_terminated_length": 354.03125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.49068111181259155, "epoch": 1.7450980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.5114036380007675, "kl": 0.037268176674842834, "learning_rate": 4.470273448541475e-07, "loss": -0.008, "num_tokens": 61629969.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.7692718505859375, "sampling/importance_sampling_ratio/mean": 0.9997608661651611, "sampling/importance_sampling_ratio/min": 0.6179003119468689, "sampling/sampling_logp_difference/max": 0.5705680847167969, "sampling/sampling_logp_difference/mean": 0.014699338003993034, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 411.328125, "completions/mean_terminated_length": 411.328125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.5300052165985107, "epoch": 1.7463235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.6130669004898056, "kl": 0.024302810430526733, "learning_rate": 4.4631902887325567e-07, "loss": 0.0155, "num_tokens": 61678566.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6203794479370117, "sampling/importance_sampling_ratio/mean": 0.9997835159301758, "sampling/importance_sampling_ratio/min": 0.6197841763496399, "sampling/sampling_logp_difference/max": 0.48266029357910156, "sampling/sampling_logp_difference/mean": 0.01625633053481579, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 317.546875, "completions/mean_terminated_length": 317.546875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.5314945578575134, "epoch": 1.7475490196078431, "frac_reward_zero_std": 0.5, "grad_norm": 0.9156865219592848, "kl": 0.026602067053318024, "learning_rate": 4.4561082186133456e-07, "loss": 0.0463, "num_tokens": 61711465.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.5765149593353271, "sampling/importance_sampling_ratio/mean": 0.9999828338623047, "sampling/importance_sampling_ratio/min": 0.6059923768043518, "sampling/sampling_logp_difference/max": 0.5008878707885742, "sampling/sampling_logp_difference/mean": 0.01641952060163021, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1143.0, "completions/max_terminated_length": 1143.0, "completions/mean_length": 359.625, "completions/mean_terminated_length": 359.625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.4257274568080902, "epoch": 1.7487745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 0.7413938279319696, "kl": 0.017228685319423676, "learning_rate": 4.4490272525599936e-07, "loss": 0.0257, "num_tokens": 61754689.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.577222466468811, "sampling/importance_sampling_ratio/mean": 1.000342845916748, "sampling/importance_sampling_ratio/min": 0.6091461181640625, "sampling/sampling_logp_difference/max": 0.49569714069366455, "sampling/sampling_logp_difference/mean": 0.013375840149819851, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 313.078125, "completions/mean_terminated_length": 313.078125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.42069098353385925, "epoch": 1.75, "frac_reward_zero_std": 0.5, "grad_norm": 0.9242446209021791, "kl": 0.027408171445131302, "learning_rate": 4.4419474049464135e-07, "loss": 0.0203, "num_tokens": 61789638.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4258304834365845, "sampling/importance_sampling_ratio/mean": 1.000454068183899, "sampling/importance_sampling_ratio/min": 0.7177959680557251, "sampling/sampling_logp_difference/max": 0.3547544479370117, "sampling/sampling_logp_difference/mean": 0.013510225340723991, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1403.0, "completions/max_terminated_length": 1403.0, "completions/mean_length": 370.421875, "completions/mean_terminated_length": 370.421875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.440681129693985, "epoch": 1.7512254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.015176895003142721, "kl": 0.03973357006907463, "learning_rate": 4.43486869014425e-07, "loss": 0.0003, "num_tokens": 61834401.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.574513554573059, "sampling/importance_sampling_ratio/mean": 0.999466061592102, "sampling/importance_sampling_ratio/min": 0.35683637857437134, "sampling/sampling_logp_difference/max": 1.0304780006408691, "sampling/sampling_logp_difference/mean": 0.01360119879245758, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 341.796875, "completions/mean_terminated_length": 341.796875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.43917596340179443, "epoch": 1.7524509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.01199523404744453, "kl": 0.018051765859127045, "learning_rate": 4.427791122522841e-07, "loss": 0.0002, "num_tokens": 61882596.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4961236715316772, "sampling/importance_sampling_ratio/mean": 1.0000447034835815, "sampling/importance_sampling_ratio/min": 0.6467688679695129, "sampling/sampling_logp_difference/max": 0.435766339302063, "sampling/sampling_logp_difference/mean": 0.014651206322014332, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1124.0, "completions/max_terminated_length": 1124.0, "completions/mean_length": 371.359375, "completions/mean_terminated_length": 371.359375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.5207093954086304, "epoch": 1.7536764705882353, "frac_reward_zero_std": 0.25, "grad_norm": 1.0733347293938962, "kl": 0.03368696570396423, "learning_rate": 4.420714716449203e-07, "loss": 0.0279, "num_tokens": 61923291.0, "reward": -0.0625, "reward_std": 0.5351393222808838, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.527761459350586, "sampling/importance_sampling_ratio/mean": 0.9999028444290161, "sampling/importance_sampling_ratio/min": 0.6817911863327026, "sampling/sampling_logp_difference/max": 0.42380356788635254, "sampling/sampling_logp_difference/mean": 0.015794405713677406, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/max_terminated_length": 610.0, "completions/mean_length": 441.75, "completions/mean_terminated_length": 441.75, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.42244404554367065, "epoch": 1.7549019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.011754117349979326, "kl": 0.018265558406710625, "learning_rate": 4.413639486287991e-07, "loss": 0.0002, "num_tokens": 61970107.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4366421699523926, "sampling/importance_sampling_ratio/mean": 0.999782145023346, "sampling/importance_sampling_ratio/min": 0.7246673107147217, "sampling/sampling_logp_difference/max": 0.3623085021972656, "sampling/sampling_logp_difference/mean": 0.01327145379036665, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 295.203125, "completions/mean_terminated_length": 295.203125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.42125624418258667, "epoch": 1.7561274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 0.7965852355663415, "kl": 0.018481414765119553, "learning_rate": 4.406565446401476e-07, "loss": 0.0741, "num_tokens": 62004872.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3873989582061768, "sampling/importance_sampling_ratio/mean": 1.0002280473709106, "sampling/importance_sampling_ratio/min": 0.6299288868904114, "sampling/sampling_logp_difference/max": 0.46214842796325684, "sampling/sampling_logp_difference/mean": 0.013951700180768967, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 295.3125, "completions/mean_terminated_length": 295.3125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.5331114530563354, "epoch": 1.7573529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.8966282195487889, "kl": 0.032482996582984924, "learning_rate": 4.399492611149509e-07, "loss": 0.0937, "num_tokens": 62040572.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6008028984069824, "sampling/importance_sampling_ratio/mean": 1.0000114440917969, "sampling/importance_sampling_ratio/min": 0.6293747425079346, "sampling/sampling_logp_difference/max": 0.4705052375793457, "sampling/sampling_logp_difference/mean": 0.016933776438236237, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1035.0, "completions/max_terminated_length": 1035.0, "completions/mean_length": 395.171875, "completions/mean_terminated_length": 395.171875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.6094921231269836, "epoch": 1.758578431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.5413632671602457, "kl": 0.027127401903271675, "learning_rate": 4.392420994889498e-07, "loss": 0.0397, "num_tokens": 62082679.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.4661786556243896, "sampling/importance_sampling_ratio/mean": 0.9999503493309021, "sampling/importance_sampling_ratio/min": 0.6970701813697815, "sampling/sampling_logp_difference/max": 0.3826594352722168, "sampling/sampling_logp_difference/mean": 0.016828477382659912, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 263.671875, "completions/mean_terminated_length": 263.671875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.4405229687690735, "epoch": 1.7598039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.01576028455603872, "kl": 0.02316945791244507, "learning_rate": 4.385350611976376e-07, "loss": 0.0002, "num_tokens": 62115122.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6573705673217773, "sampling/importance_sampling_ratio/mean": 0.9999954104423523, "sampling/importance_sampling_ratio/min": 0.5869852304458618, "sampling/sampling_logp_difference/max": 0.5327556133270264, "sampling/sampling_logp_difference/mean": 0.015222668647766113, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1289.0, "completions/max_terminated_length": 1289.0, "completions/mean_length": 416.46875, "completions/mean_terminated_length": 416.46875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.5715458989143372, "epoch": 1.7610294117647058, "frac_reward_zero_std": 0.0, "grad_norm": 0.9475399576080671, "kl": 0.03986446559429169, "learning_rate": 4.3782814767625755e-07, "loss": 0.011, "num_tokens": 62159824.0, "reward": -0.65625, "reward_std": 0.7015564441680908, "rewards/decision_reward_func/mean": -0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.7708029747009277, "sampling/importance_sampling_ratio/mean": 1.0002617835998535, "sampling/importance_sampling_ratio/min": 0.6636242270469666, "sampling/sampling_logp_difference/max": 0.5714330673217773, "sampling/sampling_logp_difference/mean": 0.01580444537103176, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 401.65625, "completions/mean_terminated_length": 401.65625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.4422675371170044, "epoch": 1.7622549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 0.4924610315204174, "kl": 0.0195455402135849, "learning_rate": 4.371213603597987e-07, "loss": 0.0072, "num_tokens": 62202266.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6157500743865967, "sampling/importance_sampling_ratio/mean": 1.0001744031906128, "sampling/importance_sampling_ratio/min": 0.648239254951477, "sampling/sampling_logp_difference/max": 0.4797992706298828, "sampling/sampling_logp_difference/mean": 0.013468981720507145, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 399.171875, "completions/mean_terminated_length": 399.171875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.4580323100090027, "epoch": 1.7634803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 0.5643552208623269, "kl": 0.01764656789600849, "learning_rate": 4.3641470068299483e-07, "loss": 0.0057, "num_tokens": 62252981.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6253466606140137, "sampling/importance_sampling_ratio/mean": 1.0001741647720337, "sampling/importance_sampling_ratio/min": 0.5384230613708496, "sampling/sampling_logp_difference/max": 0.6191107034683228, "sampling/sampling_logp_difference/mean": 0.014030995778739452, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 423.359375, "completions/mean_terminated_length": 423.359375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.466898113489151, "epoch": 1.7647058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.012233817083115212, "kl": 0.019260874018073082, "learning_rate": 4.3570817008032044e-07, "loss": 0.0002, "num_tokens": 62297452.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.301651120185852, "sampling/importance_sampling_ratio/mean": 0.9999713897705078, "sampling/importance_sampling_ratio/min": 0.6392589211463928, "sampling/sampling_logp_difference/max": 0.44744575023651123, "sampling/sampling_logp_difference/mean": 0.012343171052634716, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 187.0, "completions/mean_terminated_length": 187.0, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.40521830320358276, "epoch": 1.7659313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 1.0660676547732741, "kl": 0.040749240666627884, "learning_rate": 4.350017699859877e-07, "loss": 0.0231, "num_tokens": 62322300.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6031315326690674, "sampling/importance_sampling_ratio/mean": 1.0000616312026978, "sampling/importance_sampling_ratio/min": 0.6626556515693665, "sampling/sampling_logp_difference/max": 0.47195887565612793, "sampling/sampling_logp_difference/mean": 0.015803378075361252, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 938.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 334.40625, "completions/mean_terminated_length": 334.40625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.503713846206665, "epoch": 1.767156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.7754017676995372, "kl": 0.02887231856584549, "learning_rate": 4.342955018339441e-07, "loss": -0.044, "num_tokens": 62359910.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5797401666641235, "sampling/importance_sampling_ratio/mean": 1.000038743019104, "sampling/importance_sampling_ratio/min": 0.7204994559288025, "sampling/sampling_logp_difference/max": 0.4572603702545166, "sampling/sampling_logp_difference/mean": 0.015299947001039982, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1203.0, "completions/max_terminated_length": 1203.0, "completions/mean_length": 557.953125, "completions/mean_terminated_length": 557.953125, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "entropy": 0.5509500503540039, "epoch": 1.7683823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 0.5493687777871651, "kl": 0.029574286192655563, "learning_rate": 4.335893670578694e-07, "loss": 0.0302, "num_tokens": 62417059.0, "reward": 0.5, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.430479645729065, "sampling/importance_sampling_ratio/mean": 0.9999963045120239, "sampling/importance_sampling_ratio/min": 0.545183539390564, "sampling/sampling_logp_difference/max": 0.6066327095031738, "sampling/sampling_logp_difference/mean": 0.01442105695605278, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 245.515625, "completions/mean_terminated_length": 245.515625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.4055136442184448, "epoch": 1.7696078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.01514314548380229, "kl": 0.02018251270055771, "learning_rate": 4.328833670911724e-07, "loss": 0.0002, "num_tokens": 62447892.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5574008226394653, "sampling/importance_sampling_ratio/mean": 0.9998613595962524, "sampling/importance_sampling_ratio/min": 0.5914222598075867, "sampling/sampling_logp_difference/max": 0.525225043296814, "sampling/sampling_logp_difference/mean": 0.01389995962381363, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 257.796875, "completions/mean_terminated_length": 257.796875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.5417602062225342, "epoch": 1.7708333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 0.8866290371912748, "kl": 0.04240913689136505, "learning_rate": 4.3217750336698803e-07, "loss": 0.0052, "num_tokens": 62478359.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002615451812744, "sampling/importance_sampling_ratio/min": 0.23607070744037628, "sampling/sampling_logp_difference/max": 1.4436239004135132, "sampling/sampling_logp_difference/mean": 0.016878895461559296, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 944.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 350.0625, "completions/mean_terminated_length": 350.0625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.4835432767868042, "epoch": 1.7720588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.7988909951789304, "kl": 0.021177800372242928, "learning_rate": 4.314717773181752e-07, "loss": 0.0761, "num_tokens": 62519611.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6020294427871704, "sampling/importance_sampling_ratio/mean": 1.0002670288085938, "sampling/importance_sampling_ratio/min": 0.6625544428825378, "sampling/sampling_logp_difference/max": 0.471271276473999, "sampling/sampling_logp_difference/mean": 0.0145870391279459, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 304.765625, "completions/mean_terminated_length": 304.765625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.5669981241226196, "epoch": 1.7732843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 0.8036284988920547, "kl": 0.03421687334775925, "learning_rate": 4.3076619037731287e-07, "loss": -0.0016, "num_tokens": 62554812.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.3993587493896484, "sampling/importance_sampling_ratio/mean": 0.999887228012085, "sampling/importance_sampling_ratio/min": 0.6517203450202942, "sampling/sampling_logp_difference/max": 0.42813968658447266, "sampling/sampling_logp_difference/mean": 0.017591536045074463, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 447.5, "completions/mean_terminated_length": 447.5, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.5348508358001709, "epoch": 1.7745098039215685, "frac_reward_zero_std": 0.5, "grad_norm": 0.6645700327502434, "kl": 0.02769404835999012, "learning_rate": 4.3006074397669836e-07, "loss": -0.0293, "num_tokens": 62603532.0, "reward": 0.71875, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.3929375410079956, "sampling/importance_sampling_ratio/mean": 1.0001814365386963, "sampling/importance_sampling_ratio/min": 0.6801581978797913, "sampling/sampling_logp_difference/max": 0.38542985916137695, "sampling/sampling_logp_difference/mean": 0.014506742358207703, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 758.0, "completions/max_terminated_length": 758.0, "completions/mean_length": 395.375, "completions/mean_terminated_length": 395.375, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "entropy": 0.4801896810531616, "epoch": 1.7757352941176472, "frac_reward_zero_std": 1.0, "grad_norm": 0.01454774517501953, "kl": 0.01804165542125702, "learning_rate": 4.293554395483425e-07, "loss": 0.0002, "num_tokens": 62651988.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5071555376052856, "sampling/importance_sampling_ratio/mean": 1.0000715255737305, "sampling/importance_sampling_ratio/min": 0.729070246219635, "sampling/sampling_logp_difference/max": 0.4102240800857544, "sampling/sampling_logp_difference/mean": 0.015342006459832191, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 413.65625, "completions/mean_terminated_length": 413.65625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.42184120416641235, "epoch": 1.7769607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 0.4602638506009457, "kl": 0.015398846939206123, "learning_rate": 4.2865027852396894e-07, "loss": -0.0142, "num_tokens": 62698526.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6020609140396118, "sampling/importance_sampling_ratio/mean": 1.0001267194747925, "sampling/importance_sampling_ratio/min": 0.7153663635253906, "sampling/sampling_logp_difference/max": 0.47129082679748535, "sampling/sampling_logp_difference/mean": 0.013200586661696434, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 271.6875, "completions/mean_terminated_length": 271.6875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.4879577159881592, "epoch": 1.778186274509804, "frac_reward_zero_std": 0.5, "grad_norm": 0.9305684622737863, "kl": 0.028846975415945053, "learning_rate": 4.2794526233501004e-07, "loss": 0.0178, "num_tokens": 62730554.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998946189880371, "sampling/importance_sampling_ratio/min": 0.6547195315361023, "sampling/sampling_logp_difference/max": 0.7525339126586914, "sampling/sampling_logp_difference/mean": 0.015504562295973301, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1087.0, "completions/max_terminated_length": 1087.0, "completions/mean_length": 374.953125, "completions/mean_terminated_length": 374.953125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.6469908952713013, "epoch": 1.7794117647058822, "frac_reward_zero_std": 0.75, "grad_norm": 0.4840757843853384, "kl": 0.03616541624069214, "learning_rate": 4.272403924126035e-07, "loss": -0.0099, "num_tokens": 62772311.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.4417272806167603, "sampling/importance_sampling_ratio/mean": 0.9994404315948486, "sampling/importance_sampling_ratio/min": 0.7131717801094055, "sampling/sampling_logp_difference/max": 0.3658418655395508, "sampling/sampling_logp_difference/mean": 0.017277806997299194, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 301.265625, "completions/mean_terminated_length": 301.265625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.5122078061103821, "epoch": 1.780637254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.7509320962175706, "kl": 0.024306297302246094, "learning_rate": 4.2653567018759103e-07, "loss": -0.0137, "num_tokens": 62812760.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.6104580163955688, "sampling/importance_sampling_ratio/mean": 0.999775767326355, "sampling/importance_sampling_ratio/min": 0.4998772442340851, "sampling/sampling_logp_difference/max": 0.6933927536010742, "sampling/sampling_logp_difference/mean": 0.015608852729201317, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1295.0, "completions/max_terminated_length": 1295.0, "completions/mean_length": 418.484375, "completions/mean_terminated_length": 418.484375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.4485792815685272, "epoch": 1.781862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.43471058264272205, "kl": 0.03146681189537048, "learning_rate": 4.258310970905139e-07, "loss": 0.0154, "num_tokens": 62860087.0, "reward": -0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": -0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.9141018390655518, "sampling/importance_sampling_ratio/mean": 1.000078558921814, "sampling/importance_sampling_ratio/min": 0.37716081738471985, "sampling/sampling_logp_difference/max": 0.975083589553833, "sampling/sampling_logp_difference/mean": 0.013401184231042862, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 904.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 408.078125, "completions/mean_terminated_length": 408.078125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.5625977516174316, "epoch": 1.7830882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 0.6413768516236167, "kl": 0.02530568093061447, "learning_rate": 4.251266745516112e-07, "loss": 0.0026, "num_tokens": 62909484.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.5904589891433716, "sampling/importance_sampling_ratio/mean": 0.999774158000946, "sampling/importance_sampling_ratio/min": 0.5355238914489746, "sampling/sampling_logp_difference/max": 0.6245098114013672, "sampling/sampling_logp_difference/mean": 0.01600048691034317, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 940.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 356.015625, "completions/mean_terminated_length": 356.015625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.44555431604385376, "epoch": 1.784313725490196, "frac_reward_zero_std": 0.5, "grad_norm": 0.8586605555853197, "kl": 0.03349577635526657, "learning_rate": 4.2442240400081556e-07, "loss": -0.0465, "num_tokens": 62951421.0, "reward": -0.125, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.4458138942718506, "sampling/importance_sampling_ratio/mean": 0.999867856502533, "sampling/importance_sampling_ratio/min": 0.47328078746795654, "sampling/sampling_logp_difference/max": 0.7480664253234863, "sampling/sampling_logp_difference/mean": 0.012996343895792961, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 366.796875, "completions/mean_terminated_length": 366.796875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.3459293246269226, "epoch": 1.7855392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 0.4913226383061533, "kl": 0.020625131204724312, "learning_rate": 4.2371828686775186e-07, "loss": 0.002, "num_tokens": 62995664.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.8449920415878296, "sampling/importance_sampling_ratio/mean": 0.9999894499778748, "sampling/importance_sampling_ratio/min": 0.7215782403945923, "sampling/sampling_logp_difference/max": 0.6124749183654785, "sampling/sampling_logp_difference/mean": 0.011292412877082825, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 270.984375, "completions/mean_terminated_length": 270.984375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.5471882820129395, "epoch": 1.7867647058823528, "frac_reward_zero_std": 0.25, "grad_norm": 1.1853392438145478, "kl": 0.05010911822319031, "learning_rate": 4.2301432458173316e-07, "loss": 0.0198, "num_tokens": 63025551.0, "reward": 0.0, "reward_std": 0.5501632690429688, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4187108278274536, "sampling/importance_sampling_ratio/mean": 1.0000181198120117, "sampling/importance_sampling_ratio/min": 0.6917418241500854, "sampling/sampling_logp_difference/max": 0.3685424327850342, "sampling/sampling_logp_difference/mean": 0.016128651797771454, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 275.75, "completions/mean_terminated_length": 275.75, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.5854610204696655, "epoch": 1.7879901960784315, "frac_reward_zero_std": 0.25, "grad_norm": 1.2040225223607817, "kl": 0.049702927470207214, "learning_rate": 4.223105185717585e-07, "loss": 0.0162, "num_tokens": 63058159.0, "reward": 0.4375, "reward_std": 0.7191373109817505, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5765693187713623, "sampling/importance_sampling_ratio/mean": 0.9999676942825317, "sampling/importance_sampling_ratio/min": 0.6331761479377747, "sampling/sampling_logp_difference/max": 0.45700669288635254, "sampling/sampling_logp_difference/mean": 0.017656579613685608, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 335.40625, "completions/mean_terminated_length": 335.40625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.4054657816886902, "epoch": 1.7892156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.015248188888563373, "kl": 0.0199271347373724, "learning_rate": 4.216068702665093e-07, "loss": 0.0002, "num_tokens": 63097033.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6190370321273804, "sampling/importance_sampling_ratio/mean": 1.0004897117614746, "sampling/importance_sampling_ratio/min": 0.6671188473701477, "sampling/sampling_logp_difference/max": 0.48183155059814453, "sampling/sampling_logp_difference/mean": 0.013406258076429367, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 317.21875, "completions/mean_terminated_length": 317.21875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.41668224334716797, "epoch": 1.7904411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.017044785362532466, "kl": 0.02057347074151039, "learning_rate": 4.2090338109434703e-07, "loss": 0.0002, "num_tokens": 63139079.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006015300750732, "sampling/importance_sampling_ratio/min": 0.6622359752655029, "sampling/sampling_logp_difference/max": 0.7329139709472656, "sampling/sampling_logp_difference/mean": 0.014342732727527618, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 299.4375, "completions/mean_terminated_length": 299.4375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.3400329351425171, "epoch": 1.7916666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.014135029373903185, "kl": 0.01702420599758625, "learning_rate": 4.202000524833105e-07, "loss": 0.0002, "num_tokens": 63178963.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4090752601623535, "sampling/importance_sampling_ratio/mean": 1.0001442432403564, "sampling/importance_sampling_ratio/min": 0.7705438733100891, "sampling/sampling_logp_difference/max": 0.34293365478515625, "sampling/sampling_logp_difference/mean": 0.011321531608700752, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 369.21875, "completions/mean_terminated_length": 369.21875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.5026389360427856, "epoch": 1.7928921568627452, "frac_reward_zero_std": 0.5, "grad_norm": 0.8635348308966668, "kl": 0.03159734606742859, "learning_rate": 4.194968858611117e-07, "loss": 0.0079, "num_tokens": 63221329.0, "reward": 0.53125, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.457558035850525, "sampling/importance_sampling_ratio/mean": 1.0000559091567993, "sampling/importance_sampling_ratio/min": 0.7295470237731934, "sampling/sampling_logp_difference/max": 0.37676239013671875, "sampling/sampling_logp_difference/mean": 0.013623170554637909, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 410.53125, "completions/mean_terminated_length": 410.53125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.5020631551742554, "epoch": 1.7941176470588234, "frac_reward_zero_std": 0.25, "grad_norm": 0.8841164706619437, "kl": 0.024950923398137093, "learning_rate": 4.187938826551346e-07, "loss": -0.0145, "num_tokens": 63274739.0, "reward": 0.6875, "reward_std": 0.5081988573074341, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.577500820159912, "sampling/importance_sampling_ratio/mean": 0.9999037981033325, "sampling/importance_sampling_ratio/min": 0.5758156776428223, "sampling/sampling_logp_difference/max": 0.5519676208496094, "sampling/sampling_logp_difference/mean": 0.014210136607289314, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 232.5, "completions/mean_terminated_length": 232.5, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.5459248423576355, "epoch": 1.795343137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.8159657310131957, "kl": 0.038304440677165985, "learning_rate": 4.180910442924311e-07, "loss": 0.0297, "num_tokens": 63303699.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.5155638456344604, "sampling/importance_sampling_ratio/mean": 1.0005266666412354, "sampling/importance_sampling_ratio/min": 0.5171644687652588, "sampling/sampling_logp_difference/max": 0.6593942642211914, "sampling/sampling_logp_difference/mean": 0.01752653531730175, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1081.0, "completions/max_terminated_length": 1081.0, "completions/mean_length": 379.28125, "completions/mean_terminated_length": 379.28125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.38977211713790894, "epoch": 1.7965686274509802, "frac_reward_zero_std": 0.5, "grad_norm": 0.8232328282476342, "kl": 0.020686611533164978, "learning_rate": 4.173883721997188e-07, "loss": 0.0912, "num_tokens": 63350149.0, "reward": 0.71875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.4332777261734009, "sampling/importance_sampling_ratio/mean": 0.999874472618103, "sampling/importance_sampling_ratio/min": 0.4987063407897949, "sampling/sampling_logp_difference/max": 0.6957378387451172, "sampling/sampling_logp_difference/mean": 0.012930499389767647, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 285.515625, "completions/mean_terminated_length": 285.515625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.44929394125938416, "epoch": 1.7977941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.017345591336806458, "kl": 0.0235343798995018, "learning_rate": 4.1668586780337713e-07, "loss": 0.0002, "num_tokens": 63382678.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9723293781280518, "sampling/importance_sampling_ratio/mean": 1.0001670122146606, "sampling/importance_sampling_ratio/min": 0.6147370934486389, "sampling/sampling_logp_difference/max": 0.6792153120040894, "sampling/sampling_logp_difference/mean": 0.015051258727908134, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 304.625, "completions/mean_terminated_length": 304.625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.43357861042022705, "epoch": 1.7990196078431373, "frac_reward_zero_std": 0.75, "grad_norm": 0.6070515599608368, "kl": 0.02341076359152794, "learning_rate": 4.159835325294457e-07, "loss": 0.031, "num_tokens": 63415086.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.471816062927246, "sampling/importance_sampling_ratio/mean": 1.0004558563232422, "sampling/importance_sampling_ratio/min": 0.38381603360176086, "sampling/sampling_logp_difference/max": 0.9575920104980469, "sampling/sampling_logp_difference/mean": 0.014780883677303791, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 263.359375, "completions/mean_terminated_length": 263.359375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.4711928963661194, "epoch": 1.8002450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.021606728348781468, "kl": 0.050927452743053436, "learning_rate": 4.152813678036208e-07, "loss": 0.0004, "num_tokens": 63450517.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.487105369567871, "sampling/importance_sampling_ratio/mean": 1.000678300857544, "sampling/importance_sampling_ratio/min": 0.6292483806610107, "sampling/sampling_logp_difference/max": 0.4632291793823242, "sampling/sampling_logp_difference/mean": 0.01608094945549965, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 270.90625, "completions/mean_terminated_length": 270.90625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.5326839685440063, "epoch": 1.8014705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.9297503112212775, "kl": 0.03916919603943825, "learning_rate": 4.145793750512522e-07, "loss": -0.0198, "num_tokens": 63484063.0, "reward": -0.59375, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": -0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.4221787452697754, "sampling/importance_sampling_ratio/mean": 1.0001248121261597, "sampling/importance_sampling_ratio/min": 0.6796563267707825, "sampling/sampling_logp_difference/max": 0.3861680030822754, "sampling/sampling_logp_difference/mean": 0.015976859256625175, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 304.859375, "completions/mean_terminated_length": 304.859375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.4646678566932678, "epoch": 1.8026960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 0.6195821078032595, "kl": 0.04260803759098053, "learning_rate": 4.1387755569734054e-07, "loss": 0.0328, "num_tokens": 63522102.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4538031816482544, "sampling/importance_sampling_ratio/mean": 0.9998424053192139, "sampling/importance_sampling_ratio/min": 0.6058667302131653, "sampling/sampling_logp_difference/max": 0.5010952949523926, "sampling/sampling_logp_difference/mean": 0.0145066287368536, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 292.90625, "completions/mean_terminated_length": 292.90625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.48339998722076416, "epoch": 1.803921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.584963234208766, "kl": 0.024720437824726105, "learning_rate": 4.131759111665348e-07, "loss": 0.0256, "num_tokens": 63561808.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.475376844406128, "sampling/importance_sampling_ratio/mean": 1.0000079870224, "sampling/importance_sampling_ratio/min": 0.6547030210494995, "sampling/sampling_logp_difference/max": 0.4235736131668091, "sampling/sampling_logp_difference/mean": 0.015745623037219048, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1749.0, "completions/max_terminated_length": 1749.0, "completions/mean_length": 314.484375, "completions/mean_terminated_length": 314.484375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.5659655332565308, "epoch": 1.8051470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.5507177843914661, "kl": 0.052392181009054184, "learning_rate": 4.1247444288312895e-07, "loss": 0.0059, "num_tokens": 63599391.0, "reward": -0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": -0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.4421037435531616, "sampling/importance_sampling_ratio/mean": 0.9996950030326843, "sampling/importance_sampling_ratio/min": 0.6435168981552124, "sampling/sampling_logp_difference/max": 0.4408069849014282, "sampling/sampling_logp_difference/mean": 0.017339564859867096, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 388.9375, "completions/mean_terminated_length": 388.9375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.714012622833252, "epoch": 1.8063725490196079, "frac_reward_zero_std": 0.5, "grad_norm": 0.7557983380444944, "kl": 0.04316096380352974, "learning_rate": 4.1177315227105926e-07, "loss": -0.0171, "num_tokens": 63645163.0, "reward": 0.375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.5310107469558716, "sampling/importance_sampling_ratio/mean": 0.9998548030853271, "sampling/importance_sampling_ratio/min": 0.5298771858215332, "sampling/sampling_logp_difference/max": 0.6351100206375122, "sampling/sampling_logp_difference/mean": 0.019356993958353996, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 309.578125, "completions/mean_terminated_length": 309.578125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.41453471779823303, "epoch": 1.8075980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.5700644228283666, "kl": 0.03430888056755066, "learning_rate": 4.1107204075390096e-07, "loss": 0.0095, "num_tokens": 63678320.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.4744290113449097, "sampling/importance_sampling_ratio/mean": 1.0004934072494507, "sampling/importance_sampling_ratio/min": 0.6959278583526611, "sampling/sampling_logp_difference/max": 0.38827085494995117, "sampling/sampling_logp_difference/mean": 0.013173743151128292, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 266.109375, "completions/mean_terminated_length": 266.109375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.3815256357192993, "epoch": 1.8088235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01785315419234385, "kl": 0.01850772649049759, "learning_rate": 4.1037110975486617e-07, "loss": 0.0002, "num_tokens": 63712519.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4880118370056152, "sampling/importance_sampling_ratio/mean": 0.9998971819877625, "sampling/importance_sampling_ratio/min": 0.6152651309967041, "sampling/sampling_logp_difference/max": 0.4857020378112793, "sampling/sampling_logp_difference/mean": 0.013453029096126556, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 367.53125, "completions/mean_terminated_length": 367.53125, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.47112035751342773, "epoch": 1.8100490196078431, "frac_reward_zero_std": 0.5, "grad_norm": 0.8161310798922562, "kl": 0.025713618844747543, "learning_rate": 4.096703606968006e-07, "loss": 0.0262, "num_tokens": 63753641.0, "reward": 0.71875, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.5150517225265503, "sampling/importance_sampling_ratio/mean": 0.99986332654953, "sampling/importance_sampling_ratio/min": 0.4302572011947632, "sampling/sampling_logp_difference/max": 0.843372106552124, "sampling/sampling_logp_difference/mean": 0.01419789344072342, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 334.1875, "completions/mean_terminated_length": 334.1875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.47190728783607483, "epoch": 1.8112745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 0.7022908278317226, "kl": 0.026220370084047318, "learning_rate": 4.0896979500218014e-07, "loss": 0.0351, "num_tokens": 63800533.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5071672201156616, "sampling/importance_sampling_ratio/mean": 1.0000697374343872, "sampling/importance_sampling_ratio/min": 0.6589009165763855, "sampling/sampling_logp_difference/max": 0.4171820878982544, "sampling/sampling_logp_difference/mean": 0.015379241667687893, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1181.0, "completions/max_terminated_length": 1181.0, "completions/mean_length": 387.625, "completions/mean_terminated_length": 387.625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.35084599256515503, "epoch": 1.8125, "frac_reward_zero_std": 0.75, "grad_norm": 0.5194761423131357, "kl": 0.02213214710354805, "learning_rate": 4.082694140931088e-07, "loss": 0.0406, "num_tokens": 63842941.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3773791790008545, "sampling/importance_sampling_ratio/mean": 0.9998571872711182, "sampling/importance_sampling_ratio/min": 0.6482262015342712, "sampling/sampling_logp_difference/max": 0.4335155487060547, "sampling/sampling_logp_difference/mean": 0.011044273152947426, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 260.015625, "completions/mean_terminated_length": 260.015625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.5030763745307922, "epoch": 1.8137254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 0.6549865276625486, "kl": 0.04667019844055176, "learning_rate": 4.0756921939131563e-07, "loss": 0.0009, "num_tokens": 63876766.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6273729801177979, "sampling/importance_sampling_ratio/mean": 0.9996160268783569, "sampling/importance_sampling_ratio/min": 0.7017082571983337, "sampling/sampling_logp_difference/max": 0.4869670867919922, "sampling/sampling_logp_difference/mean": 0.01598011702299118, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1314.0, "completions/max_terminated_length": 1314.0, "completions/mean_length": 373.5625, "completions/mean_terminated_length": 373.5625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.5863704681396484, "epoch": 1.8149509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 0.41913678379247904, "kl": 0.04974035918712616, "learning_rate": 4.0686921231815155e-07, "loss": 0.0009, "num_tokens": 63918738.0, "reward": -0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": -0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.3401433229446411, "sampling/importance_sampling_ratio/mean": 1.0000354051589966, "sampling/importance_sampling_ratio/min": 0.6939646005630493, "sampling/sampling_logp_difference/max": 0.36533427238464355, "sampling/sampling_logp_difference/mean": 0.01717384159564972, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 275.3125, "completions/mean_terminated_length": 275.3125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.4605656564235687, "epoch": 1.8161764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9029471842733106, "kl": 0.035593271255493164, "learning_rate": 4.0616939429458627e-07, "loss": 0.0176, "num_tokens": 63950694.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000359296798706, "sampling/importance_sampling_ratio/min": 0.6282638311386108, "sampling/sampling_logp_difference/max": 0.7275705337524414, "sampling/sampling_logp_difference/mean": 0.015647318214178085, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 309.140625, "completions/mean_terminated_length": 309.140625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.3572482168674469, "epoch": 1.8174019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.5888410413990484, "kl": 0.020720962435007095, "learning_rate": 4.0546976674120623e-07, "loss": -0.0042, "num_tokens": 63989327.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4061692953109741, "sampling/importance_sampling_ratio/mean": 0.9995362758636475, "sampling/importance_sampling_ratio/min": 0.6785000562667847, "sampling/sampling_logp_difference/max": 0.3878706693649292, "sampling/sampling_logp_difference/mean": 0.012953531928360462, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 345.78125, "completions/mean_terminated_length": 345.78125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.5540916919708252, "epoch": 1.8186274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 0.8222787532230127, "kl": 0.03411117568612099, "learning_rate": 4.047703310782111e-07, "loss": -0.0154, "num_tokens": 64034721.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.9113705158233643, "sampling/importance_sampling_ratio/mean": 0.9998827576637268, "sampling/importance_sampling_ratio/min": 0.5676478147506714, "sampling/sampling_logp_difference/max": 0.6478204727172852, "sampling/sampling_logp_difference/mean": 0.01832243800163269, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1459.0, "completions/max_terminated_length": 1459.0, "completions/mean_length": 363.234375, "completions/mean_terminated_length": 363.234375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.449552446603775, "epoch": 1.8198529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.020477932299061835, "kl": 0.025838978588581085, "learning_rate": 4.0407108872541105e-07, "loss": 0.0002, "num_tokens": 64079856.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6105364561080933, "sampling/importance_sampling_ratio/mean": 1.0001564025878906, "sampling/importance_sampling_ratio/min": 0.5397600531578064, "sampling/sampling_logp_difference/max": 0.6166305541992188, "sampling/sampling_logp_difference/mean": 0.013987569138407707, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 806.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 358.125, "completions/mean_terminated_length": 358.125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3964594602584839, "epoch": 1.821078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.5108529005720406, "kl": 0.020785309374332428, "learning_rate": 4.0337204110222347e-07, "loss": 0.0057, "num_tokens": 64122424.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.7287302017211914, "sampling/importance_sampling_ratio/mean": 1.000168800354004, "sampling/importance_sampling_ratio/min": 0.674311101436615, "sampling/sampling_logp_difference/max": 0.5473871231079102, "sampling/sampling_logp_difference/mean": 0.013034806586802006, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 950.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 320.71875, "completions/mean_terminated_length": 320.71875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.38601163029670715, "epoch": 1.8223039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 0.6642751147098532, "kl": 0.029340676963329315, "learning_rate": 4.0267318962767076e-07, "loss": -0.0028, "num_tokens": 64160646.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5778515338897705, "sampling/importance_sampling_ratio/mean": 0.9998488426208496, "sampling/importance_sampling_ratio/min": 0.6368674039840698, "sampling/sampling_logp_difference/max": 0.4560641050338745, "sampling/sampling_logp_difference/mean": 0.014232320711016655, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 201.109375, "completions/mean_terminated_length": 201.109375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3858603537082672, "epoch": 1.8235294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.029424234092153372, "kl": 0.03266284614801407, "learning_rate": 4.0197453572037747e-07, "loss": 0.0003, "num_tokens": 64191645.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4829317331314087, "sampling/importance_sampling_ratio/mean": 1.0003013610839844, "sampling/importance_sampling_ratio/min": 0.6622360348701477, "sampling/sampling_logp_difference/max": 0.41213321685791016, "sampling/sampling_logp_difference/mean": 0.01606486737728119, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 381.53125, "completions/mean_terminated_length": 381.53125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.5802179574966431, "epoch": 1.8247549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 0.9403187329362144, "kl": 0.029907554388046265, "learning_rate": 4.0127608079856644e-07, "loss": -0.0154, "num_tokens": 64231263.0, "reward": 0.0625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.456714391708374, "sampling/importance_sampling_ratio/mean": 0.999818742275238, "sampling/importance_sampling_ratio/min": 0.5702659487724304, "sampling/sampling_logp_difference/max": 0.561652421951294, "sampling/sampling_logp_difference/mean": 0.017131149768829346, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 309.734375, "completions/mean_terminated_length": 309.734375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.4066116213798523, "epoch": 1.8259803921568627, "frac_reward_zero_std": 1.0, "grad_norm": 0.0243366248666188, "kl": 0.0233931802213192, "learning_rate": 4.005778262800571e-07, "loss": 0.0002, "num_tokens": 64270926.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4615116119384766, "sampling/importance_sampling_ratio/mean": 1.000205159187317, "sampling/importance_sampling_ratio/min": 0.6867883205413818, "sampling/sampling_logp_difference/max": 0.3794713020324707, "sampling/sampling_logp_difference/mean": 0.014303816482424736, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 271.140625, "completions/mean_terminated_length": 271.140625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.41821399331092834, "epoch": 1.8272058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.030414449399371418, "kl": 0.02507947012782097, "learning_rate": 3.9987977358226175e-07, "loss": 0.0002, "num_tokens": 64308855.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000154972076416, "sampling/importance_sampling_ratio/min": 0.6428900957107544, "sampling/sampling_logp_difference/max": 1.4841299057006836, "sampling/sampling_logp_difference/mean": 0.01564151793718338, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 338.09375, "completions/mean_terminated_length": 338.09375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.5718681812286377, "epoch": 1.8284313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 0.521413430616571, "kl": 0.03400753438472748, "learning_rate": 3.991819241221835e-07, "loss": -0.0126, "num_tokens": 64362637.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.6620888710021973, "sampling/importance_sampling_ratio/mean": 0.9998092651367188, "sampling/importance_sampling_ratio/min": 0.6469231843948364, "sampling/sampling_logp_difference/max": 0.5080752372741699, "sampling/sampling_logp_difference/mean": 0.01669236458837986, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 327.640625, "completions/mean_terminated_length": 327.640625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.5943495035171509, "epoch": 1.829656862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.7064487772754672, "kl": 0.0353093259036541, "learning_rate": 3.98484279316412e-07, "loss": -0.0268, "num_tokens": 64405062.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.4377927780151367, "sampling/importance_sampling_ratio/mean": 1.0001267194747925, "sampling/importance_sampling_ratio/min": 0.5097342729568481, "sampling/sampling_logp_difference/max": 0.673865795135498, "sampling/sampling_logp_difference/mean": 0.0182956550270319, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 370.390625, "completions/mean_terminated_length": 370.390625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.3102118670940399, "epoch": 1.8308823529411766, "frac_reward_zero_std": 0.75, "grad_norm": 0.5090895914522031, "kl": 0.01900198683142662, "learning_rate": 3.977868405811223e-07, "loss": 0.0207, "num_tokens": 64444159.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.597716212272644, "sampling/importance_sampling_ratio/mean": 0.9999607801437378, "sampling/importance_sampling_ratio/min": 0.514190673828125, "sampling/sampling_logp_difference/max": 0.6651611328125, "sampling/sampling_logp_difference/mean": 0.012107525952160358, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1054.0, "completions/max_terminated_length": 1054.0, "completions/mean_length": 347.5, "completions/mean_terminated_length": 347.5, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.34786584973335266, "epoch": 1.8321078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.017516030394736194, "kl": 0.01955219730734825, "learning_rate": 3.970896093320708e-07, "loss": 0.0002, "num_tokens": 64483983.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.651611328125, "sampling/importance_sampling_ratio/mean": 1.0002906322479248, "sampling/importance_sampling_ratio/min": 0.712900698184967, "sampling/sampling_logp_difference/max": 0.5017514228820801, "sampling/sampling_logp_difference/mean": 0.01162596046924591, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 322.859375, "completions/mean_terminated_length": 322.859375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.4321971833705902, "epoch": 1.8333333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 0.6466631049698861, "kl": 0.023026997223496437, "learning_rate": 3.9639258698459287e-07, "loss": -0.006, "num_tokens": 64522406.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.9920326471328735, "sampling/importance_sampling_ratio/mean": 1.0000436305999756, "sampling/importance_sampling_ratio/min": 0.6066678762435913, "sampling/sampling_logp_difference/max": 0.6891555786132812, "sampling/sampling_logp_difference/mean": 0.013773245736956596, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 378.234375, "completions/mean_terminated_length": 378.234375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.2600848078727722, "epoch": 1.8345588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 0.4986788522412576, "kl": 0.01374959759414196, "learning_rate": 3.9569577495359964e-07, "loss": -0.0038, "num_tokens": 64565301.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001695156097412, "sampling/importance_sampling_ratio/min": 0.6771225333213806, "sampling/sampling_logp_difference/max": 0.8912527561187744, "sampling/sampling_logp_difference/mean": 0.009511817246675491, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 322.171875, "completions/mean_terminated_length": 322.171875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.543710470199585, "epoch": 1.8357843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 0.5991389052571922, "kl": 0.031009787693619728, "learning_rate": 3.949991746535753e-07, "loss": 0.0037, "num_tokens": 64603088.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.3304901123046875, "sampling/importance_sampling_ratio/mean": 0.9999856948852539, "sampling/importance_sampling_ratio/min": 0.6171379685401917, "sampling/sampling_logp_difference/max": 0.4826626777648926, "sampling/sampling_logp_difference/mean": 0.01687324047088623, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 307.421875, "completions/mean_terminated_length": 307.421875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.3400655686855316, "epoch": 1.8370098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 0.7209969939992316, "kl": 0.022697385400533676, "learning_rate": 3.943027874985746e-07, "loss": 0.0215, "num_tokens": 64643595.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5467345714569092, "sampling/importance_sampling_ratio/mean": 1.0001354217529297, "sampling/importance_sampling_ratio/min": 0.23693160712718964, "sampling/sampling_logp_difference/max": 1.4399837255477905, "sampling/sampling_logp_difference/mean": 0.01264106947928667, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 304.328125, "completions/mean_terminated_length": 304.328125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.44460180401802063, "epoch": 1.8382352941176472, "frac_reward_zero_std": 0.75, "grad_norm": 0.8967734940808789, "kl": 0.024024227634072304, "learning_rate": 3.9360661490221904e-07, "loss": -0.0448, "num_tokens": 64689152.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6318854093551636, "sampling/importance_sampling_ratio/mean": 1.0000442266464233, "sampling/importance_sampling_ratio/min": 0.4309428036212921, "sampling/sampling_logp_difference/max": 0.8417799472808838, "sampling/sampling_logp_difference/mean": 0.014663366600871086, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 369.65625, "completions/mean_terminated_length": 369.65625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.44211071729660034, "epoch": 1.8394607843137254, "frac_reward_zero_std": 0.5, "grad_norm": 0.6252514201655867, "kl": 0.033973127603530884, "learning_rate": 3.929106582776948e-07, "loss": -0.0195, "num_tokens": 64730154.0, "reward": 0.5, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8156204223632812, "sampling/importance_sampling_ratio/mean": 1.0000739097595215, "sampling/importance_sampling_ratio/min": 0.6316011548042297, "sampling/sampling_logp_difference/max": 0.5964272022247314, "sampling/sampling_logp_difference/mean": 0.01410391554236412, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 281.140625, "completions/mean_terminated_length": 281.140625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.6109508872032166, "epoch": 1.840686274509804, "frac_reward_zero_std": 0.25, "grad_norm": 1.2462282624589291, "kl": 0.04096907749772072, "learning_rate": 3.9221491903775013e-07, "loss": 0.0112, "num_tokens": 64770243.0, "reward": 0.3125, "reward_std": 0.617996096611023, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.4768089056015015, "sampling/importance_sampling_ratio/mean": 0.9996905326843262, "sampling/importance_sampling_ratio/min": 0.6880123019218445, "sampling/sampling_logp_difference/max": 0.3898836374282837, "sampling/sampling_logp_difference/mean": 0.0175531804561615, "step": 1502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 344.796875, "completions/mean_terminated_length": 344.796875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.49698638916015625, "epoch": 1.8419117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 0.9120676086937978, "kl": 0.03139358013868332, "learning_rate": 3.9151939859469166e-07, "loss": 0.0168, "num_tokens": 64811142.0, "reward": 0.09375, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.542270541191101, "sampling/importance_sampling_ratio/mean": 1.0000452995300293, "sampling/importance_sampling_ratio/min": 0.615062415599823, "sampling/sampling_logp_difference/max": 0.48603153228759766, "sampling/sampling_logp_difference/mean": 0.014631720259785652, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 712.0, "completions/max_terminated_length": 712.0, "completions/mean_length": 281.640625, "completions/mean_terminated_length": 281.640625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3859175741672516, "epoch": 1.843137254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.01836113444948338, "kl": 0.02175302803516388, "learning_rate": 3.908240983603813e-07, "loss": 0.0002, "num_tokens": 64848095.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.36582350730896, "sampling/importance_sampling_ratio/mean": 0.9998220801353455, "sampling/importance_sampling_ratio/min": 0.5146454572677612, "sampling/sampling_logp_difference/max": 0.6642770767211914, "sampling/sampling_logp_difference/mean": 0.013971068896353245, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 367.09375, "completions/mean_terminated_length": 367.09375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.4097132086753845, "epoch": 1.844362745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.5328498480488815, "kl": 0.021491950377821922, "learning_rate": 3.9012901974623476e-07, "loss": -0.0019, "num_tokens": 64886421.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4256913661956787, "sampling/importance_sampling_ratio/mean": 1.0003238916397095, "sampling/importance_sampling_ratio/min": 0.6925027370452881, "sampling/sampling_logp_difference/max": 0.3674430847167969, "sampling/sampling_logp_difference/mean": 0.013990867882966995, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 389.6875, "completions/mean_terminated_length": 389.6875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.4461861550807953, "epoch": 1.8455882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 0.500754225938282, "kl": 0.03228436037898064, "learning_rate": 3.894341641632176e-07, "loss": -0.0022, "num_tokens": 64934833.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.4358251094818115, "sampling/importance_sampling_ratio/mean": 1.0003340244293213, "sampling/importance_sampling_ratio/min": 0.5718725323677063, "sampling/sampling_logp_difference/max": 0.5588392019271851, "sampling/sampling_logp_difference/mean": 0.014530991204082966, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 916.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 374.84375, "completions/mean_terminated_length": 374.84375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.4261391758918762, "epoch": 1.846813725490196, "frac_reward_zero_std": 0.5, "grad_norm": 0.7772400483377903, "kl": 0.022157147526741028, "learning_rate": 3.8873953302184283e-07, "loss": 0.0007, "num_tokens": 64978727.0, "reward": 0.65625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.372717022895813, "sampling/importance_sampling_ratio/mean": 0.9998214244842529, "sampling/importance_sampling_ratio/min": 0.6112395524978638, "sampling/sampling_logp_difference/max": 0.4922664165496826, "sampling/sampling_logp_difference/mean": 0.0133878905326128, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 332.15625, "completions/mean_terminated_length": 332.15625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.47768649458885193, "epoch": 1.8480392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 0.6784592241340235, "kl": 0.027919333428144455, "learning_rate": 3.880451277321673e-07, "loss": -0.0086, "num_tokens": 65017777.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.4556149244308472, "sampling/importance_sampling_ratio/mean": 0.9998083710670471, "sampling/importance_sampling_ratio/min": 0.5685389041900635, "sampling/sampling_logp_difference/max": 0.564685583114624, "sampling/sampling_logp_difference/mean": 0.0151910949498415, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 241.296875, "completions/mean_terminated_length": 241.296875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.33824169635772705, "epoch": 1.8492647058823528, "frac_reward_zero_std": 1.0, "grad_norm": 0.02198751533632445, "kl": 0.021724972873926163, "learning_rate": 3.873509497037899e-07, "loss": 0.0002, "num_tokens": 65052580.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3672369718551636, "sampling/importance_sampling_ratio/mean": 0.9999223947525024, "sampling/importance_sampling_ratio/min": 0.477790892124176, "sampling/sampling_logp_difference/max": 0.7385821342468262, "sampling/sampling_logp_difference/mean": 0.012704752385616302, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 325.609375, "completions/mean_terminated_length": 325.609375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3706943392753601, "epoch": 1.8504901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.6003206714440048, "kl": 0.02249569073319435, "learning_rate": 3.8665700034584834e-07, "loss": 0.0018, "num_tokens": 65092763.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.2916862964630127, "sampling/importance_sampling_ratio/mean": 0.9996832013130188, "sampling/importance_sampling_ratio/min": 0.6229021549224854, "sampling/sampling_logp_difference/max": 0.47336578369140625, "sampling/sampling_logp_difference/mean": 0.012580755166709423, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 250.75, "completions/mean_terminated_length": 250.75, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.3597898483276367, "epoch": 1.8517156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.021264960537634534, "kl": 0.02741277776658535, "learning_rate": 3.8596328106701533e-07, "loss": 0.0002, "num_tokens": 65122779.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4469611644744873, "sampling/importance_sampling_ratio/mean": 0.9991456270217896, "sampling/importance_sampling_ratio/min": 0.6221746206283569, "sampling/sampling_logp_difference/max": 0.4745345115661621, "sampling/sampling_logp_difference/mean": 0.01401623897254467, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 285.84375, "completions/mean_terminated_length": 285.84375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.46444565057754517, "epoch": 1.8529411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 0.6845874418893534, "kl": 0.024557190015912056, "learning_rate": 3.8526979327549736e-07, "loss": 0.0245, "num_tokens": 65163825.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.821679949760437, "sampling/importance_sampling_ratio/mean": 0.9996267557144165, "sampling/importance_sampling_ratio/min": 0.7000468969345093, "sampling/sampling_logp_difference/max": 0.5997591018676758, "sampling/sampling_logp_difference/mean": 0.014882707968354225, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 773.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 285.5, "completions/mean_terminated_length": 285.5, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.46742087602615356, "epoch": 1.8541666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 0.7780298612525532, "kl": 0.0378393679857254, "learning_rate": 3.845765383790306e-07, "loss": -0.0164, "num_tokens": 65197985.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4804331064224243, "sampling/importance_sampling_ratio/mean": 1.0006639957427979, "sampling/importance_sampling_ratio/min": 0.5872159600257874, "sampling/sampling_logp_difference/max": 0.5323625802993774, "sampling/sampling_logp_difference/mean": 0.01672310382127762, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1207.0, "completions/max_terminated_length": 1207.0, "completions/mean_length": 331.875, "completions/mean_terminated_length": 331.875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.5499087572097778, "epoch": 1.8553921568627452, "frac_reward_zero_std": 0.5, "grad_norm": 0.8360120356664116, "kl": 0.0366726890206337, "learning_rate": 3.8388351778487875e-07, "loss": 0.0768, "num_tokens": 65238457.0, "reward": 0.3125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6115565299987793, "sampling/importance_sampling_ratio/mean": 0.9994481205940247, "sampling/importance_sampling_ratio/min": 0.49635234475135803, "sampling/sampling_logp_difference/max": 0.7004692554473877, "sampling/sampling_logp_difference/mean": 0.01687479019165039, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 261.0, "completions/mean_terminated_length": 261.0, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.4485795795917511, "epoch": 1.8566176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 0.6693820233349405, "kl": 0.0324535146355629, "learning_rate": 3.831907328998295e-07, "loss": 0.0064, "num_tokens": 65274729.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.4014109373092651, "sampling/importance_sampling_ratio/mean": 1.0001757144927979, "sampling/importance_sampling_ratio/min": 0.6384194493293762, "sampling/sampling_logp_difference/max": 0.4487597942352295, "sampling/sampling_logp_difference/mean": 0.01554932538419962, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 325.546875, "completions/mean_terminated_length": 325.546875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.45358067750930786, "epoch": 1.857843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.5723296664425801, "kl": 0.02980022504925728, "learning_rate": 3.824981851301924e-07, "loss": 0.0074, "num_tokens": 65312988.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.4530646800994873, "sampling/importance_sampling_ratio/mean": 0.9996035099029541, "sampling/importance_sampling_ratio/min": 0.6041304469108582, "sampling/sampling_logp_difference/max": 0.5039651393890381, "sampling/sampling_logp_difference/mean": 0.014469275251030922, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 311.15625, "completions/mean_terminated_length": 311.15625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.4594821035861969, "epoch": 1.8590686274509802, "frac_reward_zero_std": 0.75, "grad_norm": 0.5413207813278947, "kl": 0.034932881593704224, "learning_rate": 3.818058758817955e-07, "loss": 0.0072, "num_tokens": 65353302.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.3908708095550537, "sampling/importance_sampling_ratio/mean": 1.0000689029693604, "sampling/importance_sampling_ratio/min": 0.6455252766609192, "sampling/sampling_logp_difference/max": 0.43769097328186035, "sampling/sampling_logp_difference/mean": 0.01513918861746788, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1184.0, "completions/max_terminated_length": 1184.0, "completions/mean_length": 366.46875, "completions/mean_terminated_length": 366.46875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3963835835456848, "epoch": 1.8602941176470589, "frac_reward_zero_std": 0.75, "grad_norm": 0.6420104193105066, "kl": 0.021197611466050148, "learning_rate": 3.81113806559983e-07, "loss": 0.0011, "num_tokens": 65392564.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5218408107757568, "sampling/importance_sampling_ratio/mean": 1.0001585483551025, "sampling/importance_sampling_ratio/min": 0.7387720346450806, "sampling/sampling_logp_difference/max": 0.4199206829071045, "sampling/sampling_logp_difference/mean": 0.013024854473769665, "step": 1518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 252.078125, "completions/mean_terminated_length": 252.078125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.48703527450561523, "epoch": 1.8615196078431373, "frac_reward_zero_std": 0.5, "grad_norm": 1.0755617088078695, "kl": 0.0646328404545784, "learning_rate": 3.804219785696113e-07, "loss": 0.0561, "num_tokens": 65422825.0, "reward": -0.21875, "reward_std": 0.375, "rewards/decision_reward_func/mean": -0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.627528190612793, "sampling/importance_sampling_ratio/mean": 0.9994412660598755, "sampling/importance_sampling_ratio/min": 0.737466037273407, "sampling/sampling_logp_difference/max": 0.4870624542236328, "sampling/sampling_logp_difference/mean": 0.015874793753027916, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 247.890625, "completions/mean_terminated_length": 247.890625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.37826013565063477, "epoch": 1.8627450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.02322821986162134, "kl": 0.024111095815896988, "learning_rate": 3.797303933150475e-07, "loss": 0.0002, "num_tokens": 65452994.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003387928009033, "sampling/importance_sampling_ratio/min": 0.5702946782112122, "sampling/sampling_logp_difference/max": 0.7738308906555176, "sampling/sampling_logp_difference/mean": 0.0148411700502038, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1375.0, "completions/max_terminated_length": 1375.0, "completions/mean_length": 476.703125, "completions/mean_terminated_length": 476.703125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.3567119240760803, "epoch": 1.8639705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.7659865761040636, "kl": 0.016975311562418938, "learning_rate": 3.790390522001662e-07, "loss": -0.0608, "num_tokens": 65505247.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6109991073608398, "sampling/importance_sampling_ratio/mean": 0.999919056892395, "sampling/importance_sampling_ratio/min": 0.5382927060127258, "sampling/sampling_logp_difference/max": 0.6193528175354004, "sampling/sampling_logp_difference/mean": 0.011642403900623322, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 391.0625, "completions/mean_terminated_length": 391.0625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.3664676547050476, "epoch": 1.8651960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 0.4106102421037614, "kl": 0.02250988408923149, "learning_rate": 3.7834795662834566e-07, "loss": -0.0184, "num_tokens": 65547395.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6667660474777222, "sampling/importance_sampling_ratio/mean": 1.0002576112747192, "sampling/importance_sampling_ratio/min": 0.5575947165489197, "sampling/sampling_logp_difference/max": 0.584122896194458, "sampling/sampling_logp_difference/mean": 0.011728061363101006, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 854.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 461.96875, "completions/mean_terminated_length": 461.96875, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "entropy": 0.5975775122642517, "epoch": 1.866421568627451, "frac_reward_zero_std": 0.0, "grad_norm": 0.8970098079232318, "kl": 0.027259236201643944, "learning_rate": 3.776571080024663e-07, "loss": 0.0153, "num_tokens": 65600737.0, "reward": 0.375, "reward_std": 0.5915650129318237, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.5921275615692139, "sampling/importance_sampling_ratio/mean": 1.0000402927398682, "sampling/importance_sampling_ratio/min": 0.5815204977989197, "sampling/sampling_logp_difference/max": 0.5421090126037598, "sampling/sampling_logp_difference/mean": 0.016495104879140854, "step": 1523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 748.0, "completions/max_terminated_length": 748.0, "completions/mean_length": 388.484375, "completions/mean_terminated_length": 388.484375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "entropy": 0.3906000256538391, "epoch": 1.8676470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.7518983479949255, "kl": 0.016776077449321747, "learning_rate": 3.76966507724907e-07, "loss": 0.0133, "num_tokens": 65650064.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6782277822494507, "sampling/importance_sampling_ratio/mean": 0.9999362826347351, "sampling/importance_sampling_ratio/min": 0.49081817269325256, "sampling/sampling_logp_difference/max": 0.711681604385376, "sampling/sampling_logp_difference/mean": 0.011862728744745255, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1180.0, "completions/max_terminated_length": 1180.0, "completions/mean_length": 444.78125, "completions/mean_terminated_length": 444.78125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.6058897972106934, "epoch": 1.8688725490196079, "frac_reward_zero_std": 0.25, "grad_norm": 0.891835781429965, "kl": 0.023202767595648766, "learning_rate": 3.762761571975429e-07, "loss": -0.0313, "num_tokens": 65700754.0, "reward": 0.25, "reward_std": 0.551956295967102, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.4177573919296265, "sampling/importance_sampling_ratio/mean": 0.999906063079834, "sampling/importance_sampling_ratio/min": 0.6547579169273376, "sampling/sampling_logp_difference/max": 0.42348968982696533, "sampling/sampling_logp_difference/mean": 0.016766097396612167, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 317.140625, "completions/mean_terminated_length": 317.140625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.5390046238899231, "epoch": 1.8700980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.026814463086057214, "kl": 0.0370686873793602, "learning_rate": 3.755860578217413e-07, "loss": 0.0004, "num_tokens": 65740443.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4350756406784058, "sampling/importance_sampling_ratio/mean": 0.9998720288276672, "sampling/importance_sampling_ratio/min": 0.6185767650604248, "sampling/sampling_logp_difference/max": 0.4803340435028076, "sampling/sampling_logp_difference/mean": 0.016210725530982018, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 305.15625, "completions/mean_terminated_length": 305.15625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.41533464193344116, "epoch": 1.8713235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.018440516642383876, "kl": 0.02354533225297928, "learning_rate": 3.7489621099836043e-07, "loss": 0.0002, "num_tokens": 65776549.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6278331279754639, "sampling/importance_sampling_ratio/mean": 0.9997698664665222, "sampling/importance_sampling_ratio/min": 0.6202821731567383, "sampling/sampling_logp_difference/max": 0.4872497320175171, "sampling/sampling_logp_difference/mean": 0.01415947824716568, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 806.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 356.453125, "completions/mean_terminated_length": 356.453125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.4687016010284424, "epoch": 1.8725490196078431, "frac_reward_zero_std": 0.5, "grad_norm": 0.8815427241027213, "kl": 0.02266014739871025, "learning_rate": 3.742066181277457e-07, "loss": -0.0113, "num_tokens": 65820466.0, "reward": 0.21875, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.3607841730117798, "sampling/importance_sampling_ratio/mean": 0.999950647354126, "sampling/importance_sampling_ratio/min": 0.38155654072761536, "sampling/sampling_logp_difference/max": 0.963496208190918, "sampling/sampling_logp_difference/mean": 0.01395792979747057, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 892.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 380.890625, "completions/mean_terminated_length": 380.890625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.49943292140960693, "epoch": 1.8737745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 0.7306288394840055, "kl": 0.031057991087436676, "learning_rate": 3.735172806097271e-07, "loss": -0.0439, "num_tokens": 65865259.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999140501022339, "sampling/importance_sampling_ratio/min": 0.6185239553451538, "sampling/sampling_logp_difference/max": 0.8396368026733398, "sampling/sampling_logp_difference/mean": 0.015057574957609177, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 334.28125, "completions/mean_terminated_length": 334.28125, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.45148006081581116, "epoch": 1.875, "frac_reward_zero_std": 0.25, "grad_norm": 1.0870725428474466, "kl": 0.022799041122198105, "learning_rate": 3.7282819984361577e-07, "loss": 0.0431, "num_tokens": 65904509.0, "reward": 0.4375, "reward_std": 0.4973389506340027, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.578289270401001, "sampling/importance_sampling_ratio/mean": 1.0001311302185059, "sampling/importance_sampling_ratio/min": 0.6056728363037109, "sampling/sampling_logp_difference/max": 0.5014152526855469, "sampling/sampling_logp_difference/mean": 0.014032949693500996, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 298.5, "completions/mean_terminated_length": 298.5, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.44092226028442383, "epoch": 1.8762254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.01676549726569665, "kl": 0.03120243549346924, "learning_rate": 3.721393772282022e-07, "loss": 0.0003, "num_tokens": 65939533.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7304856777191162, "sampling/importance_sampling_ratio/mean": 1.000596046447754, "sampling/importance_sampling_ratio/min": 0.5292283296585083, "sampling/sampling_logp_difference/max": 0.6363353729248047, "sampling/sampling_logp_difference/mean": 0.015517350286245346, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 294.796875, "completions/mean_terminated_length": 294.796875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.4577953815460205, "epoch": 1.8774509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 0.5691425511102792, "kl": 0.024970009922981262, "learning_rate": 3.7145081416175264e-07, "loss": -0.0113, "num_tokens": 65976096.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.398195743560791, "sampling/importance_sampling_ratio/mean": 1.0001437664031982, "sampling/importance_sampling_ratio/min": 0.6771517395973206, "sampling/sampling_logp_difference/max": 0.3898599147796631, "sampling/sampling_logp_difference/mean": 0.014154917560517788, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 361.1875, "completions/mean_terminated_length": 361.1875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.542712390422821, "epoch": 1.8786764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.7527386833910014, "kl": 0.03992345929145813, "learning_rate": 3.7076251204200667e-07, "loss": -0.055, "num_tokens": 66017948.0, "reward": 0.71875, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.4359116554260254, "sampling/importance_sampling_ratio/mean": 0.9995806217193604, "sampling/importance_sampling_ratio/min": 0.6363176703453064, "sampling/sampling_logp_difference/max": 0.4520573616027832, "sampling/sampling_logp_difference/mean": 0.01579592190682888, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1277.0, "completions/max_terminated_length": 1277.0, "completions/mean_length": 288.984375, "completions/mean_terminated_length": 288.984375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.47241756319999695, "epoch": 1.8799019607843137, "frac_reward_zero_std": 0.25, "grad_norm": 1.3614694420431652, "kl": 0.029309051111340523, "learning_rate": 3.700744722661736e-07, "loss": -0.1743, "num_tokens": 66050491.0, "reward": 0.3125, "reward_std": 0.5, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.3691718578338623, "sampling/importance_sampling_ratio/mean": 0.9999727606773376, "sampling/importance_sampling_ratio/min": 0.648881733417511, "sampling/sampling_logp_difference/max": 0.4325047731399536, "sampling/sampling_logp_difference/mean": 0.015269131399691105, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 804.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 293.71875, "completions/mean_terminated_length": 293.71875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.48007288575172424, "epoch": 1.8811274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 0.812847167426514, "kl": 0.028727896511554718, "learning_rate": 3.693866962309308e-07, "loss": 0.0059, "num_tokens": 66089753.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.4125014543533325, "sampling/importance_sampling_ratio/mean": 0.9999750852584839, "sampling/importance_sampling_ratio/min": 0.7129489183425903, "sampling/sampling_logp_difference/max": 0.34536218643188477, "sampling/sampling_logp_difference/mean": 0.016187049448490143, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 264.09375, "completions/mean_terminated_length": 264.09375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.6087918877601624, "epoch": 1.8823529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.02663812320749791, "kl": 0.03959087282419205, "learning_rate": 3.686991853324202e-07, "loss": 0.0004, "num_tokens": 66125871.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.405287504196167, "sampling/importance_sampling_ratio/mean": 1.00020432472229, "sampling/importance_sampling_ratio/min": 0.595475435256958, "sampling/sampling_logp_difference/max": 0.518395185470581, "sampling/sampling_logp_difference/mean": 0.018892405554652214, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 880.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 359.546875, "completions/mean_terminated_length": 359.546875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.3752419948577881, "epoch": 1.883578431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.01176491451882929, "kl": 0.016916194930672646, "learning_rate": 3.680119409662451e-07, "loss": 0.0002, "num_tokens": 66165682.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5543614625930786, "sampling/importance_sampling_ratio/mean": 0.999843418598175, "sampling/importance_sampling_ratio/min": 0.5154072046279907, "sampling/sampling_logp_difference/max": 0.6627979278564453, "sampling/sampling_logp_difference/mean": 0.013117628172039986, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1154.0, "completions/max_terminated_length": 1154.0, "completions/mean_length": 369.96875, "completions/mean_terminated_length": 369.96875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.5132474899291992, "epoch": 1.8848039215686274, "frac_reward_zero_std": 0.5, "grad_norm": 0.831780713736114, "kl": 0.02308112010359764, "learning_rate": 3.673249645274682e-07, "loss": -0.0767, "num_tokens": 66207696.0, "reward": 0.21875, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.513954520225525, "sampling/importance_sampling_ratio/mean": 1.0001444816589355, "sampling/importance_sampling_ratio/min": 0.5910457968711853, "sampling/sampling_logp_difference/max": 0.5258617401123047, "sampling/sampling_logp_difference/mean": 0.015835296362638474, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 299.59375, "completions/mean_terminated_length": 299.59375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.5491599440574646, "epoch": 1.8860294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.027015671169935, "kl": 0.031407296657562256, "learning_rate": 3.6663825741060805e-07, "loss": 0.0077, "num_tokens": 66246950.0, "reward": 0.625, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.4217299222946167, "sampling/importance_sampling_ratio/mean": 1.0002086162567139, "sampling/importance_sampling_ratio/min": 0.633480966091156, "sampling/sampling_logp_difference/max": 0.4565253257751465, "sampling/sampling_logp_difference/mean": 0.017468318343162537, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 391.90625, "completions/mean_terminated_length": 391.90625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.48969724774360657, "epoch": 1.8872549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 0.7610193293371161, "kl": 0.021984387189149857, "learning_rate": 3.6595182100963686e-07, "loss": -0.0075, "num_tokens": 66287712.0, "reward": 0.5, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5744869709014893, "sampling/importance_sampling_ratio/mean": 1.0001637935638428, "sampling/importance_sampling_ratio/min": 0.6813758015632629, "sampling/sampling_logp_difference/max": 0.45392942428588867, "sampling/sampling_logp_difference/mean": 0.014237409457564354, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1968.0, "completions/max_terminated_length": 1968.0, "completions/mean_length": 437.6875, "completions/mean_terminated_length": 437.6875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.5747129321098328, "epoch": 1.8884803921568627, "frac_reward_zero_std": 0.25, "grad_norm": 0.820854544785157, "kl": 0.03117332234978676, "learning_rate": 3.652656567179765e-07, "loss": 0.0446, "num_tokens": 66331196.0, "reward": 0.28125, "reward_std": 0.565913200378418, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001226663589478, "sampling/importance_sampling_ratio/min": 0.6176879405975342, "sampling/sampling_logp_difference/max": 0.7256312370300293, "sampling/sampling_logp_difference/mean": 0.016707608476281166, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1234.0, "completions/max_terminated_length": 1234.0, "completions/mean_length": 377.265625, "completions/mean_terminated_length": 377.265625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.5074349045753479, "epoch": 1.8897058823529411, "frac_reward_zero_std": 0.75, "grad_norm": 0.43189457749051674, "kl": 0.032593876123428345, "learning_rate": 3.645797659284975e-07, "loss": -0.0053, "num_tokens": 66370029.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.3266682624816895, "sampling/importance_sampling_ratio/mean": 0.9996873140335083, "sampling/importance_sampling_ratio/min": 0.6176496744155884, "sampling/sampling_logp_difference/max": 0.48183393478393555, "sampling/sampling_logp_difference/mean": 0.015547238290309906, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1071.0, "completions/max_terminated_length": 1071.0, "completions/mean_length": 378.0625, "completions/mean_terminated_length": 378.0625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.597244381904602, "epoch": 1.8909313725490198, "frac_reward_zero_std": 0.25, "grad_norm": 1.1690750659925528, "kl": 0.03488236665725708, "learning_rate": 3.638941500335144e-07, "loss": 0.1102, "num_tokens": 66411937.0, "reward": 0.5625, "reward_std": 0.6047805547714233, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6894258260726929, "sampling/importance_sampling_ratio/mean": 1.0000052452087402, "sampling/importance_sampling_ratio/min": 0.604245126247406, "sampling/sampling_logp_difference/max": 0.5243887901306152, "sampling/sampling_logp_difference/mean": 0.01662249118089676, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 807.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 365.890625, "completions/mean_terminated_length": 365.890625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.6019363403320312, "epoch": 1.892156862745098, "frac_reward_zero_std": 0.25, "grad_norm": 1.0181314301136477, "kl": 0.04483054578304291, "learning_rate": 3.6320881042478433e-07, "loss": 0.0434, "num_tokens": 66451482.0, "reward": 0.78125, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000147819519043, "sampling/importance_sampling_ratio/min": 0.620394766330719, "sampling/sampling_logp_difference/max": 0.8588744401931763, "sampling/sampling_logp_difference/mean": 0.018428482115268707, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 289.3125, "completions/mean_terminated_length": 289.3125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.42463430762290955, "epoch": 1.8933823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 1.1078025468503963, "kl": 0.03333812952041626, "learning_rate": 3.6252374849350303e-07, "loss": 0.0112, "num_tokens": 66488478.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6706188917160034, "sampling/importance_sampling_ratio/mean": 0.9993360042572021, "sampling/importance_sampling_ratio/min": 0.5709596276283264, "sampling/sampling_logp_difference/max": 0.5604367256164551, "sampling/sampling_logp_difference/mean": 0.014651347883045673, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1518.0, "completions/max_terminated_length": 1518.0, "completions/mean_length": 562.015625, "completions/mean_terminated_length": 562.015625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.3440740406513214, "epoch": 1.8946078431372548, "frac_reward_zero_std": 0.75, "grad_norm": 0.49164500678204315, "kl": 0.019372235983610153, "learning_rate": 3.618389656303029e-07, "loss": -0.052, "num_tokens": 66543247.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000584125518799, "sampling/importance_sampling_ratio/min": 0.2926718294620514, "sampling/sampling_logp_difference/max": 1.2287033796310425, "sampling/sampling_logp_difference/mean": 0.010481652803719044, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 970.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 372.890625, "completions/mean_terminated_length": 372.890625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.4993938207626343, "epoch": 1.8958333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 0.7320082912090926, "kl": 0.034821413457393646, "learning_rate": 3.6115446322525e-07, "loss": 0.0491, "num_tokens": 66588648.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4284954071044922, "sampling/importance_sampling_ratio/mean": 0.9998583793640137, "sampling/importance_sampling_ratio/min": 0.6623312830924988, "sampling/sampling_logp_difference/max": 0.4119894504547119, "sampling/sampling_logp_difference/mean": 0.015451380982995033, "step": 1547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1186.0, "completions/max_terminated_length": 1186.0, "completions/mean_length": 589.78125, "completions/mean_terminated_length": 589.78125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.415791392326355, "epoch": 1.8970588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.9048211103985162, "kl": 0.017944112420082092, "learning_rate": 3.6047024266784035e-07, "loss": -0.0476, "num_tokens": 66656202.0, "reward": 0.71875, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.8705092668533325, "sampling/importance_sampling_ratio/mean": 1.0002036094665527, "sampling/importance_sampling_ratio/min": 0.5320916771888733, "sampling/sampling_logp_difference/max": 0.6309394836425781, "sampling/sampling_logp_difference/mean": 0.01256504189223051, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1113.0, "completions/max_terminated_length": 1113.0, "completions/mean_length": 375.9375, "completions/mean_terminated_length": 375.9375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.519862949848175, "epoch": 1.8982843137254903, "frac_reward_zero_std": 0.5, "grad_norm": 0.9497002682296182, "kl": 0.02611318603157997, "learning_rate": 3.5978630534699865e-07, "loss": -0.0724, "num_tokens": 66701110.0, "reward": 0.15625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.623932957649231, "sampling/importance_sampling_ratio/mean": 1.000152587890625, "sampling/importance_sampling_ratio/min": 0.6182020902633667, "sampling/sampling_logp_difference/max": 0.4848508834838867, "sampling/sampling_logp_difference/mean": 0.015475722029805183, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 388.515625, "completions/mean_terminated_length": 388.515625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.42366838455200195, "epoch": 1.8995098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 0.6092545082461401, "kl": 0.019141484051942825, "learning_rate": 3.591026526510742e-07, "loss": 0.075, "num_tokens": 66747127.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.429050087928772, "sampling/importance_sampling_ratio/mean": 1.0000848770141602, "sampling/importance_sampling_ratio/min": 0.6409159898757935, "sampling/sampling_logp_difference/max": 0.4448568820953369, "sampling/sampling_logp_difference/mean": 0.01266468781977892, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 371.875, "completions/mean_terminated_length": 371.875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.5431666970252991, "epoch": 1.9007352941176472, "frac_reward_zero_std": 0.75, "grad_norm": 0.6761812788230899, "kl": 0.02258843183517456, "learning_rate": 3.584192859678391e-07, "loss": -0.0249, "num_tokens": 66788175.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.540204644203186, "sampling/importance_sampling_ratio/mean": 0.999509871006012, "sampling/importance_sampling_ratio/min": 0.6347807049751282, "sampling/sampling_logp_difference/max": 0.45447564125061035, "sampling/sampling_logp_difference/mean": 0.0160185769200325, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 396.53125, "completions/mean_terminated_length": 396.53125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.6033071279525757, "epoch": 1.9019607843137254, "frac_reward_zero_std": 0.5, "grad_norm": 0.8565434374863017, "kl": 0.023424874991178513, "learning_rate": 3.577362066844838e-07, "loss": 0.0244, "num_tokens": 66831665.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.3632768392562866, "sampling/importance_sampling_ratio/mean": 1.0000090599060059, "sampling/importance_sampling_ratio/min": 0.5572277903556824, "sampling/sampling_logp_difference/max": 0.5847811698913574, "sampling/sampling_logp_difference/mean": 0.017070095986127853, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 272.03125, "completions/mean_terminated_length": 272.03125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.49147236347198486, "epoch": 1.903186274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.8161143691757499, "kl": 0.024015244096517563, "learning_rate": 3.570534161876163e-07, "loss": 0.0142, "num_tokens": 66863827.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.8980796337127686, "sampling/importance_sampling_ratio/mean": 0.9999273419380188, "sampling/importance_sampling_ratio/min": 0.6839730739593506, "sampling/sampling_logp_difference/max": 0.6408426761627197, "sampling/sampling_logp_difference/mean": 0.01677417755126953, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1547.0, "completions/max_terminated_length": 1547.0, "completions/mean_length": 434.484375, "completions/mean_terminated_length": 434.484375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.42723697423934937, "epoch": 1.9044117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 0.8296465664732117, "kl": 0.016663361340761185, "learning_rate": 3.5637091586325796e-07, "loss": 0.0915, "num_tokens": 66914530.0, "reward": 0.375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994466304779053, "sampling/importance_sampling_ratio/min": 0.6059614419937134, "sampling/sampling_logp_difference/max": 0.7009563446044922, "sampling/sampling_logp_difference/mean": 0.013442575931549072, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 400.859375, "completions/mean_terminated_length": 400.859375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.4356551170349121, "epoch": 1.905637254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.5635298370942454, "kl": 0.019770191982388496, "learning_rate": 3.556887070968414e-07, "loss": 0.022, "num_tokens": 66958457.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4754341840744019, "sampling/importance_sampling_ratio/mean": 1.0001157522201538, "sampling/importance_sampling_ratio/min": 0.6107317209243774, "sampling/sampling_logp_difference/max": 0.49309754371643066, "sampling/sampling_logp_difference/mean": 0.013602351769804955, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1130.0, "completions/max_terminated_length": 1130.0, "completions/mean_length": 420.765625, "completions/mean_terminated_length": 420.765625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.4157881736755371, "epoch": 1.906862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.011903710835643311, "kl": 0.024433039128780365, "learning_rate": 3.550067912732069e-07, "loss": 0.0002, "num_tokens": 67006026.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998705387115479, "sampling/importance_sampling_ratio/min": 0.685492753982544, "sampling/sampling_logp_difference/max": 0.7696008682250977, "sampling/sampling_logp_difference/mean": 0.01347926165908575, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 902.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 414.875, "completions/mean_terminated_length": 414.875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "entropy": 0.4550684690475464, "epoch": 1.9080882352941178, "frac_reward_zero_std": 0.5, "grad_norm": 0.7500340120594771, "kl": 0.018677614629268646, "learning_rate": 3.5432516977660054e-07, "loss": 0.0187, "num_tokens": 67050082.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5310187339782715, "sampling/importance_sampling_ratio/mean": 0.9998316764831543, "sampling/importance_sampling_ratio/min": 0.5752145648002625, "sampling/sampling_logp_difference/max": 0.5530121326446533, "sampling/sampling_logp_difference/mean": 0.01273578405380249, "step": 1557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 316.4375, "completions/mean_terminated_length": 316.4375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.35969728231430054, "epoch": 1.909313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.010717952093215936, "kl": 0.01637854054570198, "learning_rate": 3.5364384399067094e-07, "loss": 0.0002, "num_tokens": 67088078.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.491614580154419, "sampling/importance_sampling_ratio/mean": 1.0003862380981445, "sampling/importance_sampling_ratio/min": 0.6468147039413452, "sampling/sampling_logp_difference/max": 0.4356954097747803, "sampling/sampling_logp_difference/mean": 0.012476343661546707, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 400.359375, "completions/mean_terminated_length": 400.359375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.6237326264381409, "epoch": 1.9105392156862746, "frac_reward_zero_std": 0.5, "grad_norm": 0.9476320631526008, "kl": 0.031626634299755096, "learning_rate": 3.5296281529846593e-07, "loss": 0.0215, "num_tokens": 67143973.0, "reward": 0.375, "reward_std": 0.5, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.7020442485809326, "sampling/importance_sampling_ratio/mean": 1.000013828277588, "sampling/importance_sampling_ratio/min": 0.5342493653297424, "sampling/sampling_logp_difference/max": 0.6268925666809082, "sampling/sampling_logp_difference/mean": 0.017749615013599396, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1440.0, "completions/max_terminated_length": 1440.0, "completions/mean_length": 393.5, "completions/mean_terminated_length": 393.5, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.4458427429199219, "epoch": 1.9117647058823528, "frac_reward_zero_std": 0.5, "grad_norm": 0.9848557522743231, "kl": 0.023088373243808746, "learning_rate": 3.5228208508243073e-07, "loss": -0.0787, "num_tokens": 67183557.0, "reward": -0.0625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997903108596802, "sampling/importance_sampling_ratio/min": 0.02649843692779541, "sampling/sampling_logp_difference/max": 3.630669593811035, "sampling/sampling_logp_difference/mean": 0.014150760136544704, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 939.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 408.890625, "completions/mean_terminated_length": 408.890625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.36420008540153503, "epoch": 1.9129901960784315, "frac_reward_zero_std": 0.5, "grad_norm": 0.7166721311415079, "kl": 0.022832166403532028, "learning_rate": 3.5160165472440467e-07, "loss": 0.0388, "num_tokens": 67227694.0, "reward": 0.78125, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5757535696029663, "sampling/importance_sampling_ratio/mean": 1.0002163648605347, "sampling/importance_sampling_ratio/min": 0.6297387480735779, "sampling/sampling_logp_difference/max": 0.4624502658843994, "sampling/sampling_logp_difference/mean": 0.011923748068511486, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1120.0, "completions/max_terminated_length": 1120.0, "completions/mean_length": 370.515625, "completions/mean_terminated_length": 370.515625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.5383903980255127, "epoch": 1.9142156862745097, "frac_reward_zero_std": 0.25, "grad_norm": 1.0808938432419246, "kl": 0.027013270184397697, "learning_rate": 3.509215256056183e-07, "loss": 0.034, "num_tokens": 67269231.0, "reward": 0.71875, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.498481273651123, "sampling/importance_sampling_ratio/mean": 0.9998981952667236, "sampling/importance_sampling_ratio/min": 0.6774193048477173, "sampling/sampling_logp_difference/max": 0.4044520854949951, "sampling/sampling_logp_difference/mean": 0.015632934868335724, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 302.015625, "completions/mean_terminated_length": 302.015625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.5208964347839355, "epoch": 1.9154411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.0331465390272978, "kl": 0.02481227181851864, "learning_rate": 3.502416991066904e-07, "loss": 0.0233, "num_tokens": 67305184.0, "reward": 0.84375, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.3855758905410767, "sampling/importance_sampling_ratio/mean": 0.9998295903205872, "sampling/importance_sampling_ratio/min": 0.7142961621284485, "sampling/sampling_logp_difference/max": 0.33645761013031006, "sampling/sampling_logp_difference/mean": 0.015253245830535889, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 432.71875, "completions/mean_terminated_length": 432.71875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.472014844417572, "epoch": 1.9166666666666665, "frac_reward_zero_std": 0.5, "grad_norm": 0.8225141045680074, "kl": 0.019638419151306152, "learning_rate": 3.495621766076259e-07, "loss": 0.0313, "num_tokens": 67351246.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6012108325958252, "sampling/importance_sampling_ratio/mean": 0.9997761249542236, "sampling/importance_sampling_ratio/min": 0.7095584273338318, "sampling/sampling_logp_difference/max": 0.4707601070404053, "sampling/sampling_logp_difference/mean": 0.013624006882309914, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 321.75, "completions/mean_terminated_length": 321.75, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.4404603838920593, "epoch": 1.9178921568627452, "frac_reward_zero_std": 0.75, "grad_norm": 0.5465033535332903, "kl": 0.03408222645521164, "learning_rate": 3.488829594878123e-07, "loss": 0.0002, "num_tokens": 67389246.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.613905906677246, "sampling/importance_sampling_ratio/mean": 0.9996048212051392, "sampling/importance_sampling_ratio/min": 0.6202699542045593, "sampling/sampling_logp_difference/max": 0.47865724563598633, "sampling/sampling_logp_difference/mean": 0.01471884548664093, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 926.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 356.296875, "completions/mean_terminated_length": 356.296875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.5271302461624146, "epoch": 1.9191176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 0.7901932383562319, "kl": 0.03357952833175659, "learning_rate": 3.4820404912601757e-07, "loss": 0.0377, "num_tokens": 67433601.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5786494016647339, "sampling/importance_sampling_ratio/mean": 0.9995280504226685, "sampling/importance_sampling_ratio/min": 0.456936776638031, "sampling/sampling_logp_difference/max": 0.783210277557373, "sampling/sampling_logp_difference/mean": 0.017176080495119095, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 321.390625, "completions/mean_terminated_length": 321.390625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.35988983511924744, "epoch": 1.920343137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.013637704222192413, "kl": 0.017907753586769104, "learning_rate": 3.4752544690038643e-07, "loss": 0.0002, "num_tokens": 67470666.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5235122442245483, "sampling/importance_sampling_ratio/mean": 0.9998645782470703, "sampling/importance_sampling_ratio/min": 0.7052757740020752, "sampling/sampling_logp_difference/max": 0.4210183620452881, "sampling/sampling_logp_difference/mean": 0.011900190263986588, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 375.65625, "completions/mean_terminated_length": 375.65625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.6202492713928223, "epoch": 1.9215686274509802, "frac_reward_zero_std": 0.25, "grad_norm": 1.0718918263566821, "kl": 0.042236290872097015, "learning_rate": 3.468471541884385e-07, "loss": -0.0335, "num_tokens": 67509476.0, "reward": -0.5625, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": -0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6066476106643677, "sampling/importance_sampling_ratio/mean": 1.0003832578659058, "sampling/importance_sampling_ratio/min": 0.5592910051345825, "sampling/sampling_logp_difference/max": 0.5810853242874146, "sampling/sampling_logp_difference/mean": 0.01776529848575592, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 332.28125, "completions/mean_terminated_length": 332.28125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.4865238666534424, "epoch": 1.9227941176470589, "frac_reward_zero_std": 0.75, "grad_norm": 0.5612391060198201, "kl": 0.026936842128634453, "learning_rate": 3.461691723670651e-07, "loss": -0.0024, "num_tokens": 67546758.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5438671112060547, "sampling/importance_sampling_ratio/mean": 1.000169038772583, "sampling/importance_sampling_ratio/min": 0.636954128742218, "sampling/sampling_logp_difference/max": 0.45105767250061035, "sampling/sampling_logp_difference/mean": 0.015116903930902481, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 238.8125, "completions/mean_terminated_length": 238.8125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.442634642124176, "epoch": 1.9240196078431373, "frac_reward_zero_std": 1.0, "grad_norm": 0.02019782104102133, "kl": 0.02362716943025589, "learning_rate": 3.454915028125263e-07, "loss": 0.0002, "num_tokens": 67579690.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4058055877685547, "sampling/importance_sampling_ratio/mean": 1.0000338554382324, "sampling/importance_sampling_ratio/min": 0.6833733320236206, "sampling/sampling_logp_difference/max": 0.38071393966674805, "sampling/sampling_logp_difference/mean": 0.014816053211688995, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 807.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 322.453125, "completions/mean_terminated_length": 322.453125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.5353413820266724, "epoch": 1.9252450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.013390035799623732, "kl": 0.019472569227218628, "learning_rate": 3.4481414690044836e-07, "loss": 0.0002, "num_tokens": 67620807.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4929912090301514, "sampling/importance_sampling_ratio/mean": 1.0003468990325928, "sampling/importance_sampling_ratio/min": 0.6402289271354675, "sampling/sampling_logp_difference/max": 0.44592952728271484, "sampling/sampling_logp_difference/mean": 0.016833962872624397, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 276.921875, "completions/mean_terminated_length": 276.921875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.3989385962486267, "epoch": 1.9264705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.022079231664808706, "kl": 0.025833487510681152, "learning_rate": 3.441371060058209e-07, "loss": 0.0003, "num_tokens": 67657010.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.516563892364502, "sampling/importance_sampling_ratio/mean": 0.9998262524604797, "sampling/importance_sampling_ratio/min": 0.617525041103363, "sampling/sampling_logp_difference/max": 0.48203563690185547, "sampling/sampling_logp_difference/mean": 0.014172088354825974, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 250.96875, "completions/mean_terminated_length": 250.96875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.47997766733169556, "epoch": 1.9276960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.015334291853474694, "kl": 0.02260681800544262, "learning_rate": 3.4346038150299425e-07, "loss": 0.0002, "num_tokens": 67686064.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4980757236480713, "sampling/importance_sampling_ratio/mean": 0.9997596740722656, "sampling/importance_sampling_ratio/min": 0.648613452911377, "sampling/sampling_logp_difference/max": 0.4329183101654053, "sampling/sampling_logp_difference/mean": 0.016024520620703697, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 262.578125, "completions/mean_terminated_length": 262.578125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.4883583188056946, "epoch": 1.928921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.019124740167311968, "kl": 0.027463238686323166, "learning_rate": 3.427839747656758e-07, "loss": 0.0003, "num_tokens": 67722485.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6125600337982178, "sampling/importance_sampling_ratio/mean": 1.0004878044128418, "sampling/importance_sampling_ratio/min": 0.6394203305244446, "sampling/sampling_logp_difference/max": 0.47782301902770996, "sampling/sampling_logp_difference/mean": 0.015745999291539192, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 264.828125, "completions/mean_terminated_length": 264.828125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.4569088816642761, "epoch": 1.9301470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.0252004260210374, "kl": 0.023304447531700134, "learning_rate": 3.4210788716692875e-07, "loss": -0.028, "num_tokens": 67755274.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.5714406967163086, "sampling/importance_sampling_ratio/mean": 0.9997819066047668, "sampling/importance_sampling_ratio/min": 0.4734843969345093, "sampling/sampling_logp_difference/max": 0.7476363182067871, "sampling/sampling_logp_difference/mean": 0.013574411161243916, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 328.875, "completions/mean_terminated_length": 328.875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.6367250680923462, "epoch": 1.9313725490196079, "frac_reward_zero_std": 0.75, "grad_norm": 0.5627397637975498, "kl": 0.03249683231115341, "learning_rate": 3.414321200791679e-07, "loss": 0.0033, "num_tokens": 67800818.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.6006616353988647, "sampling/importance_sampling_ratio/mean": 1.000342845916748, "sampling/importance_sampling_ratio/min": 0.4265812039375305, "sampling/sampling_logp_difference/max": 0.8519525527954102, "sampling/sampling_logp_difference/mean": 0.018196657299995422, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 300.296875, "completions/mean_terminated_length": 300.296875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.5424859523773193, "epoch": 1.9325980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.01864396165321054, "kl": 0.023455757647752762, "learning_rate": 3.4075667487415785e-07, "loss": 0.0002, "num_tokens": 67840869.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6096258163452148, "sampling/importance_sampling_ratio/mean": 1.0003740787506104, "sampling/importance_sampling_ratio/min": 0.5586931109428406, "sampling/sampling_logp_difference/max": 0.5821549892425537, "sampling/sampling_logp_difference/mean": 0.01746327057480812, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 930.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 364.953125, "completions/mean_terminated_length": 364.953125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.6064436435699463, "epoch": 1.9338235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.5687857959707722, "kl": 0.03444577008485794, "learning_rate": 3.4008155292300934e-07, "loss": -0.0178, "num_tokens": 67879058.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4560539722442627, "sampling/importance_sampling_ratio/mean": 0.9994417428970337, "sampling/importance_sampling_ratio/min": 0.6737205982208252, "sampling/sampling_logp_difference/max": 0.3949397802352905, "sampling/sampling_logp_difference/mean": 0.016656391322612762, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 892.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 383.515625, "completions/mean_terminated_length": 383.515625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.45167067646980286, "epoch": 1.9350490196078431, "frac_reward_zero_std": 0.5, "grad_norm": 0.8043046771184399, "kl": 0.02305075153708458, "learning_rate": 3.3940675559617723e-07, "loss": -0.058, "num_tokens": 67925843.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.748533010482788, "sampling/importance_sampling_ratio/mean": 1.0003104209899902, "sampling/importance_sampling_ratio/min": 0.6171730160713196, "sampling/sampling_logp_difference/max": 0.5587770938873291, "sampling/sampling_logp_difference/mean": 0.013972347602248192, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 249.625, "completions/mean_terminated_length": 249.625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.5227569937705994, "epoch": 1.9362745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.016467373199546134, "kl": 0.02595914527773857, "learning_rate": 3.3873228426345757e-07, "loss": 0.0003, "num_tokens": 67954667.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4757620096206665, "sampling/importance_sampling_ratio/mean": 1.00020170211792, "sampling/importance_sampling_ratio/min": 0.5823780298233032, "sampling/sampling_logp_difference/max": 0.5406355857849121, "sampling/sampling_logp_difference/mean": 0.016594573855400085, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 275.3125, "completions/mean_terminated_length": 275.3125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.5715699195861816, "epoch": 1.9375, "frac_reward_zero_std": 0.75, "grad_norm": 0.701072689024406, "kl": 0.05752525106072426, "learning_rate": 3.380581402939841e-07, "loss": -0.0153, "num_tokens": 67985903.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4351998567581177, "sampling/importance_sampling_ratio/mean": 1.000058650970459, "sampling/importance_sampling_ratio/min": 0.6482405066490173, "sampling/sampling_logp_difference/max": 0.4334934949874878, "sampling/sampling_logp_difference/mean": 0.01749599725008011, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 374.1875, "completions/mean_terminated_length": 374.1875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.5673298835754395, "epoch": 1.9387254901960784, "frac_reward_zero_std": 0.5, "grad_norm": 0.9963796544231983, "kl": 0.029742876067757607, "learning_rate": 3.373843250562265e-07, "loss": -0.0264, "num_tokens": 68031019.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4478073120117188, "sampling/importance_sampling_ratio/mean": 0.9996123313903809, "sampling/importance_sampling_ratio/min": 0.6132709980010986, "sampling/sampling_logp_difference/max": 0.48894834518432617, "sampling/sampling_logp_difference/mean": 0.016682710498571396, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 278.0625, "completions/mean_terminated_length": 278.0625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.5086662769317627, "epoch": 1.9399509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.019772510518919272, "kl": 0.046213991940021515, "learning_rate": 3.3671083991798697e-07, "loss": 0.0004, "num_tokens": 68064175.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.421569585800171, "sampling/importance_sampling_ratio/mean": 1.0003182888031006, "sampling/importance_sampling_ratio/min": 0.7625324130058289, "sampling/sampling_logp_difference/max": 0.3517615795135498, "sampling/sampling_logp_difference/mean": 0.016471732407808304, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 308.9375, "completions/mean_terminated_length": 308.9375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.40237438678741455, "epoch": 1.9411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.014466048310701625, "kl": 0.018467843532562256, "learning_rate": 3.360376862463978e-07, "loss": 0.0002, "num_tokens": 68098059.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002237558364868, "sampling/importance_sampling_ratio/min": 0.6772358417510986, "sampling/sampling_logp_difference/max": 0.8445332050323486, "sampling/sampling_logp_difference/mean": 0.014027119614183903, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 306.0, "completions/mean_terminated_length": 306.0, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.4027760326862335, "epoch": 1.9424019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.6435938875696471, "kl": 0.022236516699194908, "learning_rate": 3.3536486540791823e-07, "loss": -0.0116, "num_tokens": 68133339.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3964879512786865, "sampling/importance_sampling_ratio/mean": 0.999901533126831, "sampling/importance_sampling_ratio/min": 0.7264013886451721, "sampling/sampling_logp_difference/max": 0.33396053314208984, "sampling/sampling_logp_difference/mean": 0.01271170936524868, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 336.265625, "completions/mean_terminated_length": 336.265625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.5313848257064819, "epoch": 1.9436274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 0.7616204466610809, "kl": 0.02818790264427662, "learning_rate": 3.3469237876833187e-07, "loss": -0.0071, "num_tokens": 68176540.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000687599182129, "sampling/importance_sampling_ratio/min": 0.6582149267196655, "sampling/sampling_logp_difference/max": 0.7445142269134521, "sampling/sampling_logp_difference/mean": 0.01558151189237833, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 343.65625, "completions/mean_terminated_length": 343.65625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.45206502079963684, "epoch": 1.9448529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.013033931414988199, "kl": 0.018497932702302933, "learning_rate": 3.340202276927442e-07, "loss": 0.0002, "num_tokens": 68218582.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4030307531356812, "sampling/importance_sampling_ratio/mean": 1.0000488758087158, "sampling/importance_sampling_ratio/min": 0.6850235462188721, "sampling/sampling_logp_difference/max": 0.37830209732055664, "sampling/sampling_logp_difference/mean": 0.012983627617359161, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1492.0, "completions/max_terminated_length": 1492.0, "completions/mean_length": 497.15625, "completions/mean_terminated_length": 497.15625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.5618832111358643, "epoch": 1.946078431372549, "frac_reward_zero_std": 0.5, "grad_norm": 0.7005122531867044, "kl": 0.02800927869975567, "learning_rate": 3.333484135455792e-07, "loss": -0.0501, "num_tokens": 68270512.0, "reward": 0.1875, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000916719436646, "sampling/importance_sampling_ratio/min": 0.6230705976486206, "sampling/sampling_logp_difference/max": 0.9457170963287354, "sampling/sampling_logp_difference/mean": 0.015618949197232723, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 284.765625, "completions/mean_terminated_length": 284.765625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.5263767838478088, "epoch": 1.9473039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.016271173817283748, "kl": 0.02384846843779087, "learning_rate": 3.326769376905769e-07, "loss": 0.0002, "num_tokens": 68322497.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4062432050704956, "sampling/importance_sampling_ratio/mean": 0.9996626973152161, "sampling/importance_sampling_ratio/min": 0.6504873633384705, "sampling/sampling_logp_difference/max": 0.43003344535827637, "sampling/sampling_logp_difference/mean": 0.01668081432580948, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 867.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 455.125, "completions/mean_terminated_length": 455.125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.46168357133865356, "epoch": 1.9485294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.02317132670926992, "kl": 0.01968373917043209, "learning_rate": 3.3200580149079083e-07, "loss": 0.0002, "num_tokens": 68375385.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4111099243164062, "sampling/importance_sampling_ratio/mean": 1.0000941753387451, "sampling/importance_sampling_ratio/min": 0.6546944975852966, "sampling/sampling_logp_difference/max": 0.4235866069793701, "sampling/sampling_logp_difference/mean": 0.012873737141489983, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1050.0, "completions/max_terminated_length": 1050.0, "completions/mean_length": 414.09375, "completions/mean_terminated_length": 414.09375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.629016637802124, "epoch": 1.9497549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 0.6494713214390222, "kl": 0.028472326695919037, "learning_rate": 3.31335006308585e-07, "loss": -0.0115, "num_tokens": 68417519.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.9463071823120117, "sampling/importance_sampling_ratio/mean": 1.0005357265472412, "sampling/importance_sampling_ratio/min": 0.6368793845176697, "sampling/sampling_logp_difference/max": 0.6659338474273682, "sampling/sampling_logp_difference/mean": 0.018462462350726128, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1145.0, "completions/max_terminated_length": 1145.0, "completions/mean_length": 396.65625, "completions/mean_terminated_length": 396.65625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.5906513929367065, "epoch": 1.9509803921568627, "frac_reward_zero_std": 0.25, "grad_norm": 1.0120344532119678, "kl": 0.04214811325073242, "learning_rate": 3.3066455350563115e-07, "loss": 0.0299, "num_tokens": 68458233.0, "reward": 0.40625, "reward_std": 0.6331988573074341, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4975483417510986, "sampling/importance_sampling_ratio/mean": 1.0000988245010376, "sampling/importance_sampling_ratio/min": 0.6394641995429993, "sampling/sampling_logp_difference/max": 0.447124719619751, "sampling/sampling_logp_difference/mean": 0.016466323286294937, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1238.0, "completions/max_terminated_length": 1238.0, "completions/mean_length": 427.203125, "completions/mean_terminated_length": 427.203125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.48179104924201965, "epoch": 1.9522058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 0.8145249613673906, "kl": 0.021749455481767654, "learning_rate": 3.29994444442906e-07, "loss": 0.0885, "num_tokens": 68503494.0, "reward": 0.625, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.6488267183303833, "sampling/importance_sampling_ratio/mean": 1.0002312660217285, "sampling/importance_sampling_ratio/min": 0.6384544372558594, "sampling/sampling_logp_difference/max": 0.5000638961791992, "sampling/sampling_logp_difference/mean": 0.014083778485655785, "step": 1593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 424.796875, "completions/mean_terminated_length": 424.796875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.5607149004936218, "epoch": 1.9534313725490198, "frac_reward_zero_std": 0.5, "grad_norm": 0.7975687056758776, "kl": 0.029872089624404907, "learning_rate": 3.2932468048068836e-07, "loss": -0.0462, "num_tokens": 68550185.0, "reward": -0.3125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": -0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.4206327199935913, "sampling/importance_sampling_ratio/mean": 0.9994009733200073, "sampling/importance_sampling_ratio/min": 0.695499062538147, "sampling/sampling_logp_difference/max": 0.3631255626678467, "sampling/sampling_logp_difference/mean": 0.015072531998157501, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 438.5625, "completions/mean_terminated_length": 438.5625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.42655491828918457, "epoch": 1.954656862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.5776896011018877, "kl": 0.022986333817243576, "learning_rate": 3.2865526297855694e-07, "loss": 0.0792, "num_tokens": 68599597.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000330924987793, "sampling/importance_sampling_ratio/min": 0.46197769045829773, "sampling/sampling_logp_difference/max": 0.9307906627655029, "sampling/sampling_logp_difference/mean": 0.012330269441008568, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1105.0, "completions/max_terminated_length": 1105.0, "completions/mean_length": 348.046875, "completions/mean_terminated_length": 348.046875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.5153805017471313, "epoch": 1.9558823529411766, "frac_reward_zero_std": 0.75, "grad_norm": 0.6414438204645521, "kl": 0.023002423346042633, "learning_rate": 3.2798619329538646e-07, "loss": -0.0493, "num_tokens": 68638032.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4640345573425293, "sampling/importance_sampling_ratio/mean": 0.9996954798698425, "sampling/importance_sampling_ratio/min": 0.6212019920349121, "sampling/sampling_logp_difference/max": 0.47609901428222656, "sampling/sampling_logp_difference/mean": 0.01601400598883629, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 390.46875, "completions/mean_terminated_length": 390.46875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.6212517023086548, "epoch": 1.9571078431372548, "frac_reward_zero_std": 0.5, "grad_norm": 0.9044279358306834, "kl": 0.03310666233301163, "learning_rate": 3.2731747278934623e-07, "loss": -0.0115, "num_tokens": 68683406.0, "reward": 0.21875, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.3266850709915161, "sampling/importance_sampling_ratio/mean": 1.000289797782898, "sampling/importance_sampling_ratio/min": 0.5910096764564514, "sampling/sampling_logp_difference/max": 0.5259228944778442, "sampling/sampling_logp_difference/mean": 0.016948755830526352, "step": 1597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1053.0, "completions/max_terminated_length": 1053.0, "completions/mean_length": 446.421875, "completions/mean_terminated_length": 446.421875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.4604445695877075, "epoch": 1.9583333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 0.49424712934075776, "kl": 0.018776264041662216, "learning_rate": 3.266491028178964e-07, "loss": 0.0396, "num_tokens": 68730969.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.6082907915115356, "sampling/importance_sampling_ratio/mean": 0.9999462366104126, "sampling/importance_sampling_ratio/min": 0.6547533273696899, "sampling/sampling_logp_difference/max": 0.4751720428466797, "sampling/sampling_logp_difference/mean": 0.01344958133995533, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 347.5625, "completions/mean_terminated_length": 347.5625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.40875816345214844, "epoch": 1.9595588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.011529968570697697, "kl": 0.018835127353668213, "learning_rate": 3.2598108473778595e-07, "loss": 0.0002, "num_tokens": 68770605.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4283710718154907, "sampling/importance_sampling_ratio/mean": 1.0003104209899902, "sampling/importance_sampling_ratio/min": 0.697778046131134, "sampling/sampling_logp_difference/max": 0.35985422134399414, "sampling/sampling_logp_difference/mean": 0.011908348649740219, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1195.0, "completions/max_terminated_length": 1195.0, "completions/mean_length": 436.3125, "completions/mean_terminated_length": 436.3125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.6424329280853271, "epoch": 1.9607843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 0.4957562332194898, "kl": 0.029608536511659622, "learning_rate": 3.253134199050489e-07, "loss": -0.0201, "num_tokens": 68816257.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.435015082359314, "sampling/importance_sampling_ratio/mean": 0.999704122543335, "sampling/importance_sampling_ratio/min": 0.6148654818534851, "sampling/sampling_logp_difference/max": 0.48635172843933105, "sampling/sampling_logp_difference/mean": 0.017087213695049286, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1109.0, "completions/max_terminated_length": 1109.0, "completions/mean_length": 392.546875, "completions/mean_terminated_length": 392.546875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.5117258429527283, "epoch": 1.9620098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 0.47979394610257947, "kl": 0.02599429339170456, "learning_rate": 3.2464610967500273e-07, "loss": 0.0409, "num_tokens": 68861492.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.4279710054397583, "sampling/importance_sampling_ratio/mean": 1.000414252281189, "sampling/importance_sampling_ratio/min": 0.682912290096283, "sampling/sampling_logp_difference/max": 0.38138890266418457, "sampling/sampling_logp_difference/mean": 0.015819476917386055, "step": 1601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 469.84375, "completions/mean_terminated_length": 469.84375, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "entropy": 0.49386677145957947, "epoch": 1.9632352941176472, "frac_reward_zero_std": 0.5, "grad_norm": 0.7012252425250007, "kl": 0.025976218283176422, "learning_rate": 3.239791554022449e-07, "loss": 0.0082, "num_tokens": 68910346.0, "reward": 0.53125, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4452589750289917, "sampling/importance_sampling_ratio/mean": 0.9997819066047668, "sampling/importance_sampling_ratio/min": 0.6958194971084595, "sampling/sampling_logp_difference/max": 0.368288516998291, "sampling/sampling_logp_difference/mean": 0.012814486399292946, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1177.0, "completions/max_terminated_length": 1177.0, "completions/mean_length": 413.390625, "completions/mean_terminated_length": 413.390625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.468494713306427, "epoch": 1.9644607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.011301512400275684, "kl": 0.019860263913869858, "learning_rate": 3.233125584406505e-07, "loss": 0.0002, "num_tokens": 68955923.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5551005601882935, "sampling/importance_sampling_ratio/mean": 1.0001389980316162, "sampling/importance_sampling_ratio/min": 0.3340384364128113, "sampling/sampling_logp_difference/max": 1.0964992046356201, "sampling/sampling_logp_difference/mean": 0.01414535567164421, "step": 1603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 456.078125, "completions/mean_terminated_length": 456.078125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.3868984580039978, "epoch": 1.965686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.009903916811632902, "kl": 0.018353629857301712, "learning_rate": 3.226463201433688e-07, "loss": 0.0002, "num_tokens": 69005272.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5071730613708496, "sampling/importance_sampling_ratio/mean": 1.0000190734863281, "sampling/importance_sampling_ratio/min": 0.6780698299407959, "sampling/sampling_logp_difference/max": 0.41023576259613037, "sampling/sampling_logp_difference/mean": 0.012178616598248482, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1625.0, "completions/max_terminated_length": 1625.0, "completions/mean_length": 427.09375, "completions/mean_terminated_length": 427.09375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.4857763946056366, "epoch": 1.9669117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.01317977715324995, "kl": 0.021790843456983566, "learning_rate": 3.219804418628216e-07, "loss": 0.0002, "num_tokens": 69053006.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4732927083969116, "sampling/importance_sampling_ratio/mean": 1.0003222227096558, "sampling/importance_sampling_ratio/min": 0.7066320776939392, "sampling/sampling_logp_difference/max": 0.3874998092651367, "sampling/sampling_logp_difference/mean": 0.014237549155950546, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1091.0, "completions/max_terminated_length": 1091.0, "completions/mean_length": 357.59375, "completions/mean_terminated_length": 357.59375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.4473327398300171, "epoch": 1.968137254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.025072858872832563, "kl": 0.023778272792696953, "learning_rate": 3.2131492495069965e-07, "loss": 0.0002, "num_tokens": 69097140.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5324664115905762, "sampling/importance_sampling_ratio/mean": 0.9997178316116333, "sampling/importance_sampling_ratio/min": 0.6548279523849487, "sampling/sampling_logp_difference/max": 0.4268784523010254, "sampling/sampling_logp_difference/mean": 0.012958992272615433, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1472.0, "completions/max_terminated_length": 1472.0, "completions/mean_length": 511.0625, "completions/mean_terminated_length": 511.0625, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.5224879384040833, "epoch": 1.969362745098039, "frac_reward_zero_std": 0.5, "grad_norm": 0.9305878527874581, "kl": 0.020585523918271065, "learning_rate": 3.206497707579598e-07, "loss": -0.0251, "num_tokens": 69149272.0, "reward": 0.375, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.8123060464859009, "sampling/importance_sampling_ratio/mean": 0.9998729228973389, "sampling/importance_sampling_ratio/min": 0.5532049536705017, "sampling/sampling_logp_difference/max": 0.5946000814437866, "sampling/sampling_logp_difference/mean": 0.013488843105733395, "step": 1607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 311.53125, "completions/mean_terminated_length": 311.53125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.4001462459564209, "epoch": 1.9705882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.011904545572411046, "kl": 0.020253822207450867, "learning_rate": 3.199849806348233e-07, "loss": 0.0002, "num_tokens": 69188010.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4772207736968994, "sampling/importance_sampling_ratio/mean": 1.0000611543655396, "sampling/importance_sampling_ratio/min": 0.6969862580299377, "sampling/sampling_logp_difference/max": 0.39016246795654297, "sampling/sampling_logp_difference/mean": 0.01269588340073824, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 317.21875, "completions/mean_terminated_length": 317.21875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.5440255999565125, "epoch": 1.971813725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.016273450201042355, "kl": 0.02837981842458248, "learning_rate": 3.1932055593077166e-07, "loss": 0.0003, "num_tokens": 69223240.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4585506916046143, "sampling/importance_sampling_ratio/mean": 0.9999441504478455, "sampling/importance_sampling_ratio/min": 0.6481513977050781, "sampling/sampling_logp_difference/max": 0.43363094329833984, "sampling/sampling_logp_difference/mean": 0.016609789803624153, "step": 1609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 363.46875, "completions/mean_terminated_length": 363.46875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.5095827579498291, "epoch": 1.9730392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 0.6230383191583009, "kl": 0.025747444480657578, "learning_rate": 3.186564979945453e-07, "loss": 0.0301, "num_tokens": 69265782.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.4753999710083008, "sampling/importance_sampling_ratio/mean": 1.0003809928894043, "sampling/importance_sampling_ratio/min": 0.5708780884742737, "sampling/sampling_logp_difference/max": 0.5605795383453369, "sampling/sampling_logp_difference/mean": 0.015539501793682575, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 433.03125, "completions/mean_terminated_length": 433.03125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.3953997492790222, "epoch": 1.9742647058823528, "frac_reward_zero_std": 1.0, "grad_norm": 0.011320254441255229, "kl": 0.01754176616668701, "learning_rate": 3.179928081741394e-07, "loss": 0.0002, "num_tokens": 69320328.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.461594581604004, "sampling/importance_sampling_ratio/mean": 1.00019109249115, "sampling/importance_sampling_ratio/min": 0.6979783773422241, "sampling/sampling_logp_difference/max": 0.3795280456542969, "sampling/sampling_logp_difference/mean": 0.011455480009317398, "step": 1611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 434.546875, "completions/mean_terminated_length": 434.546875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.5426819920539856, "epoch": 1.9754901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.4936465078434719, "kl": 0.021181629970669746, "learning_rate": 3.173294878168025e-07, "loss": 0.0189, "num_tokens": 69364987.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.5418909788131714, "sampling/importance_sampling_ratio/mean": 0.9995971918106079, "sampling/importance_sampling_ratio/min": 0.6919832229614258, "sampling/sampling_logp_difference/max": 0.43300962448120117, "sampling/sampling_logp_difference/mean": 0.014378528110682964, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1609.0, "completions/max_terminated_length": 1609.0, "completions/mean_length": 460.15625, "completions/mean_terminated_length": 460.15625, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.5778923034667969, "epoch": 1.9767156862745097, "frac_reward_zero_std": 0.75, "grad_norm": 0.585203470656192, "kl": 0.02636198326945305, "learning_rate": 3.166665382690327e-07, "loss": -0.0348, "num_tokens": 69413621.0, "reward": -0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": -0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.5784894227981567, "sampling/importance_sampling_ratio/mean": 0.9999513030052185, "sampling/importance_sampling_ratio/min": 0.7263087630271912, "sampling/sampling_logp_difference/max": 0.4564683437347412, "sampling/sampling_logp_difference/mean": 0.015545766800642014, "step": 1613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1283.0, "completions/max_terminated_length": 1283.0, "completions/mean_length": 425.0, "completions/mean_terminated_length": 425.0, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.4159718155860901, "epoch": 1.9779411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.9388593446832053, "kl": 0.022650057449936867, "learning_rate": 3.1600396087657586e-07, "loss": 0.1105, "num_tokens": 69456373.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4172642230987549, "sampling/importance_sampling_ratio/mean": 1.0002541542053223, "sampling/importance_sampling_ratio/min": 0.6893067359924316, "sampling/sampling_logp_difference/max": 0.3720688819885254, "sampling/sampling_logp_difference/mean": 0.012360403314232826, "step": 1614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1100.0, "completions/max_terminated_length": 1100.0, "completions/mean_length": 504.609375, "completions/mean_terminated_length": 504.609375, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "entropy": 0.549781084060669, "epoch": 1.9791666666666665, "frac_reward_zero_std": 0.5, "grad_norm": 1.1542774497558328, "kl": 0.02133595570921898, "learning_rate": 3.153417569844219e-07, "loss": -0.0171, "num_tokens": 69510972.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.7145750522613525, "sampling/importance_sampling_ratio/mean": 1.0000932216644287, "sampling/importance_sampling_ratio/min": 0.6653224229812622, "sampling/sampling_logp_difference/max": 0.5391652584075928, "sampling/sampling_logp_difference/mean": 0.014291539788246155, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1138.0, "completions/max_terminated_length": 1138.0, "completions/mean_length": 474.109375, "completions/mean_terminated_length": 474.109375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.5674899816513062, "epoch": 1.9803921568627452, "frac_reward_zero_std": 0.5, "grad_norm": 0.7080345424823736, "kl": 0.027309810742735863, "learning_rate": 3.1467992793680267e-07, "loss": 0.0325, "num_tokens": 69565507.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.7811157703399658, "sampling/importance_sampling_ratio/mean": 0.9999535083770752, "sampling/importance_sampling_ratio/min": 0.3444252014160156, "sampling/sampling_logp_difference/max": 1.0658783912658691, "sampling/sampling_logp_difference/mean": 0.01647529937326908, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 374.421875, "completions/mean_terminated_length": 374.421875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.4128122329711914, "epoch": 1.9816176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 0.5136184898964172, "kl": 0.021588144823908806, "learning_rate": 3.140184750771895e-07, "loss": -0.0209, "num_tokens": 69606430.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.464746117591858, "sampling/importance_sampling_ratio/mean": 0.999975860118866, "sampling/importance_sampling_ratio/min": 0.6727650165557861, "sampling/sampling_logp_difference/max": 0.3963592052459717, "sampling/sampling_logp_difference/mean": 0.012948977760970592, "step": 1617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 806.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 302.890625, "completions/mean_terminated_length": 302.890625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.47042524814605713, "epoch": 1.982843137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.015075224194254397, "kl": 0.029691431671380997, "learning_rate": 3.133573997482896e-07, "loss": 0.0003, "num_tokens": 69649015.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5745811462402344, "sampling/importance_sampling_ratio/mean": 0.9996337294578552, "sampling/importance_sampling_ratio/min": 0.7106504440307617, "sampling/sampling_logp_difference/max": 0.45398926734924316, "sampling/sampling_logp_difference/mean": 0.014758598059415817, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 377.34375, "completions/mean_terminated_length": 377.34375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.4164310097694397, "epoch": 1.9840686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.012046064058785864, "kl": 0.019147302955389023, "learning_rate": 3.1269670329204393e-07, "loss": 0.0002, "num_tokens": 69693245.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.421162486076355, "sampling/importance_sampling_ratio/mean": 1.0001134872436523, "sampling/importance_sampling_ratio/min": 0.6987019777297974, "sampling/sampling_logp_difference/max": 0.35853099822998047, "sampling/sampling_logp_difference/mean": 0.013210390694439411, "step": 1619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 870.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 347.984375, "completions/mean_terminated_length": 347.984375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.4158927798271179, "epoch": 1.9852941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.014228067938487089, "kl": 0.02242537960410118, "learning_rate": 3.1203638704962465e-07, "loss": 0.0002, "num_tokens": 69732796.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4651191234588623, "sampling/importance_sampling_ratio/mean": 0.9996147155761719, "sampling/importance_sampling_ratio/min": 0.7390772104263306, "sampling/sampling_logp_difference/max": 0.38193655014038086, "sampling/sampling_logp_difference/mean": 0.012506287544965744, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 357.1875, "completions/mean_terminated_length": 357.1875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.6388640999794006, "epoch": 1.9865196078431373, "frac_reward_zero_std": 1.0, "grad_norm": 0.05994528472917882, "kl": 0.046394769102334976, "learning_rate": 3.11376452361432e-07, "loss": 0.0004, "num_tokens": 69770680.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.519039273262024, "sampling/importance_sampling_ratio/mean": 0.9998978972434998, "sampling/importance_sampling_ratio/min": 0.21334338188171387, "sampling/sampling_logp_difference/max": 1.5448522567749023, "sampling/sampling_logp_difference/mean": 0.01800444722175598, "step": 1621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 324.0, "completions/mean_terminated_length": 324.0, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.43122610449790955, "epoch": 1.9877450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.012363516528937683, "kl": 0.018396897241473198, "learning_rate": 3.107169005670912e-07, "loss": 0.0002, "num_tokens": 69806712.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4658284187316895, "sampling/importance_sampling_ratio/mean": 1.000197410583496, "sampling/importance_sampling_ratio/min": 0.6622446179389954, "sampling/sampling_logp_difference/max": 0.41212034225463867, "sampling/sampling_logp_difference/mean": 0.014052336104214191, "step": 1622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 296.390625, "completions/mean_terminated_length": 296.390625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.5758157968521118, "epoch": 1.9889705882352942, "frac_reward_zero_std": 0.75, "grad_norm": 0.7958271691328472, "kl": 0.03611079603433609, "learning_rate": 3.100577330054508e-07, "loss": 0.0369, "num_tokens": 69846497.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6254827976226807, "sampling/importance_sampling_ratio/mean": 1.0003635883331299, "sampling/importance_sampling_ratio/min": 0.6546944975852966, "sampling/sampling_logp_difference/max": 0.4858049154281616, "sampling/sampling_logp_difference/mean": 0.016626061871647835, "step": 1623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1089.0, "completions/max_terminated_length": 1089.0, "completions/mean_length": 427.765625, "completions/mean_terminated_length": 427.765625, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.6455445885658264, "epoch": 1.9901960784313726, "frac_reward_zero_std": 0.5, "grad_norm": 0.7231910523035077, "kl": 0.03188615292310715, "learning_rate": 3.0939895101457914e-07, "loss": -0.0567, "num_tokens": 69889218.0, "reward": 0.28125, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.3646667003631592, "sampling/importance_sampling_ratio/mean": 1.0002870559692383, "sampling/importance_sampling_ratio/min": 0.6207318305969238, "sampling/sampling_logp_difference/max": 0.4768562316894531, "sampling/sampling_logp_difference/mean": 0.016469012945890427, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1375.0, "completions/max_terminated_length": 1375.0, "completions/mean_length": 463.03125, "completions/mean_terminated_length": 463.03125, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.3391924500465393, "epoch": 1.991421568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.010688082310530903, "kl": 0.018620464950799942, "learning_rate": 3.087405559317622e-07, "loss": 0.0002, "num_tokens": 69936340.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7948840856552124, "sampling/importance_sampling_ratio/mean": 0.9999966621398926, "sampling/importance_sampling_ratio/min": 0.662239134311676, "sampling/sampling_logp_difference/max": 0.5849404335021973, "sampling/sampling_logp_difference/mean": 0.010126523673534393, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 434.328125, "completions/mean_terminated_length": 434.328125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.4668295979499817, "epoch": 1.9926470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.012479568240322304, "kl": 0.019077584147453308, "learning_rate": 3.0808254909349986e-07, "loss": 0.0002, "num_tokens": 69981161.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996905326843262, "sampling/importance_sampling_ratio/min": 0.637320339679718, "sampling/sampling_logp_difference/max": 0.792661190032959, "sampling/sampling_logp_difference/mean": 0.013372006826102734, "step": 1626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 400.3125, "completions/mean_terminated_length": 400.3125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.49644458293914795, "epoch": 1.9938725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.015546990986956031, "kl": 0.02163894847035408, "learning_rate": 3.0742493183550454e-07, "loss": 0.0002, "num_tokens": 70028637.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6396737098693848, "sampling/importance_sampling_ratio/mean": 1.000227689743042, "sampling/importance_sampling_ratio/min": 0.6823206543922424, "sampling/sampling_logp_difference/max": 0.49449729919433594, "sampling/sampling_logp_difference/mean": 0.014636281877756119, "step": 1627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1371.0, "completions/max_terminated_length": 1371.0, "completions/mean_length": 464.1875, "completions/mean_terminated_length": 464.1875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.6991268992424011, "epoch": 1.9950980392156863, "frac_reward_zero_std": 0.5, "grad_norm": 0.6553528561150402, "kl": 0.03818086162209511, "learning_rate": 3.0676770549269786e-07, "loss": 0.0924, "num_tokens": 70079593.0, "reward": 0.65625, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.6603423357009888, "sampling/importance_sampling_ratio/mean": 0.9998089075088501, "sampling/importance_sampling_ratio/min": 0.4069863557815552, "sampling/sampling_logp_difference/max": 0.8989756107330322, "sampling/sampling_logp_difference/mean": 0.01829574815928936, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 347.5625, "completions/mean_terminated_length": 347.5625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.5262171626091003, "epoch": 1.9963235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.014334574542052806, "kl": 0.02268335595726967, "learning_rate": 3.0611087139920717e-07, "loss": 0.0002, "num_tokens": 70118557.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5073364973068237, "sampling/importance_sampling_ratio/mean": 0.9999362230300903, "sampling/importance_sampling_ratio/min": 0.7005050778388977, "sampling/sampling_logp_difference/max": 0.41034412384033203, "sampling/sampling_logp_difference/mean": 0.015955574810504913, "step": 1629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1939.0, "completions/max_terminated_length": 1939.0, "completions/mean_length": 444.015625, "completions/mean_terminated_length": 444.015625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.4416491985321045, "epoch": 1.9975490196078431, "frac_reward_zero_std": 1.0, "grad_norm": 0.01585286729788227, "kl": 0.020589584484696388, "learning_rate": 3.054544308883643e-07, "loss": 0.0002, "num_tokens": 70166238.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4174822568893433, "sampling/importance_sampling_ratio/mean": 1.0001423358917236, "sampling/importance_sampling_ratio/min": 0.6944992542266846, "sampling/sampling_logp_difference/max": 0.3645641803741455, "sampling/sampling_logp_difference/mean": 0.012486220337450504, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 364.984375, "completions/mean_terminated_length": 364.984375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.49764764308929443, "epoch": 1.9987745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.012501175636537807, "kl": 0.01903236284852028, "learning_rate": 3.0479838529270186e-07, "loss": 0.0002, "num_tokens": 70206685.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4444838762283325, "sampling/importance_sampling_ratio/mean": 1.0000386238098145, "sampling/importance_sampling_ratio/min": 0.7244148850440979, "sampling/sampling_logp_difference/max": 0.3677520751953125, "sampling/sampling_logp_difference/mean": 0.01472003385424614, "step": 1631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1551.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 409.046875, "completions/mean_terminated_length": 409.046875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.48040851950645447, "epoch": 2.0, "frac_reward_zero_std": 0.75, "grad_norm": 0.5759847145585357, "kl": 0.02159193530678749, "learning_rate": 3.0414273594395103e-07, "loss": -0.1198, "num_tokens": 70251648.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.4333919286727905, "sampling/importance_sampling_ratio/mean": 0.999731719493866, "sampling/importance_sampling_ratio/min": 0.6368668079376221, "sampling/sampling_logp_difference/max": 0.45119476318359375, "sampling/sampling_logp_difference/mean": 0.014523344114422798, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1793.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 450.25, "completions/mean_terminated_length": 450.25, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "entropy": 0.5399717092514038, "epoch": 2.0012254901960786, "frac_reward_zero_std": 0.75, "grad_norm": 0.6277417858221161, "kl": 0.025396525859832764, "learning_rate": 3.034874841730382e-07, "loss": 0.1083, "num_tokens": 70302304.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999253749847412, "sampling/importance_sampling_ratio/min": 0.5177874565124512, "sampling/sampling_logp_difference/max": 0.9057252407073975, "sampling/sampling_logp_difference/mean": 0.014912931248545647, "step": 1633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 853.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 354.109375, "completions/mean_terminated_length": 354.109375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.6280057430267334, "epoch": 2.002450980392157, "frac_reward_zero_std": 0.5, "grad_norm": 0.7871801041128128, "kl": 0.041916366666555405, "learning_rate": 3.0283263131008307e-07, "loss": 0.0171, "num_tokens": 70343063.0, "reward": 0.59375, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.4053069353103638, "sampling/importance_sampling_ratio/mean": 1.0005290508270264, "sampling/importance_sampling_ratio/min": 0.6452286243438721, "sampling/sampling_logp_difference/max": 0.43815064430236816, "sampling/sampling_logp_difference/mean": 0.017849110066890717, "step": 1634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 370.953125, "completions/mean_terminated_length": 370.953125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.5916893482208252, "epoch": 2.0036764705882355, "frac_reward_zero_std": 0.75, "grad_norm": 0.4281142873653527, "kl": 0.03201092407107353, "learning_rate": 3.0217817868439545e-07, "loss": -0.0104, "num_tokens": 70381428.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4373537302017212, "sampling/importance_sampling_ratio/mean": 1.0001351833343506, "sampling/importance_sampling_ratio/min": 0.7303296327590942, "sampling/sampling_logp_difference/max": 0.36280369758605957, "sampling/sampling_logp_difference/mean": 0.016279559582471848, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 257.796875, "completions/mean_terminated_length": 257.796875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.4618203341960907, "epoch": 2.0049019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.019729924463763898, "kl": 0.02676289528608322, "learning_rate": 3.015241276244729e-07, "loss": 0.0003, "num_tokens": 70415143.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5985220670700073, "sampling/importance_sampling_ratio/mean": 0.9996711015701294, "sampling/importance_sampling_ratio/min": 0.6502892971038818, "sampling/sampling_logp_difference/max": 0.46907949447631836, "sampling/sampling_logp_difference/mean": 0.014237932860851288, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 331.65625, "completions/mean_terminated_length": 331.65625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.4437766671180725, "epoch": 2.0061274509803924, "frac_reward_zero_std": 1.0, "grad_norm": 0.012456698262222904, "kl": 0.018731314688920975, "learning_rate": 3.0087047945799724e-07, "loss": 0.0002, "num_tokens": 70450913.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3968743085861206, "sampling/importance_sampling_ratio/mean": 1.00029456615448, "sampling/importance_sampling_ratio/min": 0.6654497981071472, "sampling/sampling_logp_difference/max": 0.40729212760925293, "sampling/sampling_logp_difference/mean": 0.01397369522601366, "step": 1637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 379.046875, "completions/mean_terminated_length": 379.046875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.44743669033050537, "epoch": 2.0073529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.752968412949984, "kl": 0.029897525906562805, "learning_rate": 3.002172355118331e-07, "loss": 0.0015, "num_tokens": 70494948.0, "reward": 0.71875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.402601957321167, "sampling/importance_sampling_ratio/mean": 0.9997380971908569, "sampling/importance_sampling_ratio/min": 0.7289897799491882, "sampling/sampling_logp_difference/max": 0.3383290767669678, "sampling/sampling_logp_difference/mean": 0.012863595969974995, "step": 1638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1234.0, "completions/max_terminated_length": 1234.0, "completions/mean_length": 496.046875, "completions/mean_terminated_length": 496.046875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.6332497000694275, "epoch": 2.008578431372549, "frac_reward_zero_std": 0.5, "grad_norm": 0.8178819274035075, "kl": 0.02778559923171997, "learning_rate": 2.995643971120243e-07, "loss": 0.071, "num_tokens": 70545815.0, "reward": 0.625, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.5865206718444824, "sampling/importance_sampling_ratio/mean": 0.9996142387390137, "sampling/importance_sampling_ratio/min": 0.5942769646644592, "sampling/sampling_logp_difference/max": 0.5204098224639893, "sampling/sampling_logp_difference/mean": 0.016769861802458763, "step": 1639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 286.625, "completions/mean_terminated_length": 286.625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.4607362747192383, "epoch": 2.0098039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.017102159912059694, "kl": 0.027709783986210823, "learning_rate": 2.9891196558379126e-07, "loss": 0.0003, "num_tokens": 70581455.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998483657836914, "sampling/importance_sampling_ratio/min": 0.49429985880851746, "sampling/sampling_logp_difference/max": 0.7329187393188477, "sampling/sampling_logp_difference/mean": 0.0154039915651083, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 362.796875, "completions/mean_terminated_length": 362.796875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.46992388367652893, "epoch": 2.011029411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.01447421200691203, "kl": 0.01849254220724106, "learning_rate": 2.9825994225152884e-07, "loss": 0.0002, "num_tokens": 70620962.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4231518507003784, "sampling/importance_sampling_ratio/mean": 0.9995626211166382, "sampling/importance_sampling_ratio/min": 0.6092696189880371, "sampling/sampling_logp_difference/max": 0.49549436569213867, "sampling/sampling_logp_difference/mean": 0.014541897922754288, "step": 1641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 379.6875, "completions/mean_terminated_length": 379.6875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.45539700984954834, "epoch": 2.0122549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.012755120948274754, "kl": 0.0219133161008358, "learning_rate": 2.976083284388031e-07, "loss": 0.0002, "num_tokens": 70663630.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.43639075756073, "sampling/importance_sampling_ratio/mean": 0.9998008012771606, "sampling/importance_sampling_ratio/min": 0.5709230303764343, "sampling/sampling_logp_difference/max": 0.5605008602142334, "sampling/sampling_logp_difference/mean": 0.013878943398594856, "step": 1642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 752.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 388.6875, "completions/mean_terminated_length": 388.6875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.5114088654518127, "epoch": 2.013480392156863, "frac_reward_zero_std": 0.5, "grad_norm": 0.9061218606118082, "kl": 0.025953814387321472, "learning_rate": 2.9695712546834885e-07, "loss": 0.0222, "num_tokens": 70716154.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.408843755722046, "sampling/importance_sampling_ratio/mean": 0.9995434284210205, "sampling/importance_sampling_ratio/min": 0.6640582084655762, "sampling/sampling_logp_difference/max": 0.40938544273376465, "sampling/sampling_logp_difference/mean": 0.014701539650559425, "step": 1643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 757.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 398.265625, "completions/mean_terminated_length": 398.265625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.45367592573165894, "epoch": 2.014705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.015073384745921086, "kl": 0.021007951349020004, "learning_rate": 2.9630633466206655e-07, "loss": 0.0002, "num_tokens": 70762683.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003139972686768, "sampling/importance_sampling_ratio/min": 0.6286388635635376, "sampling/sampling_logp_difference/max": 1.1088426113128662, "sampling/sampling_logp_difference/mean": 0.014567296952009201, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 259.65625, "completions/mean_terminated_length": 259.65625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.5590072870254517, "epoch": 2.0159313725490198, "frac_reward_zero_std": 1.0, "grad_norm": 0.018744822485738992, "kl": 0.02554461732506752, "learning_rate": 2.9565595734102043e-07, "loss": 0.0003, "num_tokens": 70796453.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5194146633148193, "sampling/importance_sampling_ratio/mean": 0.9997324347496033, "sampling/importance_sampling_ratio/min": 0.4553687274456024, "sampling/sampling_logp_difference/max": 0.7866477966308594, "sampling/sampling_logp_difference/mean": 0.017640015110373497, "step": 1645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 330.0625, "completions/mean_terminated_length": 330.0625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.5241436958312988, "epoch": 2.017156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.683494154524939, "kl": 0.03207667917013168, "learning_rate": 2.950059948254355e-07, "loss": 0.0842, "num_tokens": 70835113.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6019011735916138, "sampling/importance_sampling_ratio/mean": 1.0004627704620361, "sampling/importance_sampling_ratio/min": 0.6417599320411682, "sampling/sampling_logp_difference/max": 0.4711911678314209, "sampling/sampling_logp_difference/mean": 0.016941703855991364, "step": 1646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1048.0, "completions/max_terminated_length": 1048.0, "completions/mean_length": 410.46875, "completions/mean_terminated_length": 410.46875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.4888048470020294, "epoch": 2.0183823529411766, "frac_reward_zero_std": 0.75, "grad_norm": 0.4666068235150727, "kl": 0.018854739144444466, "learning_rate": 2.943564484346943e-07, "loss": 0.0156, "num_tokens": 70881495.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6221691370010376, "sampling/importance_sampling_ratio/mean": 1.0003249645233154, "sampling/importance_sampling_ratio/min": 0.6797688007354736, "sampling/sampling_logp_difference/max": 0.4837641716003418, "sampling/sampling_logp_difference/mean": 0.013611975125968456, "step": 1647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 303.984375, "completions/mean_terminated_length": 303.984375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.5027990341186523, "epoch": 2.019607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 0.6509969278537756, "kl": 0.025343406945466995, "learning_rate": 2.937073194873348e-07, "loss": 0.0019, "num_tokens": 70919782.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.6499607563018799, "sampling/importance_sampling_ratio/mean": 0.9998193979263306, "sampling/importance_sampling_ratio/min": 0.6678634285926819, "sampling/sampling_logp_difference/max": 0.5007514953613281, "sampling/sampling_logp_difference/mean": 0.015301953069865704, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 396.046875, "completions/mean_terminated_length": 396.046875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.6028868556022644, "epoch": 2.0208333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 0.47026305944883773, "kl": 0.029583008959889412, "learning_rate": 2.930586093010477e-07, "loss": -0.0083, "num_tokens": 70960905.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.973357915878296, "sampling/importance_sampling_ratio/mean": 0.9999582767486572, "sampling/importance_sampling_ratio/min": 0.6132530570030212, "sampling/sampling_logp_difference/max": 0.6797366142272949, "sampling/sampling_logp_difference/mean": 0.01639951765537262, "step": 1649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 282.984375, "completions/mean_terminated_length": 282.984375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.5866172313690186, "epoch": 2.0220588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.8974701647309327, "kl": 0.041347797960042953, "learning_rate": 2.9241031919267363e-07, "loss": -0.0201, "num_tokens": 70993096.0, "reward": 0.15625, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.7264935970306396, "sampling/importance_sampling_ratio/mean": 1.000060796737671, "sampling/importance_sampling_ratio/min": 0.6219937801361084, "sampling/sampling_logp_difference/max": 0.5460925102233887, "sampling/sampling_logp_difference/mean": 0.017791874706745148, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 783.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 447.703125, "completions/mean_terminated_length": 447.703125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "entropy": 0.6162521839141846, "epoch": 2.0232843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 0.5139294546321354, "kl": 0.0369582325220108, "learning_rate": 2.917624504782006e-07, "loss": 0.0011, "num_tokens": 71046293.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.635053277015686, "sampling/importance_sampling_ratio/mean": 1.0000710487365723, "sampling/importance_sampling_ratio/min": 0.6298444271087646, "sampling/sampling_logp_difference/max": 0.49167537689208984, "sampling/sampling_logp_difference/mean": 0.01633651740849018, "step": 1651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 320.859375, "completions/mean_terminated_length": 320.859375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.460366815328598, "epoch": 2.0245098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.013327620973985654, "kl": 0.021539263427257538, "learning_rate": 2.911150044727605e-07, "loss": 0.0002, "num_tokens": 71089916.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6496676206588745, "sampling/importance_sampling_ratio/mean": 0.9999909400939941, "sampling/importance_sampling_ratio/min": 0.6931828260421753, "sampling/sampling_logp_difference/max": 0.5005738735198975, "sampling/sampling_logp_difference/mean": 0.014591380022466183, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1804.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 370.796875, "completions/mean_terminated_length": 370.796875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.4724261164665222, "epoch": 2.025735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.025778855681634488, "kl": 0.023569460958242416, "learning_rate": 2.9046798249062824e-07, "loss": 0.0002, "num_tokens": 71134559.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5630115270614624, "sampling/importance_sampling_ratio/mean": 0.9999353289604187, "sampling/importance_sampling_ratio/min": 0.4309254288673401, "sampling/sampling_logp_difference/max": 0.841820240020752, "sampling/sampling_logp_difference/mean": 0.015683360397815704, "step": 1653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 357.09375, "completions/mean_terminated_length": 357.09375, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.5104557275772095, "epoch": 2.0269607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.016816050885971486, "kl": 0.025217680260539055, "learning_rate": 2.898213858452173e-07, "loss": 0.0003, "num_tokens": 71176485.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.326668381690979, "sampling/importance_sampling_ratio/mean": 1.000131607055664, "sampling/importance_sampling_ratio/min": 0.6968556642532349, "sampling/sampling_logp_difference/max": 0.3611769676208496, "sampling/sampling_logp_difference/mean": 0.014666920527815819, "step": 1654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 304.96875, "completions/mean_terminated_length": 304.96875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3904092609882355, "epoch": 2.028186274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.013771960071334464, "kl": 0.025198526680469513, "learning_rate": 2.891752158490778e-07, "loss": 0.0002, "num_tokens": 71213507.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6494903564453125, "sampling/importance_sampling_ratio/mean": 1.0002646446228027, "sampling/importance_sampling_ratio/min": 0.7352858185768127, "sampling/sampling_logp_difference/max": 0.5004663467407227, "sampling/sampling_logp_difference/mean": 0.012433291412889957, "step": 1655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 511.90625, "completions/mean_terminated_length": 511.90625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.5100662112236023, "epoch": 2.0294117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 0.6455059487288584, "kl": 0.01853322423994541, "learning_rate": 2.8852947381389405e-07, "loss": 0.0095, "num_tokens": 71266077.0, "reward": 0.375, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.5535160303115845, "sampling/importance_sampling_ratio/mean": 0.9999561309814453, "sampling/importance_sampling_ratio/min": 0.6725755333900452, "sampling/sampling_logp_difference/max": 0.4405207633972168, "sampling/sampling_logp_difference/mean": 0.013641925528645515, "step": 1656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1326.0, "completions/max_terminated_length": 1326.0, "completions/mean_length": 451.671875, "completions/mean_terminated_length": 451.671875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.45752230286598206, "epoch": 2.030637254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.5688255769357334, "kl": 0.021422475576400757, "learning_rate": 2.8788416105048117e-07, "loss": 0.0841, "num_tokens": 71317560.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4063969850540161, "sampling/importance_sampling_ratio/mean": 1.0001459121704102, "sampling/importance_sampling_ratio/min": 0.5896305441856384, "sampling/sampling_logp_difference/max": 0.5282591581344604, "sampling/sampling_logp_difference/mean": 0.014265276491641998, "step": 1657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1239.0, "completions/max_terminated_length": 1239.0, "completions/mean_length": 545.0, "completions/mean_terminated_length": 545.0, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.4229567348957062, "epoch": 2.031862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.012426813771256848, "kl": 0.01589692384004593, "learning_rate": 2.8723927886878396e-07, "loss": 0.0002, "num_tokens": 71371512.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4700850248336792, "sampling/importance_sampling_ratio/mean": 1.0001275539398193, "sampling/importance_sampling_ratio/min": 0.6032107472419739, "sampling/sampling_logp_difference/max": 0.5054886341094971, "sampling/sampling_logp_difference/mean": 0.0124749056994915, "step": 1658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 210.671875, "completions/mean_terminated_length": 210.671875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.37372326850891113, "epoch": 2.0330882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.016146968597434613, "kl": 0.021619969978928566, "learning_rate": 2.865948285778713e-07, "loss": 0.0002, "num_tokens": 71397155.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5327163934707642, "sampling/importance_sampling_ratio/mean": 1.000433325767517, "sampling/importance_sampling_ratio/min": 0.7067931294441223, "sampling/sampling_logp_difference/max": 0.42704153060913086, "sampling/sampling_logp_difference/mean": 0.01401694305241108, "step": 1659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 323.0625, "completions/mean_terminated_length": 323.0625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.46809399127960205, "epoch": 2.034313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.6563470414090191, "kl": 0.02382962964475155, "learning_rate": 2.8595081148593737e-07, "loss": -0.0228, "num_tokens": 71435767.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3653180599212646, "sampling/importance_sampling_ratio/mean": 0.9998935461044312, "sampling/importance_sampling_ratio/min": 0.6078891754150391, "sampling/sampling_logp_difference/max": 0.49776268005371094, "sampling/sampling_logp_difference/mean": 0.013508414849638939, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 279.828125, "completions/mean_terminated_length": 279.828125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.544167160987854, "epoch": 2.0355392156862746, "frac_reward_zero_std": 0.5, "grad_norm": 0.9907689648603099, "kl": 0.03970886766910553, "learning_rate": 2.8530722890029534e-07, "loss": 0.0352, "num_tokens": 71469900.0, "reward": 0.65625, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.6002883911132812, "sampling/importance_sampling_ratio/mean": 1.0004212856292725, "sampling/importance_sampling_ratio/min": 0.6172181367874146, "sampling/sampling_logp_difference/max": 0.4825327396392822, "sampling/sampling_logp_difference/mean": 0.01691768318414688, "step": 1661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 812.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 306.21875, "completions/mean_terminated_length": 306.21875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.5454210042953491, "epoch": 2.036764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8816637278691077, "kl": 0.045668236911296844, "learning_rate": 2.8466408212737776e-07, "loss": 0.0566, "num_tokens": 71504842.0, "reward": 0.84375, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.4053096771240234, "sampling/importance_sampling_ratio/mean": 0.9998595118522644, "sampling/importance_sampling_ratio/min": 0.6987270712852478, "sampling/sampling_logp_difference/max": 0.35849499702453613, "sampling/sampling_logp_difference/mean": 0.01540539599955082, "step": 1662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1043.0, "completions/max_terminated_length": 1043.0, "completions/mean_length": 391.640625, "completions/mean_terminated_length": 391.640625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.44599246978759766, "epoch": 2.0379901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.5651158839564806, "kl": 0.021035362035036087, "learning_rate": 2.840213724727315e-07, "loss": -0.0093, "num_tokens": 71546707.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5899444818496704, "sampling/importance_sampling_ratio/mean": 1.0000982284545898, "sampling/importance_sampling_ratio/min": 0.4472723603248596, "sampling/sampling_logp_difference/max": 0.8045876026153564, "sampling/sampling_logp_difference/mean": 0.0144585482776165, "step": 1663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 897.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 362.390625, "completions/mean_terminated_length": 362.390625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.4464365839958191, "epoch": 2.0392156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.014361395973328108, "kl": 0.01992211863398552, "learning_rate": 2.8337910124101625e-07, "loss": 0.0002, "num_tokens": 71585964.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.504683494567871, "sampling/importance_sampling_ratio/mean": 0.9998586773872375, "sampling/importance_sampling_ratio/min": 0.6985891461372375, "sampling/sampling_logp_difference/max": 0.40858256816864014, "sampling/sampling_logp_difference/mean": 0.013734648004174232, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 399.328125, "completions/mean_terminated_length": 399.328125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.652000904083252, "epoch": 2.0404411764705883, "frac_reward_zero_std": 0.25, "grad_norm": 1.0713952820235801, "kl": 0.052682723850011826, "learning_rate": 2.8273726973600254e-07, "loss": 0.0291, "num_tokens": 71632465.0, "reward": 0.53125, "reward_std": 0.5986068248748779, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5652927160263062, "sampling/importance_sampling_ratio/mean": 1.0000568628311157, "sampling/importance_sampling_ratio/min": 0.5372384190559387, "sampling/sampling_logp_difference/max": 0.6213133335113525, "sampling/sampling_logp_difference/mean": 0.01761525869369507, "step": 1665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 390.421875, "completions/mean_terminated_length": 390.421875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.4944165349006653, "epoch": 2.0416666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.012400777436796389, "kl": 0.01895260252058506, "learning_rate": 2.8209587926056687e-07, "loss": 0.0002, "num_tokens": 71677980.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.370939016342163, "sampling/importance_sampling_ratio/mean": 1.0001224279403687, "sampling/importance_sampling_ratio/min": 0.6173245906829834, "sampling/sampling_logp_difference/max": 0.4823603630065918, "sampling/sampling_logp_difference/mean": 0.014763755723834038, "step": 1666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 321.59375, "completions/mean_terminated_length": 321.59375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.5036807060241699, "epoch": 2.042892156862745, "frac_reward_zero_std": 0.75, "grad_norm": 0.7416514671095708, "kl": 0.029038792476058006, "learning_rate": 2.8145493111669183e-07, "loss": 0.0008, "num_tokens": 71714338.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4697530269622803, "sampling/importance_sampling_ratio/mean": 0.999809741973877, "sampling/importance_sampling_ratio/min": 0.5573611855506897, "sampling/sampling_logp_difference/max": 0.5845417976379395, "sampling/sampling_logp_difference/mean": 0.015143305994570255, "step": 1667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 899.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 402.90625, "completions/mean_terminated_length": 402.90625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.4379514753818512, "epoch": 2.0441176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.015657485333062004, "kl": 0.02224244736135006, "learning_rate": 2.808144266054612e-07, "loss": 0.0002, "num_tokens": 71760348.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000782012939453, "sampling/importance_sampling_ratio/min": 0.6147505044937134, "sampling/sampling_logp_difference/max": 0.7519397735595703, "sampling/sampling_logp_difference/mean": 0.01278570108115673, "step": 1668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1029.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 379.03125, "completions/mean_terminated_length": 379.03125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.4584651589393616, "epoch": 2.045343137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.5328943134966708, "kl": 0.032605454325675964, "learning_rate": 2.80174367027059e-07, "loss": 0.0333, "num_tokens": 71799230.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6173615455627441, "sampling/importance_sampling_ratio/mean": 1.000399112701416, "sampling/importance_sampling_ratio/min": 0.6937639713287354, "sampling/sampling_logp_difference/max": 0.48079609870910645, "sampling/sampling_logp_difference/mean": 0.013999211601912975, "step": 1669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 293.328125, "completions/mean_terminated_length": 293.328125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.5604232549667358, "epoch": 2.0465686274509802, "frac_reward_zero_std": 0.75, "grad_norm": 0.6903792907607764, "kl": 0.0411398708820343, "learning_rate": 2.795347536807653e-07, "loss": 0.032, "num_tokens": 71832131.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.6490923166275024, "sampling/importance_sampling_ratio/mean": 1.000320553779602, "sampling/importance_sampling_ratio/min": 0.6110190749168396, "sampling/sampling_logp_difference/max": 0.5002250671386719, "sampling/sampling_logp_difference/mean": 0.017146199941635132, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 378.8125, "completions/mean_terminated_length": 378.8125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.5921155214309692, "epoch": 2.047794117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.013831445503894, "kl": 0.022434961050748825, "learning_rate": 2.7889558786495455e-07, "loss": 0.0358, "num_tokens": 71872679.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4061676263809204, "sampling/importance_sampling_ratio/mean": 1.0002162456512451, "sampling/importance_sampling_ratio/min": 0.639398455619812, "sampling/sampling_logp_difference/max": 0.44722747802734375, "sampling/sampling_logp_difference/mean": 0.0170211773365736, "step": 1671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 325.1875, "completions/mean_terminated_length": 325.1875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.48364710807800293, "epoch": 2.049019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.02283680045141435, "kl": 0.03426671773195267, "learning_rate": 2.782568708770933e-07, "loss": 0.0004, "num_tokens": 71911299.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5135303735733032, "sampling/importance_sampling_ratio/mean": 0.9998064041137695, "sampling/importance_sampling_ratio/min": 0.6200933456420898, "sampling/sampling_logp_difference/max": 0.47788524627685547, "sampling/sampling_logp_difference/mean": 0.015027796849608421, "step": 1672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 385.609375, "completions/mean_terminated_length": 385.609375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.44634225964546204, "epoch": 2.0502450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.6510528059344495, "kl": 0.019537167623639107, "learning_rate": 2.7761860401373627e-07, "loss": 0.0048, "num_tokens": 71953370.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003397464752197, "sampling/importance_sampling_ratio/min": 0.6730563640594482, "sampling/sampling_logp_difference/max": 0.7699360847473145, "sampling/sampling_logp_difference/mean": 0.013676985166966915, "step": 1673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1119.0, "completions/max_terminated_length": 1119.0, "completions/mean_length": 419.046875, "completions/mean_terminated_length": 419.046875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.460202693939209, "epoch": 2.051470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9044489320337761, "kl": 0.024155616760253906, "learning_rate": 2.7698078857052474e-07, "loss": 0.0702, "num_tokens": 71994525.0, "reward": 0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4753797054290771, "sampling/importance_sampling_ratio/mean": 0.9996058940887451, "sampling/importance_sampling_ratio/min": 0.6134172677993774, "sampling/sampling_logp_difference/max": 0.4887099266052246, "sampling/sampling_logp_difference/mean": 0.013344889506697655, "step": 1674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 319.046875, "completions/mean_terminated_length": 319.046875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.46803611516952515, "epoch": 2.0526960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 0.5887793191016403, "kl": 0.02319633588194847, "learning_rate": 2.763434258421836e-07, "loss": 0.0059, "num_tokens": 72032720.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4164409637451172, "sampling/importance_sampling_ratio/mean": 1.0002412796020508, "sampling/importance_sampling_ratio/min": 0.6957191228866577, "sampling/sampling_logp_difference/max": 0.3628091812133789, "sampling/sampling_logp_difference/mean": 0.014581522904336452, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 309.421875, "completions/mean_terminated_length": 309.421875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.39879143238067627, "epoch": 2.053921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.01622000475623947, "kl": 0.020731668919324875, "learning_rate": 2.757065171225192e-07, "loss": 0.0002, "num_tokens": 72067403.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.511767864227295, "sampling/importance_sampling_ratio/mean": 1.0003325939178467, "sampling/importance_sampling_ratio/min": 0.6891030073165894, "sampling/sampling_logp_difference/max": 0.41327977180480957, "sampling/sampling_logp_difference/mean": 0.01275918073952198, "step": 1676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1605.0, "completions/max_terminated_length": 1605.0, "completions/mean_length": 424.421875, "completions/mean_terminated_length": 424.421875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.4999920129776001, "epoch": 2.0551470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.5819743943293305, "kl": 0.0202486515045166, "learning_rate": 2.750700637044155e-07, "loss": -0.036, "num_tokens": 72110870.0, "reward": -0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": -0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.4156907796859741, "sampling/importance_sampling_ratio/mean": 0.9999434947967529, "sampling/importance_sampling_ratio/min": 0.7459121942520142, "sampling/sampling_logp_difference/max": 0.34761762619018555, "sampling/sampling_logp_difference/mean": 0.014538116753101349, "step": 1677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 378.984375, "completions/mean_terminated_length": 378.984375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.5792146921157837, "epoch": 2.0563725490196076, "frac_reward_zero_std": 1.0, "grad_norm": 0.020675968652927975, "kl": 0.03341690078377724, "learning_rate": 2.7443406687983264e-07, "loss": 0.0004, "num_tokens": 72155813.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4755656719207764, "sampling/importance_sampling_ratio/mean": 1.000087022781372, "sampling/importance_sampling_ratio/min": 0.6502669453620911, "sampling/sampling_logp_difference/max": 0.4303722381591797, "sampling/sampling_logp_difference/mean": 0.01637270301580429, "step": 1678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 332.390625, "completions/mean_terminated_length": 332.390625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.5259170532226562, "epoch": 2.0575980392156863, "frac_reward_zero_std": 0.5, "grad_norm": 0.7110923971246242, "kl": 0.02899279072880745, "learning_rate": 2.7379852793980416e-07, "loss": 0.0049, "num_tokens": 72194158.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2911875247955322, "sampling/importance_sampling_ratio/mean": 1.0003111362457275, "sampling/importance_sampling_ratio/min": 0.7791949510574341, "sampling/sampling_logp_difference/max": 0.25556230545043945, "sampling/sampling_logp_difference/mean": 0.01446933951228857, "step": 1679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 897.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 441.171875, "completions/mean_terminated_length": 441.171875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.6257523894309998, "epoch": 2.0588235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 0.8388880230286083, "kl": 0.034142691642045975, "learning_rate": 2.7316344817443363e-07, "loss": -0.0045, "num_tokens": 72241033.0, "reward": 0.09375, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.5541203022003174, "sampling/importance_sampling_ratio/mean": 0.9999149441719055, "sampling/importance_sampling_ratio/min": 0.6001227498054504, "sampling/sampling_logp_difference/max": 0.5106210708618164, "sampling/sampling_logp_difference/mean": 0.016130981966853142, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 748.0, "completions/max_terminated_length": 748.0, "completions/mean_length": 385.40625, "completions/mean_terminated_length": 385.40625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.4822421669960022, "epoch": 2.060049019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.015864590927104284, "kl": 0.04027178883552551, "learning_rate": 2.7252882887289287e-07, "loss": 0.0003, "num_tokens": 72281667.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4221110343933105, "sampling/importance_sampling_ratio/mean": 1.0005894899368286, "sampling/importance_sampling_ratio/min": 0.5629484057426453, "sampling/sampling_logp_difference/max": 0.5745673179626465, "sampling/sampling_logp_difference/mean": 0.01461762748658657, "step": 1681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 295.203125, "completions/mean_terminated_length": 295.203125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.4482291042804718, "epoch": 2.0612745098039214, "frac_reward_zero_std": 1.0, "grad_norm": 0.01888993732383726, "kl": 0.022644508630037308, "learning_rate": 2.718946713234185e-07, "loss": 0.0002, "num_tokens": 72315936.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5277830362319946, "sampling/importance_sampling_ratio/mean": 1.0000574588775635, "sampling/importance_sampling_ratio/min": 0.4925408661365509, "sampling/sampling_logp_difference/max": 0.7081778049468994, "sampling/sampling_logp_difference/mean": 0.014900408685207367, "step": 1682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 885.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 436.3125, "completions/mean_terminated_length": 436.3125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.5677525401115417, "epoch": 2.0625, "frac_reward_zero_std": 0.5, "grad_norm": 0.7587899876775522, "kl": 0.03726182132959366, "learning_rate": 2.712609768133106e-07, "loss": 0.0203, "num_tokens": 72367892.0, "reward": 0.1875, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.658127784729004, "sampling/importance_sampling_ratio/mean": 0.99971604347229, "sampling/importance_sampling_ratio/min": 0.42336800694465637, "sampling/sampling_logp_difference/max": 0.859513521194458, "sampling/sampling_logp_difference/mean": 0.01604394242167473, "step": 1683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 354.90625, "completions/mean_terminated_length": 354.90625, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.5542596578598022, "epoch": 2.063725490196078, "frac_reward_zero_std": 0.75, "grad_norm": 0.7149876966894074, "kl": 0.020380720496177673, "learning_rate": 2.7062774662892886e-07, "loss": -0.0341, "num_tokens": 72411950.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4508193731307983, "sampling/importance_sampling_ratio/mean": 0.9998863935470581, "sampling/importance_sampling_ratio/min": 0.6986730098724365, "sampling/sampling_logp_difference/max": 0.3721284866333008, "sampling/sampling_logp_difference/mean": 0.015311913564801216, "step": 1684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 379.265625, "completions/mean_terminated_length": 379.265625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.5659840703010559, "epoch": 2.064950980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.5847640055120148, "kl": 0.038056522607803345, "learning_rate": 2.6999498205569e-07, "loss": 0.0073, "num_tokens": 72454255.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.601252555847168, "sampling/importance_sampling_ratio/mean": 1.0001181364059448, "sampling/importance_sampling_ratio/min": 0.6203605532646179, "sampling/sampling_logp_difference/max": 0.47745442390441895, "sampling/sampling_logp_difference/mean": 0.017012223601341248, "step": 1685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 316.09375, "completions/mean_terminated_length": 316.09375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.5147889256477356, "epoch": 2.0661764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 0.8347699327210604, "kl": 0.04720322787761688, "learning_rate": 2.693626843780665e-07, "loss": 0.0069, "num_tokens": 72490725.0, "reward": 0.1875, "reward_std": 0.4787135720252991, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.2998892068862915, "sampling/importance_sampling_ratio/mean": 0.9997285604476929, "sampling/importance_sampling_ratio/min": 0.567869246006012, "sampling/sampling_logp_difference/max": 0.565864086151123, "sampling/sampling_logp_difference/mean": 0.01590607315301895, "step": 1686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 334.515625, "completions/mean_terminated_length": 334.515625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.4378436505794525, "epoch": 2.0674019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.01949987657553268, "kl": 0.022003548219799995, "learning_rate": 2.687308548795825e-07, "loss": 0.0002, "num_tokens": 72531718.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5836412906646729, "sampling/importance_sampling_ratio/mean": 1.000117301940918, "sampling/importance_sampling_ratio/min": 0.3751697540283203, "sampling/sampling_logp_difference/max": 0.9803767204284668, "sampling/sampling_logp_difference/mean": 0.014253325760364532, "step": 1687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1464.0, "completions/max_terminated_length": 1464.0, "completions/mean_length": 383.3125, "completions/mean_terminated_length": 383.3125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.6154356002807617, "epoch": 2.0686274509803924, "frac_reward_zero_std": 0.5, "grad_norm": 0.9647130128478375, "kl": 0.04197505861520767, "learning_rate": 2.6809949484281164e-07, "loss": -0.1633, "num_tokens": 72582650.0, "reward": 0.46875, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5471539497375488, "sampling/importance_sampling_ratio/mean": 1.0002763271331787, "sampling/importance_sampling_ratio/min": 0.45335066318511963, "sampling/sampling_logp_difference/max": 0.7910892963409424, "sampling/sampling_logp_difference/mean": 0.017210951074957848, "step": 1688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 322.53125, "completions/mean_terminated_length": 322.53125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.48597294092178345, "epoch": 2.0698529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.7374040573845746, "kl": 0.02847595512866974, "learning_rate": 2.674686055493748e-07, "loss": 0.0122, "num_tokens": 72621052.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.3842300176620483, "sampling/importance_sampling_ratio/mean": 0.9999864101409912, "sampling/importance_sampling_ratio/min": 0.6747081875801086, "sampling/sampling_logp_difference/max": 0.3934750556945801, "sampling/sampling_logp_difference/mean": 0.016529209911823273, "step": 1689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1264.0, "completions/max_terminated_length": 1264.0, "completions/mean_length": 332.265625, "completions/mean_terminated_length": 332.265625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.6093271970748901, "epoch": 2.071078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.7096488470999758, "kl": 0.028761014342308044, "learning_rate": 2.668381882799375e-07, "loss": 0.003, "num_tokens": 72660237.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.3497471809387207, "sampling/importance_sampling_ratio/mean": 1.000152587890625, "sampling/importance_sampling_ratio/min": 0.6459595561027527, "sampling/sampling_logp_difference/max": 0.43701839447021484, "sampling/sampling_logp_difference/mean": 0.01728716492652893, "step": 1690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 239.484375, "completions/mean_terminated_length": 239.484375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.4190979599952698, "epoch": 2.0723039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.0169746410370701, "kl": 0.0230766199529171, "learning_rate": 2.662082443142068e-07, "loss": 0.0002, "num_tokens": 72691836.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3705024719238281, "sampling/importance_sampling_ratio/mean": 0.9996927380561829, "sampling/importance_sampling_ratio/min": 0.6660661101341248, "sampling/sampling_logp_difference/max": 0.40636634826660156, "sampling/sampling_logp_difference/mean": 0.014543424360454082, "step": 1691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 864.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 413.953125, "completions/mean_terminated_length": 413.953125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.47108715772628784, "epoch": 2.073529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.6819709382366175, "kl": 0.022091001272201538, "learning_rate": 2.6557877493092883e-07, "loss": 0.0042, "num_tokens": 72735401.0, "reward": 0.78125, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.439373254776001, "sampling/importance_sampling_ratio/mean": 1.0002541542053223, "sampling/importance_sampling_ratio/min": 0.6996729969978333, "sampling/sampling_logp_difference/max": 0.36420774459838867, "sampling/sampling_logp_difference/mean": 0.0130135677754879, "step": 1692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 362.859375, "completions/mean_terminated_length": 362.859375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.4610968232154846, "epoch": 2.0747549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 0.809305216410646, "kl": 0.02067846804857254, "learning_rate": 2.6494978140788686e-07, "loss": 0.0637, "num_tokens": 72774928.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.3258241415023804, "sampling/importance_sampling_ratio/mean": 1.0000083446502686, "sampling/importance_sampling_ratio/min": 0.6483193635940552, "sampling/sampling_logp_difference/max": 0.43337178230285645, "sampling/sampling_logp_difference/mean": 0.014804702252149582, "step": 1693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 399.53125, "completions/mean_terminated_length": 399.53125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.5739798545837402, "epoch": 2.075980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.5892854341560712, "kl": 0.03698611631989479, "learning_rate": 2.643212650218976e-07, "loss": 0.0257, "num_tokens": 72819026.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5575488805770874, "sampling/importance_sampling_ratio/mean": 0.999732255935669, "sampling/importance_sampling_ratio/min": 0.6412438154220581, "sampling/sampling_logp_difference/max": 0.44434547424316406, "sampling/sampling_logp_difference/mean": 0.016486775130033493, "step": 1694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 806.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 318.171875, "completions/mean_terminated_length": 318.171875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.6267750263214111, "epoch": 2.077205882352941, "frac_reward_zero_std": 0.75, "grad_norm": 0.6690762062830121, "kl": 0.036213167011737823, "learning_rate": 2.6369322704881e-07, "loss": -0.004, "num_tokens": 72860861.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6269322633743286, "sampling/importance_sampling_ratio/mean": 1.0005102157592773, "sampling/importance_sampling_ratio/min": 0.5421531796455383, "sampling/sampling_logp_difference/max": 0.6122066974639893, "sampling/sampling_logp_difference/mean": 0.018567079678177834, "step": 1695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 415.375, "completions/mean_terminated_length": 415.375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "entropy": 0.4267962574958801, "epoch": 2.0784313725490198, "frac_reward_zero_std": 0.5, "grad_norm": 0.6515695938801614, "kl": 0.023348167538642883, "learning_rate": 2.6306566876350067e-07, "loss": 0.0569, "num_tokens": 72912949.0, "reward": 0.65625, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.6157500743865967, "sampling/importance_sampling_ratio/mean": 1.000084400177002, "sampling/importance_sampling_ratio/min": 0.6934130191802979, "sampling/sampling_logp_difference/max": 0.4797992706298828, "sampling/sampling_logp_difference/mean": 0.012274454347789288, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 270.6875, "completions/mean_terminated_length": 270.6875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.342054545879364, "epoch": 2.079656862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.029540921492356046, "kl": 0.029995687305927277, "learning_rate": 2.6243859143987367e-07, "loss": 0.0003, "num_tokens": 72943153.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4634524583816528, "sampling/importance_sampling_ratio/mean": 1.0002511739730835, "sampling/importance_sampling_ratio/min": 0.5517528653144836, "sampling/sampling_logp_difference/max": 0.5946550369262695, "sampling/sampling_logp_difference/mean": 0.012794015929102898, "step": 1697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 341.765625, "completions/mean_terminated_length": 341.765625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.5476503372192383, "epoch": 2.0808823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 0.8325004356448894, "kl": 0.0373598113656044, "learning_rate": 2.6181199635085616e-07, "loss": -0.0046, "num_tokens": 72979922.0, "reward": 0.75, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.531914472579956, "sampling/importance_sampling_ratio/mean": 1.0003451108932495, "sampling/importance_sampling_ratio/min": 0.6546944975852966, "sampling/sampling_logp_difference/max": 0.42651820182800293, "sampling/sampling_logp_difference/mean": 0.01695142686367035, "step": 1698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 382.53125, "completions/mean_terminated_length": 382.53125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "entropy": 0.4912796914577484, "epoch": 2.082107843137255, "frac_reward_zero_std": 1.0, "grad_norm": 0.013362583943009026, "kl": 0.01511281356215477, "learning_rate": 2.6118588476839607e-07, "loss": 0.0002, "num_tokens": 73022004.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.585763931274414, "sampling/importance_sampling_ratio/mean": 0.9998785257339478, "sampling/importance_sampling_ratio/min": 0.6967669725418091, "sampling/sampling_logp_difference/max": 0.46106624603271484, "sampling/sampling_logp_difference/mean": 0.014874340035021305, "step": 1699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 908.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 262.625, "completions/mean_terminated_length": 262.625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.4137740433216095, "epoch": 2.0833333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 0.7516429923144071, "kl": 0.026030300185084343, "learning_rate": 2.6056025796346094e-07, "loss": -0.0231, "num_tokens": 73055388.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.6007599830627441, "sampling/importance_sampling_ratio/mean": 0.9996942281723022, "sampling/importance_sampling_ratio/min": 0.6333224177360535, "sampling/sampling_logp_difference/max": 0.47047853469848633, "sampling/sampling_logp_difference/mean": 0.014633871614933014, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 303.203125, "completions/mean_terminated_length": 303.203125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.5178366303443909, "epoch": 2.0845588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 0.9433660276659344, "kl": 0.028237704187631607, "learning_rate": 2.599351172060329e-07, "loss": -0.0553, "num_tokens": 73092585.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5294018983840942, "sampling/importance_sampling_ratio/mean": 1.0001275539398193, "sampling/importance_sampling_ratio/min": 0.6506426334381104, "sampling/sampling_logp_difference/max": 0.4297947883605957, "sampling/sampling_logp_difference/mean": 0.016513455659151077, "step": 1701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 377.8125, "completions/mean_terminated_length": 377.8125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.5032227635383606, "epoch": 2.0857843137254903, "frac_reward_zero_std": 0.5, "grad_norm": 0.8701740242881489, "kl": 0.02922811359167099, "learning_rate": 2.593104637651087e-07, "loss": -0.0003, "num_tokens": 73136525.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.580574631690979, "sampling/importance_sampling_ratio/mean": 1.0003910064697266, "sampling/importance_sampling_ratio/min": 0.6742587089538574, "sampling/sampling_logp_difference/max": 0.45778846740722656, "sampling/sampling_logp_difference/mean": 0.014330850914120674, "step": 1702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1070.0, "completions/max_terminated_length": 1070.0, "completions/mean_length": 382.40625, "completions/mean_terminated_length": 382.40625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.5404931902885437, "epoch": 2.0870098039215685, "frac_reward_zero_std": 0.5, "grad_norm": 0.8557434999601077, "kl": 0.030667897313833237, "learning_rate": 2.5868629890869463e-07, "loss": -0.01, "num_tokens": 73178535.0, "reward": 0.78125, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.6042413711547852, "sampling/importance_sampling_ratio/mean": 0.9993143081665039, "sampling/importance_sampling_ratio/min": 0.6132339835166931, "sampling/sampling_logp_difference/max": 0.48900866508483887, "sampling/sampling_logp_difference/mean": 0.015601353719830513, "step": 1703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 403.140625, "completions/mean_terminated_length": 403.140625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.35893452167510986, "epoch": 2.088235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01428499788397899, "kl": 0.015791337937116623, "learning_rate": 2.580626239038061e-07, "loss": 0.0002, "num_tokens": 73221632.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5295768976211548, "sampling/importance_sampling_ratio/mean": 1.000108242034912, "sampling/importance_sampling_ratio/min": 0.6109800338745117, "sampling/sampling_logp_difference/max": 0.4926910400390625, "sampling/sampling_logp_difference/mean": 0.011777489446103573, "step": 1704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1419.0, "completions/max_terminated_length": 1419.0, "completions/mean_length": 419.21875, "completions/mean_terminated_length": 419.21875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.5036827921867371, "epoch": 2.0894607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 0.5724760840920763, "kl": 0.020546970888972282, "learning_rate": 2.5743944001646387e-07, "loss": 0.0184, "num_tokens": 73269198.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.597886085510254, "sampling/importance_sampling_ratio/mean": 0.9996945261955261, "sampling/importance_sampling_ratio/min": 0.5330086350440979, "sampling/sampling_logp_difference/max": 0.6292177438735962, "sampling/sampling_logp_difference/mean": 0.01442687027156353, "step": 1705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 275.109375, "completions/mean_terminated_length": 275.109375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.5141398906707764, "epoch": 2.090686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.7635825012403009, "kl": 0.02890859730541706, "learning_rate": 2.568167485116919e-07, "loss": -0.0034, "num_tokens": 73307397.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5050733089447021, "sampling/importance_sampling_ratio/mean": 0.9997601509094238, "sampling/importance_sampling_ratio/min": 0.6371925473213196, "sampling/sampling_logp_difference/max": 0.4506833553314209, "sampling/sampling_logp_difference/mean": 0.017416298389434814, "step": 1706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 340.3125, "completions/mean_terminated_length": 340.3125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.5022113919258118, "epoch": 2.0919117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.025230253621261998, "kl": 0.04195264354348183, "learning_rate": 2.5619455065351435e-07, "loss": 0.0004, "num_tokens": 73349321.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5277694463729858, "sampling/importance_sampling_ratio/mean": 0.9997950792312622, "sampling/importance_sampling_ratio/min": 0.6364211440086365, "sampling/sampling_logp_difference/max": 0.45189476013183594, "sampling/sampling_logp_difference/mean": 0.014753085561096668, "step": 1707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 233.453125, "completions/mean_terminated_length": 233.453125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.3645936846733093, "epoch": 2.093137254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.8533837982525848, "kl": 0.026026880368590355, "learning_rate": 2.555728477049532e-07, "loss": 0.0274, "num_tokens": 73379686.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5488251447677612, "sampling/importance_sampling_ratio/mean": 0.999061107635498, "sampling/importance_sampling_ratio/min": 0.6171556115150452, "sampling/sampling_logp_difference/max": 0.4826340675354004, "sampling/sampling_logp_difference/mean": 0.01225719228386879, "step": 1708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 309.578125, "completions/mean_terminated_length": 309.578125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.38191086053848267, "epoch": 2.094362745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.030338147059902173, "kl": 0.01935330219566822, "learning_rate": 2.5495164092802646e-07, "loss": 0.0002, "num_tokens": 73418907.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3210481405258179, "sampling/importance_sampling_ratio/mean": 1.0000977516174316, "sampling/importance_sampling_ratio/min": 0.43092864751815796, "sampling/sampling_logp_difference/max": 0.8418127298355103, "sampling/sampling_logp_difference/mean": 0.01252442505210638, "step": 1709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 855.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 366.546875, "completions/mean_terminated_length": 366.546875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.39363542199134827, "epoch": 2.0955882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.012506390471098378, "kl": 0.017166946083307266, "learning_rate": 2.5433093158374437e-07, "loss": 0.0002, "num_tokens": 73460014.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4352002143859863, "sampling/importance_sampling_ratio/mean": 0.9998884797096252, "sampling/importance_sampling_ratio/min": 0.6887385249137878, "sampling/sampling_logp_difference/max": 0.3728935718536377, "sampling/sampling_logp_difference/mean": 0.012156963348388672, "step": 1710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1359.0, "completions/max_terminated_length": 1359.0, "completions/mean_length": 511.71875, "completions/mean_terminated_length": 511.71875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.4896746873855591, "epoch": 2.096813725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.49823248713452667, "kl": 0.02335747331380844, "learning_rate": 2.537107209321074e-07, "loss": -0.0361, "num_tokens": 73512524.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.517001748085022, "sampling/importance_sampling_ratio/mean": 1.0001250505447388, "sampling/importance_sampling_ratio/min": 0.5417264103889465, "sampling/sampling_logp_difference/max": 0.6129941940307617, "sampling/sampling_logp_difference/mean": 0.01494036614894867, "step": 1711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 299.578125, "completions/mean_terminated_length": 299.578125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.49885010719299316, "epoch": 2.0980392156862746, "frac_reward_zero_std": 0.5, "grad_norm": 0.848895547571001, "kl": 0.043381642550230026, "learning_rate": 2.5309101023210424e-07, "loss": -0.01, "num_tokens": 73547393.0, "reward": 0.75, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.298102855682373, "sampling/importance_sampling_ratio/mean": 0.9998029470443726, "sampling/importance_sampling_ratio/min": 0.6955012679100037, "sampling/sampling_logp_difference/max": 0.36312246322631836, "sampling/sampling_logp_difference/mean": 0.015541560016572475, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 314.109375, "completions/mean_terminated_length": 314.109375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.48140913248062134, "epoch": 2.099264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.020279841889401688, "kl": 0.023213975131511688, "learning_rate": 2.524718007417081e-07, "loss": 0.0002, "num_tokens": 73584712.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4029725790023804, "sampling/importance_sampling_ratio/mean": 1.0004212856292725, "sampling/importance_sampling_ratio/min": 0.7068970799446106, "sampling/sampling_logp_difference/max": 0.34687018394470215, "sampling/sampling_logp_difference/mean": 0.016174428164958954, "step": 1713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 378.28125, "completions/mean_terminated_length": 378.28125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.4467616379261017, "epoch": 2.1004901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.5737537052713066, "kl": 0.029158279299736023, "learning_rate": 2.518530937178751e-07, "loss": -0.0197, "num_tokens": 73629962.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.5001684427261353, "sampling/importance_sampling_ratio/mean": 0.9995923638343811, "sampling/importance_sampling_ratio/min": 0.6690421104431152, "sampling/sampling_logp_difference/max": 0.4055774211883545, "sampling/sampling_logp_difference/mean": 0.012887176126241684, "step": 1714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 379.6875, "completions/mean_terminated_length": 379.6875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.538237452507019, "epoch": 2.1017156862745097, "frac_reward_zero_std": 0.75, "grad_norm": 0.5235712197001618, "kl": 0.03413379192352295, "learning_rate": 2.512348904165411e-07, "loss": 0.0089, "num_tokens": 73672278.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.4833472967147827, "sampling/importance_sampling_ratio/mean": 0.9999058842658997, "sampling/importance_sampling_ratio/min": 0.6904944181442261, "sampling/sampling_logp_difference/max": 0.394301176071167, "sampling/sampling_logp_difference/mean": 0.015694132074713707, "step": 1715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 293.984375, "completions/mean_terminated_length": 293.984375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3397350013256073, "epoch": 2.1029411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.014791335404807988, "kl": 0.016390640288591385, "learning_rate": 2.5061719209262e-07, "loss": 0.0002, "num_tokens": 73705909.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.472502589225769, "sampling/importance_sampling_ratio/mean": 0.9999521374702454, "sampling/importance_sampling_ratio/min": 0.6372247934341431, "sampling/sampling_logp_difference/max": 0.45063281059265137, "sampling/sampling_logp_difference/mean": 0.012621153146028519, "step": 1716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1161.0, "completions/max_terminated_length": 1161.0, "completions/mean_length": 305.359375, "completions/mean_terminated_length": 305.359375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.45973142981529236, "epoch": 2.1041666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.015164213611805668, "kl": 0.022756041958928108, "learning_rate": 2.500000000000001e-07, "loss": 0.0002, "num_tokens": 73745132.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5204071998596191, "sampling/importance_sampling_ratio/mean": 0.9998887181282043, "sampling/importance_sampling_ratio/min": 0.7018263936042786, "sampling/sampling_logp_difference/max": 0.418978214263916, "sampling/sampling_logp_difference/mean": 0.015427280217409134, "step": 1717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 289.1875, "completions/mean_terminated_length": 289.1875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.32827770709991455, "epoch": 2.105392156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.012171411262536496, "kl": 0.01710369437932968, "learning_rate": 2.49383315391542e-07, "loss": 0.0002, "num_tokens": 73777544.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000159502029419, "sampling/importance_sampling_ratio/min": 0.6430398225784302, "sampling/sampling_logp_difference/max": 1.1525812149047852, "sampling/sampling_logp_difference/mean": 0.01137426495552063, "step": 1718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 236.546875, "completions/mean_terminated_length": 236.546875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.4051102101802826, "epoch": 2.1066176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 0.7380323840103375, "kl": 0.03420143201947212, "learning_rate": 2.4876713951907685e-07, "loss": 0.0117, "num_tokens": 73808731.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.4852532148361206, "sampling/importance_sampling_ratio/mean": 0.9995799660682678, "sampling/importance_sampling_ratio/min": 0.6877328157424927, "sampling/sampling_logp_difference/max": 0.395585298538208, "sampling/sampling_logp_difference/mean": 0.015258078463375568, "step": 1719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 404.390625, "completions/mean_terminated_length": 404.390625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.41578054428100586, "epoch": 2.107843137254902, "frac_reward_zero_std": 0.5, "grad_norm": 0.6340064675997239, "kl": 0.04044253006577492, "learning_rate": 2.481514736334022e-07, "loss": 0.0065, "num_tokens": 73848932.0, "reward": 0.125, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.7537012100219727, "sampling/importance_sampling_ratio/mean": 0.9995579719543457, "sampling/importance_sampling_ratio/min": 0.6322280168533325, "sampling/sampling_logp_difference/max": 0.5617284774780273, "sampling/sampling_logp_difference/mean": 0.013629546388983727, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 337.75, "completions/mean_terminated_length": 337.75, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.5371871590614319, "epoch": 2.1090686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.01986727966051994, "kl": 0.02905861660838127, "learning_rate": 2.4753631898428134e-07, "loss": 0.0003, "num_tokens": 73891028.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.358864188194275, "sampling/importance_sampling_ratio/mean": 0.9993336796760559, "sampling/importance_sampling_ratio/min": 0.5910720825195312, "sampling/sampling_logp_difference/max": 0.5258172750473022, "sampling/sampling_logp_difference/mean": 0.016041235998272896, "step": 1721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 442.34375, "completions/mean_terminated_length": 442.34375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.5542048215866089, "epoch": 2.110294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.687692865428078, "kl": 0.025842182338237762, "learning_rate": 2.4692167682043853e-07, "loss": 0.0136, "num_tokens": 73949498.0, "reward": 0.375, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000511407852173, "sampling/importance_sampling_ratio/min": 0.6115259528160095, "sampling/sampling_logp_difference/max": 0.7502529621124268, "sampling/sampling_logp_difference/mean": 0.01586291566491127, "step": 1722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1505.0, "completions/max_terminated_length": 1505.0, "completions/mean_length": 363.53125, "completions/mean_terminated_length": 363.53125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.6430662870407104, "epoch": 2.111519607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.6639599930536603, "kl": 0.043321676552295685, "learning_rate": 2.4630754838955896e-07, "loss": -0.0255, "num_tokens": 73989532.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.502223253250122, "sampling/importance_sampling_ratio/mean": 0.9998401999473572, "sampling/importance_sampling_ratio/min": 0.36815980076789856, "sampling/sampling_logp_difference/max": 0.9992382526397705, "sampling/sampling_logp_difference/mean": 0.018640726804733276, "step": 1723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 393.96875, "completions/mean_terminated_length": 393.96875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.5507345795631409, "epoch": 2.1127450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.02533722274622793, "kl": 0.03626510873436928, "learning_rate": 2.456939349382843e-07, "loss": 0.0004, "num_tokens": 74033562.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5140984058380127, "sampling/importance_sampling_ratio/mean": 0.9996709823608398, "sampling/importance_sampling_ratio/min": 0.315917432308197, "sampling/sampling_logp_difference/max": 1.1522743701934814, "sampling/sampling_logp_difference/mean": 0.01513767521828413, "step": 1724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 281.328125, "completions/mean_terminated_length": 281.328125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.3600448668003082, "epoch": 2.113970588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.010791866560874722, "kl": 0.015356038697063923, "learning_rate": 2.450808377122107e-07, "loss": 0.0001, "num_tokens": 74069247.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5521602630615234, "sampling/importance_sampling_ratio/mean": 0.9998409748077393, "sampling/importance_sampling_ratio/min": 0.7050653100013733, "sampling/sampling_logp_difference/max": 0.4396476745605469, "sampling/sampling_logp_difference/mean": 0.012374557554721832, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 276.484375, "completions/mean_terminated_length": 276.484375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.38843047618865967, "epoch": 2.1151960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.01473840198489579, "kl": 0.018719788640737534, "learning_rate": 2.4446825795588716e-07, "loss": 0.0002, "num_tokens": 74106430.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6355847120285034, "sampling/importance_sampling_ratio/mean": 0.9998336434364319, "sampling/importance_sampling_ratio/min": 0.6557287573814392, "sampling/sampling_logp_difference/max": 0.4920003414154053, "sampling/sampling_logp_difference/mean": 0.012953289784491062, "step": 1726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 387.28125, "completions/mean_terminated_length": 387.28125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.4456513524055481, "epoch": 2.116421568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.627159640051825, "kl": 0.01942710019648075, "learning_rate": 2.438561969128114e-07, "loss": 0.0168, "num_tokens": 74151328.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.7779879570007324, "sampling/importance_sampling_ratio/mean": 0.999868631362915, "sampling/importance_sampling_ratio/min": 0.6247954368591309, "sampling/sampling_logp_difference/max": 0.5754823684692383, "sampling/sampling_logp_difference/mean": 0.013424822129309177, "step": 1727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 924.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 283.1875, "completions/mean_terminated_length": 283.1875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.5226067304611206, "epoch": 2.1176470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.1029256780619665, "kl": 0.03938576579093933, "learning_rate": 2.43244655825429e-07, "loss": 0.0988, "num_tokens": 74183628.0, "reward": 0.71875, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.4263712167739868, "sampling/importance_sampling_ratio/mean": 0.999748706817627, "sampling/importance_sampling_ratio/min": 0.6426404714584351, "sampling/sampling_logp_difference/max": 0.4421699047088623, "sampling/sampling_logp_difference/mean": 0.017249733209609985, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 261.03125, "completions/mean_terminated_length": 261.03125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3855496048927307, "epoch": 2.1188725490196076, "frac_reward_zero_std": 1.0, "grad_norm": 0.013305638871255014, "kl": 0.017637986689805984, "learning_rate": 2.4263363593512903e-07, "loss": 0.0002, "num_tokens": 74215070.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5610761642456055, "sampling/importance_sampling_ratio/mean": 0.9995185136795044, "sampling/importance_sampling_ratio/min": 0.6850067973136902, "sampling/sampling_logp_difference/max": 0.4453754425048828, "sampling/sampling_logp_difference/mean": 0.013497929088771343, "step": 1729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 259.90625, "completions/mean_terminated_length": 259.90625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.4830878973007202, "epoch": 2.1200980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.8547085465761257, "kl": 0.03354845196008682, "learning_rate": 2.4202313848224364e-07, "loss": 0.0113, "num_tokens": 74249064.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.549027442932129, "sampling/importance_sampling_ratio/mean": 1.0001047849655151, "sampling/importance_sampling_ratio/min": 0.6374812722206116, "sampling/sampling_logp_difference/max": 0.45023036003112793, "sampling/sampling_logp_difference/mean": 0.016318736597895622, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 790.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 445.921875, "completions/mean_terminated_length": 445.921875, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "entropy": 0.6630765795707703, "epoch": 2.1213235294117645, "frac_reward_zero_std": 0.75, "grad_norm": 0.4219706753466774, "kl": 0.04050531983375549, "learning_rate": 2.414131647060436e-07, "loss": -0.0052, "num_tokens": 74303283.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6516098976135254, "sampling/importance_sampling_ratio/mean": 1.0001780986785889, "sampling/importance_sampling_ratio/min": 0.7034770250320435, "sampling/sampling_logp_difference/max": 0.5017504692077637, "sampling/sampling_logp_difference/mean": 0.016256937757134438, "step": 1731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 297.8125, "completions/mean_terminated_length": 297.8125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.4143299162387848, "epoch": 2.122549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.013103007205601489, "kl": 0.01543065533041954, "learning_rate": 2.4080371584473745e-07, "loss": 0.0002, "num_tokens": 74338215.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997206926345825, "sampling/importance_sampling_ratio/min": 0.6324706077575684, "sampling/sampling_logp_difference/max": 0.9375678300857544, "sampling/sampling_logp_difference/mean": 0.014294767752289772, "step": 1732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 864.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 420.046875, "completions/mean_terminated_length": 420.046875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.3865177035331726, "epoch": 2.123774509803922, "frac_reward_zero_std": 0.25, "grad_norm": 0.7984280128034323, "kl": 0.018068712204694748, "learning_rate": 2.4019479313546757e-07, "loss": 0.0128, "num_tokens": 74389642.0, "reward": 0.84375, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.561230182647705, "sampling/importance_sampling_ratio/mean": 0.999937891960144, "sampling/importance_sampling_ratio/min": 0.6886354088783264, "sampling/sampling_logp_difference/max": 0.44547414779663086, "sampling/sampling_logp_difference/mean": 0.011668814346194267, "step": 1733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 821.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 349.375, "completions/mean_terminated_length": 349.375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.5955759286880493, "epoch": 2.125, "frac_reward_zero_std": 0.5, "grad_norm": 0.8738911797545725, "kl": 0.03658360242843628, "learning_rate": 2.395863978143083e-07, "loss": -0.0371, "num_tokens": 74437698.0, "reward": 0.0625, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.6460249423980713, "sampling/importance_sampling_ratio/mean": 1.0004098415374756, "sampling/importance_sampling_ratio/min": 0.6972631812095642, "sampling/sampling_logp_difference/max": 0.4983632564544678, "sampling/sampling_logp_difference/mean": 0.017471540719270706, "step": 1734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2271.0, "completions/max_terminated_length": 2271.0, "completions/mean_length": 395.9375, "completions/mean_terminated_length": 395.9375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.4484209418296814, "epoch": 2.126225490196078, "frac_reward_zero_std": 0.5, "grad_norm": 0.7876833603427977, "kl": 0.02177775651216507, "learning_rate": 2.3897853111626417e-07, "loss": -0.1432, "num_tokens": 74482942.0, "reward": 0.0625, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4401085376739502, "sampling/importance_sampling_ratio/mean": 0.999648928642273, "sampling/importance_sampling_ratio/min": 0.511132001876831, "sampling/sampling_logp_difference/max": 0.671127438545227, "sampling/sampling_logp_difference/mean": 0.015229678712785244, "step": 1735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 260.296875, "completions/mean_terminated_length": 260.296875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.46272674202919006, "epoch": 2.127450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.020153108445443286, "kl": 0.020883072167634964, "learning_rate": 2.383711942752652e-07, "loss": 0.0002, "num_tokens": 74518065.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7444247007369995, "sampling/importance_sampling_ratio/mean": 1.0003677606582642, "sampling/importance_sampling_ratio/min": 0.429462730884552, "sampling/sampling_logp_difference/max": 0.8452203273773193, "sampling/sampling_logp_difference/mean": 0.01599929854273796, "step": 1736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 358.9375, "completions/mean_terminated_length": 358.9375, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.5445575714111328, "epoch": 2.1286764705882355, "frac_reward_zero_std": 0.75, "grad_norm": 0.6550940606004491, "kl": 0.025325104594230652, "learning_rate": 2.377643885241674e-07, "loss": 0.018, "num_tokens": 74564445.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.8028918504714966, "sampling/importance_sampling_ratio/mean": 1.0002456903457642, "sampling/importance_sampling_ratio/min": 0.5826129913330078, "sampling/sampling_logp_difference/max": 0.5893919467926025, "sampling/sampling_logp_difference/mean": 0.016004450619220734, "step": 1737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1114.0, "completions/max_terminated_length": 1114.0, "completions/mean_length": 358.09375, "completions/mean_terminated_length": 358.09375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.5815480947494507, "epoch": 2.1299019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.023002830706381552, "kl": 0.03620787709951401, "learning_rate": 2.371581150947476e-07, "loss": 0.0003, "num_tokens": 74605811.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.534935474395752, "sampling/importance_sampling_ratio/mean": 0.9992749094963074, "sampling/importance_sampling_ratio/min": 0.5543031692504883, "sampling/sampling_logp_difference/max": 0.5900435447692871, "sampling/sampling_logp_difference/mean": 0.0164111889898777, "step": 1738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1487.0, "completions/max_terminated_length": 1487.0, "completions/mean_length": 454.546875, "completions/mean_terminated_length": 454.546875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.40670230984687805, "epoch": 2.1311274509803924, "frac_reward_zero_std": 0.5, "grad_norm": 0.6926186314886548, "kl": 0.022381991147994995, "learning_rate": 2.3655237521770282e-07, "loss": 0.0846, "num_tokens": 74654838.0, "reward": 0.78125, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.6160167455673218, "sampling/importance_sampling_ratio/mean": 0.9998741149902344, "sampling/importance_sampling_ratio/min": 0.6582848429679871, "sampling/sampling_logp_difference/max": 0.4799642562866211, "sampling/sampling_logp_difference/mean": 0.012490885332226753, "step": 1739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 354.078125, "completions/mean_terminated_length": 354.078125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.45894086360931396, "epoch": 2.1323529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.5805415948378898, "kl": 0.01929197460412979, "learning_rate": 2.3594717012264642e-07, "loss": 0.0036, "num_tokens": 74697099.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.485336422920227, "sampling/importance_sampling_ratio/mean": 0.999925434589386, "sampling/importance_sampling_ratio/min": 0.6691774725914001, "sampling/sampling_logp_difference/max": 0.4017059803009033, "sampling/sampling_logp_difference/mean": 0.013643586076796055, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 373.140625, "completions/mean_terminated_length": 373.140625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.5042092800140381, "epoch": 2.133578431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.7819221640682846, "kl": 0.02612592652440071, "learning_rate": 2.3534250103810627e-07, "loss": 0.0554, "num_tokens": 74739428.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.495924472808838, "sampling/importance_sampling_ratio/mean": 0.9999970197677612, "sampling/importance_sampling_ratio/min": 0.703187882900238, "sampling/sampling_logp_difference/max": 0.4027444124221802, "sampling/sampling_logp_difference/mean": 0.015232869423925877, "step": 1741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1046.0, "completions/max_terminated_length": 1046.0, "completions/mean_length": 443.0625, "completions/mean_terminated_length": 443.0625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.6704312562942505, "epoch": 2.1348039215686274, "frac_reward_zero_std": 0.25, "grad_norm": 0.919131098878526, "kl": 0.03282884508371353, "learning_rate": 2.3473836919152263e-07, "loss": -0.0098, "num_tokens": 74788264.0, "reward": 0.625, "reward_std": 0.6645200252532959, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.4573925733566284, "sampling/importance_sampling_ratio/mean": 0.9996484518051147, "sampling/importance_sampling_ratio/min": 0.6515196561813354, "sampling/sampling_logp_difference/max": 0.4284477233886719, "sampling/sampling_logp_difference/mean": 0.01776862144470215, "step": 1742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 380.203125, "completions/mean_terminated_length": 380.203125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.5715155601501465, "epoch": 2.136029411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.018924280757480302, "kl": 0.03498055040836334, "learning_rate": 2.3413477580924475e-07, "loss": 0.0004, "num_tokens": 74830341.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4727421998977661, "sampling/importance_sampling_ratio/mean": 0.9999808073043823, "sampling/importance_sampling_ratio/min": 0.5260628461837769, "sampling/sampling_logp_difference/max": 0.6423345804214478, "sampling/sampling_logp_difference/mean": 0.016608279198408127, "step": 1743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 344.609375, "completions/mean_terminated_length": 344.609375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.39271268248558044, "epoch": 2.1372549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.01387577033098735, "kl": 0.01762808859348297, "learning_rate": 2.3353172211652884e-07, "loss": 0.0002, "num_tokens": 74873660.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4718658924102783, "sampling/importance_sampling_ratio/mean": 1.0001920461654663, "sampling/importance_sampling_ratio/min": 0.7219640612602234, "sampling/sampling_logp_difference/max": 0.38653087615966797, "sampling/sampling_logp_difference/mean": 0.013374952599406242, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 333.296875, "completions/mean_terminated_length": 333.296875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.45699289441108704, "epoch": 2.138480392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.593184921469702, "kl": 0.02926437184214592, "learning_rate": 2.329292093375356e-07, "loss": -0.0187, "num_tokens": 74910847.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.4965347051620483, "sampling/importance_sampling_ratio/mean": 1.0006437301635742, "sampling/importance_sampling_ratio/min": 0.6664905548095703, "sampling/sampling_logp_difference/max": 0.4057292938232422, "sampling/sampling_logp_difference/mean": 0.014737940393388271, "step": 1745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 351.265625, "completions/mean_terminated_length": 351.265625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.5937134027481079, "epoch": 2.139705882352941, "frac_reward_zero_std": 0.75, "grad_norm": 0.5806822510236173, "kl": 0.04054301604628563, "learning_rate": 2.3232723869532816e-07, "loss": -0.0334, "num_tokens": 74952256.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4756840467453003, "sampling/importance_sampling_ratio/mean": 0.9996174573898315, "sampling/importance_sampling_ratio/min": 0.5492525696754456, "sampling/sampling_logp_difference/max": 0.5991969108581543, "sampling/sampling_logp_difference/mean": 0.017714403569698334, "step": 1746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 328.953125, "completions/mean_terminated_length": 328.953125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.5280562043190002, "epoch": 2.1409313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 0.7039083654181103, "kl": 0.056415855884552, "learning_rate": 2.3172581141186858e-07, "loss": -0.0012, "num_tokens": 74988029.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.6007344722747803, "sampling/importance_sampling_ratio/mean": 1.0011019706726074, "sampling/importance_sampling_ratio/min": 0.6988828778266907, "sampling/sampling_logp_difference/max": 0.4704625606536865, "sampling/sampling_logp_difference/mean": 0.015636516734957695, "step": 1747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 359.90625, "completions/mean_terminated_length": 359.90625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.5433694124221802, "epoch": 2.142156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.6979205941206791, "kl": 0.027081456035375595, "learning_rate": 2.3112492870801602e-07, "loss": -0.0206, "num_tokens": 75031575.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.8158022165298462, "sampling/importance_sampling_ratio/mean": 0.9997183084487915, "sampling/importance_sampling_ratio/min": 0.6503779888153076, "sampling/sampling_logp_difference/max": 0.5965273380279541, "sampling/sampling_logp_difference/mean": 0.016648240387439728, "step": 1748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 917.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 363.71875, "completions/mean_terminated_length": 363.71875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.4356986880302429, "epoch": 2.1433823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.01248789286433759, "kl": 0.017103979364037514, "learning_rate": 2.3052459180352458e-07, "loss": 0.0002, "num_tokens": 75074469.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6050015687942505, "sampling/importance_sampling_ratio/mean": 1.0000321865081787, "sampling/importance_sampling_ratio/min": 0.6474044919013977, "sampling/sampling_logp_difference/max": 0.47312474250793457, "sampling/sampling_logp_difference/mean": 0.014834500849246979, "step": 1749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 380.765625, "completions/mean_terminated_length": 380.765625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.4628799855709076, "epoch": 2.144607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 0.6438460865960441, "kl": 0.018534405156970024, "learning_rate": 2.2992480191704e-07, "loss": 0.023, "num_tokens": 75123446.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.389288306236267, "sampling/importance_sampling_ratio/mean": 1.0000110864639282, "sampling/importance_sampling_ratio/min": 0.45584267377853394, "sampling/sampling_logp_difference/max": 0.7856075763702393, "sampling/sampling_logp_difference/mean": 0.014296547509729862, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 316.453125, "completions/mean_terminated_length": 316.453125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.414615660905838, "epoch": 2.1458333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 0.6240218635965796, "kl": 0.03126148506999016, "learning_rate": 2.2932556026609777e-07, "loss": 0.012, "num_tokens": 75165443.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.3654415607452393, "sampling/importance_sampling_ratio/mean": 0.9992640018463135, "sampling/importance_sampling_ratio/min": 0.538354754447937, "sampling/sampling_logp_difference/max": 0.6192375421524048, "sampling/sampling_logp_difference/mean": 0.012964664027094841, "step": 1751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 390.59375, "completions/mean_terminated_length": 390.59375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.5276812314987183, "epoch": 2.1470588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.01481971575668713, "kl": 0.01874239183962345, "learning_rate": 2.2872686806712032e-07, "loss": 0.0002, "num_tokens": 75212761.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6923457384109497, "sampling/importance_sampling_ratio/mean": 0.9998956918716431, "sampling/importance_sampling_ratio/min": 0.6863887906074524, "sampling/sampling_logp_difference/max": 0.5261155366897583, "sampling/sampling_logp_difference/mean": 0.015958938747644424, "step": 1752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 345.65625, "completions/mean_terminated_length": 345.65625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.6365110278129578, "epoch": 2.1482843137254903, "frac_reward_zero_std": 1.0, "grad_norm": 0.022451918273140012, "kl": 0.035243064165115356, "learning_rate": 2.2812872653541498e-07, "loss": 0.0004, "num_tokens": 75259235.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6430164575576782, "sampling/importance_sampling_ratio/mean": 0.9998261332511902, "sampling/importance_sampling_ratio/min": 0.6622360348701477, "sampling/sampling_logp_difference/max": 0.4965338706970215, "sampling/sampling_logp_difference/mean": 0.019115151837468147, "step": 1753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 413.78125, "completions/mean_terminated_length": 413.78125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.570486307144165, "epoch": 2.1495098039215685, "frac_reward_zero_std": 0.5, "grad_norm": 0.782050915369964, "kl": 0.027090933173894882, "learning_rate": 2.2753113688517155e-07, "loss": -0.0224, "num_tokens": 75309157.0, "reward": -0.125, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.9895678758621216, "sampling/importance_sampling_ratio/mean": 0.9998296499252319, "sampling/importance_sampling_ratio/min": 0.4655434489250183, "sampling/sampling_logp_difference/max": 0.7645498514175415, "sampling/sampling_logp_difference/mean": 0.016041027382016182, "step": 1754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 436.9375, "completions/mean_terminated_length": 436.9375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.5041824579238892, "epoch": 2.150735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.6734999323471528, "kl": 0.02482408471405506, "learning_rate": 2.2693410032945853e-07, "loss": 0.0236, "num_tokens": 75359617.0, "reward": 0.28125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.4824254512786865, "sampling/importance_sampling_ratio/mean": 0.9999762177467346, "sampling/importance_sampling_ratio/min": 0.7022252678871155, "sampling/sampling_logp_difference/max": 0.3936796188354492, "sampling/sampling_logp_difference/mean": 0.014009371399879456, "step": 1755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 376.09375, "completions/mean_terminated_length": 376.09375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.4325650930404663, "epoch": 2.1519607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.013137353456541255, "kl": 0.017430711537599564, "learning_rate": 2.2633761808022272e-07, "loss": 0.0002, "num_tokens": 75403751.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5710235834121704, "sampling/importance_sampling_ratio/mean": 0.9999185800552368, "sampling/importance_sampling_ratio/min": 0.6955205798149109, "sampling/sampling_logp_difference/max": 0.45172739028930664, "sampling/sampling_logp_difference/mean": 0.013775065541267395, "step": 1756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 226.4375, "completions/mean_terminated_length": 226.4375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.48947209119796753, "epoch": 2.153186274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.02881081918863393, "kl": 0.04936813563108444, "learning_rate": 2.2574169134828526e-07, "loss": 0.0005, "num_tokens": 75433315.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4081372022628784, "sampling/importance_sampling_ratio/mean": 0.9999172687530518, "sampling/importance_sampling_ratio/min": 0.6515045762062073, "sampling/sampling_logp_difference/max": 0.4284708499908447, "sampling/sampling_logp_difference/mean": 0.01661210134625435, "step": 1757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 310.703125, "completions/mean_terminated_length": 310.703125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.6091895699501038, "epoch": 2.1544117647058822, "frac_reward_zero_std": 0.75, "grad_norm": 0.6355472157040862, "kl": 0.056815147399902344, "learning_rate": 2.2514632134333932e-07, "loss": 0.0205, "num_tokens": 75470880.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5681653022766113, "sampling/importance_sampling_ratio/mean": 0.9990297555923462, "sampling/importance_sampling_ratio/min": 0.5888137221336365, "sampling/sampling_logp_difference/max": 0.5296454429626465, "sampling/sampling_logp_difference/mean": 0.01838735118508339, "step": 1758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1130.0, "completions/max_terminated_length": 1130.0, "completions/mean_length": 426.890625, "completions/mean_terminated_length": 426.890625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.3501693606376648, "epoch": 2.155637254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.5540785059488716, "kl": 0.019245166331529617, "learning_rate": 2.2455150927394878e-07, "loss": 0.0152, "num_tokens": 75516537.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.4883500337600708, "sampling/importance_sampling_ratio/mean": 0.9998290538787842, "sampling/importance_sampling_ratio/min": 0.637241780757904, "sampling/sampling_logp_difference/max": 0.450606107711792, "sampling/sampling_logp_difference/mean": 0.010674959048628807, "step": 1759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 383.625, "completions/mean_terminated_length": 383.625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.5233919620513916, "epoch": 2.156862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.6541338153409176, "kl": 0.027801301330327988, "learning_rate": 2.2395725634754402e-07, "loss": -0.0051, "num_tokens": 75560801.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.9557626247406006, "sampling/importance_sampling_ratio/mean": 0.9998083710670471, "sampling/importance_sampling_ratio/min": 0.34201982617378235, "sampling/sampling_logp_difference/max": 1.07288658618927, "sampling/sampling_logp_difference/mean": 0.015930503606796265, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 311.3125, "completions/mean_terminated_length": 311.3125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.38150012493133545, "epoch": 2.1580882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.012974766596860117, "kl": 0.016668274998664856, "learning_rate": 2.2336356377042143e-07, "loss": 0.0002, "num_tokens": 75595749.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3947930335998535, "sampling/importance_sampling_ratio/mean": 0.9998011589050293, "sampling/importance_sampling_ratio/min": 0.6133143305778503, "sampling/sampling_logp_difference/max": 0.4888777732849121, "sampling/sampling_logp_difference/mean": 0.013696101494133472, "step": 1761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 258.34375, "completions/mean_terminated_length": 258.34375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.4524621069431305, "epoch": 2.159313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.988714046585919, "kl": 0.02388782612979412, "learning_rate": 2.2277043274773854e-07, "loss": 0.0587, "num_tokens": 75630651.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5995018482208252, "sampling/importance_sampling_ratio/mean": 1.0000114440917969, "sampling/importance_sampling_ratio/min": 0.7128355503082275, "sampling/sampling_logp_difference/max": 0.4696922302246094, "sampling/sampling_logp_difference/mean": 0.015624705702066422, "step": 1762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 374.859375, "completions/mean_terminated_length": 374.859375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.487304151058197, "epoch": 2.1605392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 0.592344434580537, "kl": 0.02347145415842533, "learning_rate": 2.221778644835144e-07, "loss": -0.0011, "num_tokens": 75670882.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.612146019935608, "sampling/importance_sampling_ratio/mean": 1.0007858276367188, "sampling/importance_sampling_ratio/min": 0.6991313099861145, "sampling/sampling_logp_difference/max": 0.4775662422180176, "sampling/sampling_logp_difference/mean": 0.015616096556186676, "step": 1763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 325.6875, "completions/mean_terminated_length": 325.6875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.43694692850112915, "epoch": 2.161764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.013324123645864085, "kl": 0.01890558749437332, "learning_rate": 2.215858601806246e-07, "loss": 0.0002, "num_tokens": 75707246.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.444117546081543, "sampling/importance_sampling_ratio/mean": 0.9996368885040283, "sampling/importance_sampling_ratio/min": 0.44408971071243286, "sampling/sampling_logp_difference/max": 0.8117287158966064, "sampling/sampling_logp_difference/mean": 0.014755330979824066, "step": 1764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 280.84375, "completions/mean_terminated_length": 280.84375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3582904636859894, "epoch": 2.1629901960784315, "frac_reward_zero_std": 1.0, "grad_norm": 0.013339707288681238, "kl": 0.014572355896234512, "learning_rate": 2.2099442104080075e-07, "loss": 0.0001, "num_tokens": 75739540.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4348258972167969, "sampling/importance_sampling_ratio/mean": 1.0002689361572266, "sampling/importance_sampling_ratio/min": 0.6999179124832153, "sampling/sampling_logp_difference/max": 0.36104345321655273, "sampling/sampling_logp_difference/mean": 0.012973255477845669, "step": 1765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1660.0, "completions/max_terminated_length": 1660.0, "completions/mean_length": 446.390625, "completions/mean_terminated_length": 446.390625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.5479265451431274, "epoch": 2.1642156862745097, "frac_reward_zero_std": 0.5, "grad_norm": 0.6721456104971545, "kl": 0.028092846274375916, "learning_rate": 2.2040354826462664e-07, "loss": 0.0442, "num_tokens": 75789437.0, "reward": 0.84375, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.9720683097839355, "sampling/importance_sampling_ratio/mean": 0.999979555606842, "sampling/importance_sampling_ratio/min": 0.6298375129699707, "sampling/sampling_logp_difference/max": 0.6790828704833984, "sampling/sampling_logp_difference/mean": 0.015599248930811882, "step": 1766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 237.0, "completions/mean_terminated_length": 237.0, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.3486824035644531, "epoch": 2.1654411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.026691360699336954, "kl": 0.019821444526314735, "learning_rate": 2.1981324305153642e-07, "loss": 0.0002, "num_tokens": 75819677.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4026508331298828, "sampling/importance_sampling_ratio/mean": 0.9995940327644348, "sampling/importance_sampling_ratio/min": 0.3368481695652008, "sampling/sampling_logp_difference/max": 1.0881229639053345, "sampling/sampling_logp_difference/mean": 0.013767353259027004, "step": 1767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 959.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 407.765625, "completions/mean_terminated_length": 407.765625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.6362612843513489, "epoch": 2.1666666666666665, "frac_reward_zero_std": 0.5, "grad_norm": 0.7642424001237907, "kl": 0.059950895607471466, "learning_rate": 2.192235065998126e-07, "loss": -0.0424, "num_tokens": 75864302.0, "reward": -0.15625, "reward_std": 0.375, "rewards/decision_reward_func/mean": -0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.9657596349716187, "sampling/importance_sampling_ratio/mean": 0.9997900128364563, "sampling/importance_sampling_ratio/min": 0.620373010635376, "sampling/sampling_logp_difference/max": 0.6758787631988525, "sampling/sampling_logp_difference/mean": 0.01927364617586136, "step": 1768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 377.15625, "completions/mean_terminated_length": 377.15625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.34446388483047485, "epoch": 2.167892156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.011374104369885727, "kl": 0.014298150315880775, "learning_rate": 2.1863434010658272e-07, "loss": 0.0001, "num_tokens": 75906568.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4754632711410522, "sampling/importance_sampling_ratio/mean": 1.0003790855407715, "sampling/importance_sampling_ratio/min": 0.3941488564014435, "sampling/sampling_logp_difference/max": 0.9310266971588135, "sampling/sampling_logp_difference/mean": 0.012215018272399902, "step": 1769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 354.390625, "completions/mean_terminated_length": 354.390625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.5766065716743469, "epoch": 2.1691176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 0.9742638710641486, "kl": 0.024866174906492233, "learning_rate": 2.1804574476781733e-07, "loss": -0.0119, "num_tokens": 75945377.0, "reward": 0.46875, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6208434104919434, "sampling/importance_sampling_ratio/mean": 0.999862551689148, "sampling/importance_sampling_ratio/min": 0.6151823997497559, "sampling/sampling_logp_difference/max": 0.4858365058898926, "sampling/sampling_logp_difference/mean": 0.018182164058089256, "step": 1770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 336.96875, "completions/mean_terminated_length": 336.96875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.34472033381462097, "epoch": 2.170343137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.011086484233133229, "kl": 0.016545370221138, "learning_rate": 2.1745772177832755e-07, "loss": 0.0002, "num_tokens": 75986399.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.431592583656311, "sampling/importance_sampling_ratio/mean": 0.9996466636657715, "sampling/importance_sampling_ratio/min": 0.6150025725364685, "sampling/sampling_logp_difference/max": 0.4861288070678711, "sampling/sampling_logp_difference/mean": 0.010565451346337795, "step": 1771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 367.203125, "completions/mean_terminated_length": 367.203125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.3424224257469177, "epoch": 2.1715686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.00938332569957705, "kl": 0.012218338437378407, "learning_rate": 2.1687027233176318e-07, "loss": 0.0001, "num_tokens": 76024332.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5995464324951172, "sampling/importance_sampling_ratio/mean": 1.0000746250152588, "sampling/importance_sampling_ratio/min": 0.6198518872261047, "sampling/sampling_logp_difference/max": 0.4782748222351074, "sampling/sampling_logp_difference/mean": 0.012330936267971992, "step": 1772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 943.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 458.84375, "completions/mean_terminated_length": 458.84375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.46120163798332214, "epoch": 2.172794117647059, "frac_reward_zero_std": 0.25, "grad_norm": 0.7878636578747619, "kl": 0.02984142303466797, "learning_rate": 2.1628339762060914e-07, "loss": -0.0046, "num_tokens": 76073474.0, "reward": 0.5625, "reward_std": 0.5915650129318237, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000483989715576, "sampling/importance_sampling_ratio/min": 0.6782348155975342, "sampling/sampling_logp_difference/max": 0.8764259815216064, "sampling/sampling_logp_difference/mean": 0.014205863699316978, "step": 1773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2272.0, "completions/max_terminated_length": 2272.0, "completions/mean_length": 448.375, "completions/mean_terminated_length": 448.375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.4896865785121918, "epoch": 2.174019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 0.8150722507728153, "kl": 0.02151583321392536, "learning_rate": 2.1569709883618382e-07, "loss": 0.2222, "num_tokens": 76125322.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.4031267166137695, "sampling/importance_sampling_ratio/mean": 0.9996109008789062, "sampling/importance_sampling_ratio/min": 0.3494633436203003, "sampling/sampling_logp_difference/max": 1.051356554031372, "sampling/sampling_logp_difference/mean": 0.01413875725120306, "step": 1774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 428.625, "completions/mean_terminated_length": 428.625, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.5062355399131775, "epoch": 2.1752450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.01443873164628169, "kl": 0.016429398208856583, "learning_rate": 2.1511137716863687e-07, "loss": 0.0002, "num_tokens": 76174626.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.549997329711914, "sampling/importance_sampling_ratio/mean": 1.0004839897155762, "sampling/importance_sampling_ratio/min": 0.6348854899406433, "sampling/sampling_logp_difference/max": 0.45431065559387207, "sampling/sampling_logp_difference/mean": 0.015576144680380821, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 307.546875, "completions/mean_terminated_length": 307.546875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.5730075836181641, "epoch": 2.176470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.7890440961143326, "kl": 0.049911100417375565, "learning_rate": 2.1452623380694602e-07, "loss": 0.0138, "num_tokens": 76211653.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999625563621521, "sampling/importance_sampling_ratio/min": 0.648290753364563, "sampling/sampling_logp_difference/max": 0.8089714050292969, "sampling/sampling_logp_difference/mean": 0.01694975048303604, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 397.984375, "completions/mean_terminated_length": 397.984375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.5312198400497437, "epoch": 2.1776960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 0.6399287289630681, "kl": 0.01935184746980667, "learning_rate": 2.1394166993891526e-07, "loss": -0.0059, "num_tokens": 76259716.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.5400789976119995, "sampling/importance_sampling_ratio/mean": 1.0000766515731812, "sampling/importance_sampling_ratio/min": 0.6801638603210449, "sampling/sampling_logp_difference/max": 0.43183374404907227, "sampling/sampling_logp_difference/mean": 0.01587652787566185, "step": 1777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1129.0, "completions/max_terminated_length": 1129.0, "completions/mean_length": 474.609375, "completions/mean_terminated_length": 474.609375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.43323805928230286, "epoch": 2.178921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.551203635747799, "kl": 0.024525746703147888, "learning_rate": 2.1335768675117205e-07, "loss": -0.0347, "num_tokens": 76309723.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.533180832862854, "sampling/importance_sampling_ratio/mean": 1.000063180923462, "sampling/importance_sampling_ratio/min": 0.6262595653533936, "sampling/sampling_logp_difference/max": 0.4679903984069824, "sampling/sampling_logp_difference/mean": 0.011732966639101505, "step": 1778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1287.0, "completions/max_terminated_length": 1287.0, "completions/mean_length": 410.984375, "completions/mean_terminated_length": 410.984375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.5070970058441162, "epoch": 2.1801470588235294, "frac_reward_zero_std": 0.25, "grad_norm": 0.9966906357669135, "kl": 0.025504346936941147, "learning_rate": 2.1277428542916555e-07, "loss": 0.0019, "num_tokens": 76355370.0, "reward": 0.4375, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5322457551956177, "sampling/importance_sampling_ratio/mean": 1.0002498626708984, "sampling/importance_sampling_ratio/min": 0.5678319931030273, "sampling/sampling_logp_difference/max": 0.565929651260376, "sampling/sampling_logp_difference/mean": 0.01661764830350876, "step": 1779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 954.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 329.46875, "completions/mean_terminated_length": 329.46875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.4374690651893616, "epoch": 2.1813725490196076, "frac_reward_zero_std": 0.5, "grad_norm": 1.0445303937014276, "kl": 0.021757710725069046, "learning_rate": 2.121914671571633e-07, "loss": -0.015, "num_tokens": 76390728.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.609833836555481, "sampling/importance_sampling_ratio/mean": 0.9997439384460449, "sampling/importance_sampling_ratio/min": 0.679651141166687, "sampling/sampling_logp_difference/max": 0.47613096237182617, "sampling/sampling_logp_difference/mean": 0.01329371239989996, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 959.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 379.875, "completions/mean_terminated_length": 379.875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.44099897146224976, "epoch": 2.1825980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.01320538499858289, "kl": 0.023931413888931274, "learning_rate": 2.1160923311824934e-07, "loss": 0.0002, "num_tokens": 76433280.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5681368112564087, "sampling/importance_sampling_ratio/mean": 1.0000789165496826, "sampling/importance_sampling_ratio/min": 0.6254727840423584, "sampling/sampling_logp_difference/max": 0.4692475497722626, "sampling/sampling_logp_difference/mean": 0.013851837255060673, "step": 1781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 390.921875, "completions/mean_terminated_length": 390.921875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.43824511766433716, "epoch": 2.1838235294117645, "frac_reward_zero_std": 0.75, "grad_norm": 0.48819200491383924, "kl": 0.030220435932278633, "learning_rate": 2.110275844943223e-07, "loss": -0.004, "num_tokens": 76473483.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.632972002029419, "sampling/importance_sampling_ratio/mean": 1.0000301599502563, "sampling/importance_sampling_ratio/min": 0.4920208156108856, "sampling/sampling_logp_difference/max": 0.7092342376708984, "sampling/sampling_logp_difference/mean": 0.014807917177677155, "step": 1782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 239.375, "completions/mean_terminated_length": 239.375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.36483100056648254, "epoch": 2.185049019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.01573119086451856, "kl": 0.019060898572206497, "learning_rate": 2.1044652246609173e-07, "loss": 0.0002, "num_tokens": 76501315.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9744044542312622, "sampling/importance_sampling_ratio/mean": 1.0003244876861572, "sampling/importance_sampling_ratio/min": 0.6082813739776611, "sampling/sampling_logp_difference/max": 0.6802668571472168, "sampling/sampling_logp_difference/mean": 0.014241357333958149, "step": 1783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 940.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 333.390625, "completions/mean_terminated_length": 333.390625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.4749557673931122, "epoch": 2.186274509803922, "frac_reward_zero_std": 0.5, "grad_norm": 0.8559693053949111, "kl": 0.037697769701480865, "learning_rate": 2.098660482130768e-07, "loss": 0.0118, "num_tokens": 76535820.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5575480461120605, "sampling/importance_sampling_ratio/mean": 0.9998631477355957, "sampling/importance_sampling_ratio/min": 0.6660311818122864, "sampling/sampling_logp_difference/max": 0.443112850189209, "sampling/sampling_logp_difference/mean": 0.016541382297873497, "step": 1784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 303.890625, "completions/mean_terminated_length": 303.890625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.5034958124160767, "epoch": 2.1875, "frac_reward_zero_std": 0.75, "grad_norm": 0.6344787086580151, "kl": 0.027975041419267654, "learning_rate": 2.092861629136033e-07, "loss": -0.0095, "num_tokens": 76571781.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.451397180557251, "sampling/importance_sampling_ratio/mean": 1.0001013278961182, "sampling/importance_sampling_ratio/min": 0.6830977201461792, "sampling/sampling_logp_difference/max": 0.3811173439025879, "sampling/sampling_logp_difference/mean": 0.01643506810069084, "step": 1785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1385.0, "completions/max_terminated_length": 1385.0, "completions/mean_length": 395.875, "completions/mean_terminated_length": 395.875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.5687342882156372, "epoch": 2.188725490196078, "frac_reward_zero_std": 0.5, "grad_norm": 0.7951923772087894, "kl": 0.03608422353863716, "learning_rate": 2.0870686774480196e-07, "loss": 0.0099, "num_tokens": 76614669.0, "reward": 0.5, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5079455375671387, "sampling/importance_sampling_ratio/mean": 0.9999356269836426, "sampling/importance_sampling_ratio/min": 0.6452227830886841, "sampling/sampling_logp_difference/max": 0.43815961480140686, "sampling/sampling_logp_difference/mean": 0.017551179975271225, "step": 1786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 426.421875, "completions/mean_terminated_length": 426.421875, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "entropy": 0.41938868165016174, "epoch": 2.189950980392157, "frac_reward_zero_std": 0.5, "grad_norm": 0.732638733147525, "kl": 0.01780617982149124, "learning_rate": 2.0812816388260519e-07, "loss": 0.0008, "num_tokens": 76662792.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.771519422531128, "sampling/importance_sampling_ratio/mean": 1.0000965595245361, "sampling/importance_sampling_ratio/min": 0.7476180791854858, "sampling/sampling_logp_difference/max": 0.5718376636505127, "sampling/sampling_logp_difference/mean": 0.012383291497826576, "step": 1787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 418.640625, "completions/mean_terminated_length": 418.640625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.4688049554824829, "epoch": 2.1911764705882355, "frac_reward_zero_std": 0.75, "grad_norm": 0.586172153833077, "kl": 0.01958777755498886, "learning_rate": 2.0755005250174484e-07, "loss": 0.0303, "num_tokens": 76708049.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.462200403213501, "sampling/importance_sampling_ratio/mean": 1.0001919269561768, "sampling/importance_sampling_ratio/min": 0.5369683504104614, "sampling/sampling_logp_difference/max": 0.6218161582946777, "sampling/sampling_logp_difference/mean": 0.014721857383847237, "step": 1788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 309.75, "completions/mean_terminated_length": 309.75, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.4835771918296814, "epoch": 2.1924019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 0.7988149745843116, "kl": 0.035089440643787384, "learning_rate": 2.0697253477575088e-07, "loss": 0.0019, "num_tokens": 76740385.0, "reward": -0.28125, "reward_std": 0.375, "rewards/decision_reward_func/mean": -0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.5777113437652588, "sampling/importance_sampling_ratio/mean": 0.9998034834861755, "sampling/importance_sampling_ratio/min": 0.47002559900283813, "sampling/sampling_logp_difference/max": 0.7549681663513184, "sampling/sampling_logp_difference/mean": 0.01651717536151409, "step": 1789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1329.0, "completions/max_terminated_length": 1329.0, "completions/mean_length": 517.0625, "completions/mean_terminated_length": 517.0625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.47341665625572205, "epoch": 2.1936274509803924, "frac_reward_zero_std": 0.75, "grad_norm": 0.42867629086204984, "kl": 0.018566863611340523, "learning_rate": 2.0639561187694733e-07, "loss": -0.0276, "num_tokens": 76789221.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.4751449823379517, "sampling/importance_sampling_ratio/mean": 0.9999544620513916, "sampling/importance_sampling_ratio/min": 0.6985273957252502, "sampling/sampling_logp_difference/max": 0.38875627517700195, "sampling/sampling_logp_difference/mean": 0.012809643521904945, "step": 1790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 872.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 393.75, "completions/mean_terminated_length": 393.75, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.5406627655029297, "epoch": 2.1948529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.8326234116858651, "kl": 0.04092046618461609, "learning_rate": 2.0581928497645164e-07, "loss": -0.0368, "num_tokens": 76832645.0, "reward": 0.625, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.5222831964492798, "sampling/importance_sampling_ratio/mean": 0.999886155128479, "sampling/importance_sampling_ratio/min": 0.42315590381622314, "sampling/sampling_logp_difference/max": 0.8600146770477295, "sampling/sampling_logp_difference/mean": 0.01532385591417551, "step": 1791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 309.5, "completions/mean_terminated_length": 309.5, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.4911215901374817, "epoch": 2.196078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.7241527404176268, "kl": 0.045423537492752075, "learning_rate": 2.0524355524417015e-07, "loss": 0.0249, "num_tokens": 76870341.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.401857614517212, "sampling/importance_sampling_ratio/mean": 1.0000666379928589, "sampling/importance_sampling_ratio/min": 0.3731275200843811, "sampling/sampling_logp_difference/max": 0.985835075378418, "sampling/sampling_logp_difference/mean": 0.015887223184108734, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1665.0, "completions/max_terminated_length": 1665.0, "completions/mean_length": 437.6875, "completions/mean_terminated_length": 437.6875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.4720090627670288, "epoch": 2.1973039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.011029965666226863, "kl": 0.019094645977020264, "learning_rate": 2.0466842384879829e-07, "loss": 0.0002, "num_tokens": 76915361.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4198359251022339, "sampling/importance_sampling_ratio/mean": 0.9999592304229736, "sampling/importance_sampling_ratio/min": 0.6622446775436401, "sampling/sampling_logp_difference/max": 0.4121202230453491, "sampling/sampling_logp_difference/mean": 0.013551521115005016, "step": 1793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 420.46875, "completions/mean_terminated_length": 420.46875, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.5159863829612732, "epoch": 2.198529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.7228521360330035, "kl": 0.030291909351944923, "learning_rate": 2.0409389195781623e-07, "loss": 0.0074, "num_tokens": 76960415.0, "reward": -0.5, "reward_std": 0.4787135720252991, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999875545501709, "sampling/importance_sampling_ratio/min": 0.7162312865257263, "sampling/sampling_logp_difference/max": 0.8952300548553467, "sampling/sampling_logp_difference/mean": 0.013707790523767471, "step": 1794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 386.15625, "completions/mean_terminated_length": 386.15625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.4558834731578827, "epoch": 2.1997549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 0.4975253107139135, "kl": 0.02137260138988495, "learning_rate": 2.0351996073748713e-07, "loss": -0.006, "num_tokens": 77006313.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.494931697845459, "sampling/importance_sampling_ratio/mean": 0.9999817609786987, "sampling/importance_sampling_ratio/min": 0.5934169292449951, "sampling/sampling_logp_difference/max": 0.5218580961227417, "sampling/sampling_logp_difference/mean": 0.014942828565835953, "step": 1795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1101.0, "completions/max_terminated_length": 1101.0, "completions/mean_length": 514.453125, "completions/mean_terminated_length": 514.453125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.36901798844337463, "epoch": 2.200980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.41719171463282134, "kl": 0.015447447076439857, "learning_rate": 2.0294663135285533e-07, "loss": -0.0226, "num_tokens": 77058422.0, "reward": -0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6464955806732178, "sampling/importance_sampling_ratio/mean": 1.000065803527832, "sampling/importance_sampling_ratio/min": 0.33762669563293457, "sampling/sampling_logp_difference/max": 1.0858144760131836, "sampling/sampling_logp_difference/mean": 0.01147955097258091, "step": 1796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 289.984375, "completions/mean_terminated_length": 289.984375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.4735583961009979, "epoch": 2.202205882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.022788655613174186, "kl": 0.04544063284993172, "learning_rate": 2.0237390496774282e-07, "loss": 0.0004, "num_tokens": 77091413.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5980902910232544, "sampling/importance_sampling_ratio/mean": 1.0002607107162476, "sampling/importance_sampling_ratio/min": 0.7062193751335144, "sampling/sampling_logp_difference/max": 0.4688093662261963, "sampling/sampling_logp_difference/mean": 0.016025643795728683, "step": 1797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1471.0, "completions/max_terminated_length": 1471.0, "completions/mean_length": 421.71875, "completions/mean_terminated_length": 421.71875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.4297964572906494, "epoch": 2.2034313725490198, "frac_reward_zero_std": 0.5, "grad_norm": 0.860817466810629, "kl": 0.020351514220237732, "learning_rate": 2.0180178274474834e-07, "loss": -0.1723, "num_tokens": 77139683.0, "reward": 0.25, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.4709787368774414, "sampling/importance_sampling_ratio/mean": 1.0000901222229004, "sampling/importance_sampling_ratio/min": 0.6008861660957336, "sampling/sampling_logp_difference/max": 0.5093498229980469, "sampling/sampling_logp_difference/mean": 0.01320959534496069, "step": 1798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 365.875, "completions/mean_terminated_length": 365.875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.4839719533920288, "epoch": 2.204656862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.545923742828105, "kl": 0.021880680695176125, "learning_rate": 2.012302658452432e-07, "loss": 0.0162, "num_tokens": 77178507.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.8895013332366943, "sampling/importance_sampling_ratio/mean": 0.9999265670776367, "sampling/importance_sampling_ratio/min": 0.6132227778434753, "sampling/sampling_logp_difference/max": 0.6363129615783691, "sampling/sampling_logp_difference/mean": 0.014623960480093956, "step": 1799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 325.140625, "completions/mean_terminated_length": 325.140625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.5012704133987427, "epoch": 2.2058823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.01560284043358411, "kl": 0.025328850373625755, "learning_rate": 2.0065935542937073e-07, "loss": 0.0002, "num_tokens": 77216244.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.593296766281128, "sampling/importance_sampling_ratio/mean": 1.0000742673873901, "sampling/importance_sampling_ratio/min": 0.6411710977554321, "sampling/sampling_logp_difference/max": 0.4658052921295166, "sampling/sampling_logp_difference/mean": 0.01652994006872177, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 470.640625, "completions/mean_terminated_length": 470.640625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.3929814398288727, "epoch": 2.207107843137255, "frac_reward_zero_std": 0.75, "grad_norm": 0.44108099505491044, "kl": 0.01509043201804161, "learning_rate": 2.0008905265604315e-07, "loss": 0.0437, "num_tokens": 77265597.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5071613788604736, "sampling/importance_sampling_ratio/mean": 1.0000154972076416, "sampling/importance_sampling_ratio/min": 0.6513578295707703, "sampling/sampling_logp_difference/max": 0.4286961555480957, "sampling/sampling_logp_difference/mean": 0.011911094188690186, "step": 1801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 306.59375, "completions/mean_terminated_length": 306.59375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.4309977889060974, "epoch": 2.2083333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 0.8914553936128676, "kl": 0.019010132178664207, "learning_rate": 1.995193586829387e-07, "loss": 0.0276, "num_tokens": 77299155.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6355873346328735, "sampling/importance_sampling_ratio/mean": 1.0001317262649536, "sampling/importance_sampling_ratio/min": 0.6110513806343079, "sampling/sampling_logp_difference/max": 0.49257421493530273, "sampling/sampling_logp_difference/mean": 0.014738054946064949, "step": 1802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 341.796875, "completions/mean_terminated_length": 341.796875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.5130549669265747, "epoch": 2.2095588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.9477515349700422, "kl": 0.024997591972351074, "learning_rate": 1.989502746665001e-07, "loss": -0.0103, "num_tokens": 77335974.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5277607440948486, "sampling/importance_sampling_ratio/mean": 1.0004420280456543, "sampling/importance_sampling_ratio/min": 0.6819170117378235, "sampling/sampling_logp_difference/max": 0.42380309104919434, "sampling/sampling_logp_difference/mean": 0.014988093636929989, "step": 1803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 950.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 339.03125, "completions/mean_terminated_length": 339.03125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.3996391296386719, "epoch": 2.2107843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 0.5147803317779194, "kl": 0.01696379855275154, "learning_rate": 1.9838180176193176e-07, "loss": 0.0044, "num_tokens": 77385192.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6012650728225708, "sampling/importance_sampling_ratio/mean": 0.9997930526733398, "sampling/importance_sampling_ratio/min": 0.48236948251724243, "sampling/sampling_logp_difference/max": 0.7290449142456055, "sampling/sampling_logp_difference/mean": 0.013464299961924553, "step": 1804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1639.0, "completions/max_terminated_length": 1639.0, "completions/mean_length": 369.28125, "completions/mean_terminated_length": 369.28125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.4800667464733124, "epoch": 2.2120098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 0.6707894504313506, "kl": 0.02475295588374138, "learning_rate": 1.9781394112319787e-07, "loss": -0.0121, "num_tokens": 77423226.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3323075771331787, "sampling/importance_sampling_ratio/mean": 1.0001165866851807, "sampling/importance_sampling_ratio/min": 0.659538209438324, "sampling/sampling_logp_difference/max": 0.4162154197692871, "sampling/sampling_logp_difference/mean": 0.01550539955496788, "step": 1805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1890.0, "completions/max_terminated_length": 1890.0, "completions/mean_length": 575.015625, "completions/mean_terminated_length": 575.015625, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "entropy": 0.6628866195678711, "epoch": 2.213235294117647, "frac_reward_zero_std": 0.25, "grad_norm": 0.7404301609221825, "kl": 0.031223822385072708, "learning_rate": 1.9724669390301946e-07, "loss": 0.0366, "num_tokens": 77482523.0, "reward": 0.53125, "reward_std": 0.7129635810852051, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4470759630203247, "sampling/importance_sampling_ratio/mean": 0.9995838403701782, "sampling/importance_sampling_ratio/min": 0.6802341938018799, "sampling/sampling_logp_difference/max": 0.38531816005706787, "sampling/sampling_logp_difference/mean": 0.016113482415676117, "step": 1806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 374.875, "completions/mean_terminated_length": 374.875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.43825531005859375, "epoch": 2.2144607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 0.5094910059758306, "kl": 0.03277820721268654, "learning_rate": 1.9668006125287228e-07, "loss": 0.0015, "num_tokens": 77521539.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.6273024082183838, "sampling/importance_sampling_ratio/mean": 1.0002131462097168, "sampling/importance_sampling_ratio/min": 0.40282776951789856, "sampling/sampling_logp_difference/max": 0.9092462062835693, "sampling/sampling_logp_difference/mean": 0.014419611543416977, "step": 1807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 455.0625, "completions/mean_terminated_length": 455.0625, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "entropy": 0.42090708017349243, "epoch": 2.215686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.5772465557669479, "kl": 0.016718709841370583, "learning_rate": 1.96114044322985e-07, "loss": 0.0236, "num_tokens": 77565431.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6275135278701782, "sampling/importance_sampling_ratio/mean": 0.9999971389770508, "sampling/importance_sampling_ratio/min": 0.6485288739204407, "sampling/sampling_logp_difference/max": 0.48705339431762695, "sampling/sampling_logp_difference/mean": 0.012571683153510094, "step": 1808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 324.84375, "completions/mean_terminated_length": 324.84375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.4453199505805969, "epoch": 2.2169117647058822, "frac_reward_zero_std": 0.75, "grad_norm": 0.7444904109892722, "kl": 0.019820133224129677, "learning_rate": 1.9554864426233604e-07, "loss": 0.0075, "num_tokens": 77599757.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.9876788854599, "sampling/importance_sampling_ratio/mean": 1.000046968460083, "sampling/importance_sampling_ratio/min": 0.46248549222946167, "sampling/sampling_logp_difference/max": 0.7711400985717773, "sampling/sampling_logp_difference/mean": 0.014711258001625538, "step": 1809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 244.859375, "completions/mean_terminated_length": 244.859375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.5091066360473633, "epoch": 2.218137254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.029205609368414775, "kl": 0.05558595433831215, "learning_rate": 1.9498386221865165e-07, "loss": 0.0005, "num_tokens": 77628852.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5183864831924438, "sampling/importance_sampling_ratio/mean": 0.9997705817222595, "sampling/importance_sampling_ratio/min": 0.6135804653167725, "sampling/sampling_logp_difference/max": 0.48844385147094727, "sampling/sampling_logp_difference/mean": 0.017911771312355995, "step": 1810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 366.875, "completions/mean_terminated_length": 366.875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.4989481270313263, "epoch": 2.219362745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.9117962166804473, "kl": 0.037387870252132416, "learning_rate": 1.944196993384034e-07, "loss": -0.046, "num_tokens": 77675612.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000181198120117, "sampling/importance_sampling_ratio/min": 0.5284532904624939, "sampling/sampling_logp_difference/max": 0.9685583114624023, "sampling/sampling_logp_difference/mean": 0.015777675434947014, "step": 1811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1457.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 377.5, "completions/mean_terminated_length": 377.5, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.5131951570510864, "epoch": 2.2205882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 0.5873281466855611, "kl": 0.024611230939626694, "learning_rate": 1.9385615676680661e-07, "loss": -0.119, "num_tokens": 77716412.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.4778432846069336, "sampling/importance_sampling_ratio/mean": 0.9997417330741882, "sampling/importance_sampling_ratio/min": 0.6147649884223938, "sampling/sampling_logp_difference/max": 0.4865152835845947, "sampling/sampling_logp_difference/mean": 0.016773562878370285, "step": 1812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 346.234375, "completions/mean_terminated_length": 346.234375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.5243304371833801, "epoch": 2.221813725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.5644109802913048, "kl": 0.03661858290433884, "learning_rate": 1.932932356478168e-07, "loss": 0.0148, "num_tokens": 77754459.0, "reward": -0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": -0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.5283530950546265, "sampling/importance_sampling_ratio/mean": 0.9999675154685974, "sampling/importance_sampling_ratio/min": 0.5615813136100769, "sampling/sampling_logp_difference/max": 0.5769987106323242, "sampling/sampling_logp_difference/mean": 0.015164226293563843, "step": 1813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1132.0, "completions/max_terminated_length": 1132.0, "completions/mean_length": 344.859375, "completions/mean_terminated_length": 344.859375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.5121687054634094, "epoch": 2.2230392156862746, "frac_reward_zero_std": 0.5, "grad_norm": 0.8634048231645187, "kl": 0.02519151195883751, "learning_rate": 1.9273093712412796e-07, "loss": -0.0157, "num_tokens": 77794226.0, "reward": 0.71875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.5754395723342896, "sampling/importance_sampling_ratio/mean": 1.000154972076416, "sampling/importance_sampling_ratio/min": 0.5088934302330017, "sampling/sampling_logp_difference/max": 0.6755166053771973, "sampling/sampling_logp_difference/mean": 0.01657344587147236, "step": 1814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 340.734375, "completions/mean_terminated_length": 340.734375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.4553510546684265, "epoch": 2.224264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.01230311017124306, "kl": 0.017795678228139877, "learning_rate": 1.9216926233717084e-07, "loss": 0.0002, "num_tokens": 77835697.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6423442363739014, "sampling/importance_sampling_ratio/mean": 1.0003767013549805, "sampling/importance_sampling_ratio/min": 0.682036280632019, "sampling/sampling_logp_difference/max": 0.49612462520599365, "sampling/sampling_logp_difference/mean": 0.014353410340845585, "step": 1815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 361.03125, "completions/mean_terminated_length": 361.03125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.5390446186065674, "epoch": 2.2254901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.573576394881269, "kl": 0.03515120595693588, "learning_rate": 1.9160821242710957e-07, "loss": -0.0053, "num_tokens": 77874899.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997150301933289, "sampling/importance_sampling_ratio/min": 0.4596952497959137, "sampling/sampling_logp_difference/max": 0.7771915197372437, "sampling/sampling_logp_difference/mean": 0.01597515121102333, "step": 1816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1540.0, "completions/max_terminated_length": 1540.0, "completions/mean_length": 432.21875, "completions/mean_terminated_length": 432.21875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.40136513113975525, "epoch": 2.2267156862745097, "frac_reward_zero_std": 0.5, "grad_norm": 0.8326256670570322, "kl": 0.01833195611834526, "learning_rate": 1.9104778853283987e-07, "loss": -0.0194, "num_tokens": 77920033.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.4684343338012695, "sampling/importance_sampling_ratio/mean": 1.0003108978271484, "sampling/importance_sampling_ratio/min": 0.7084636688232422, "sampling/sampling_logp_difference/max": 0.38419675827026367, "sampling/sampling_logp_difference/mean": 0.013238780200481415, "step": 1817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 370.40625, "completions/mean_terminated_length": 370.40625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.6671915054321289, "epoch": 2.2279411764705883, "frac_reward_zero_std": 0.25, "grad_norm": 1.0236047888259607, "kl": 0.038343288004398346, "learning_rate": 1.9048799179198655e-07, "loss": 0.0219, "num_tokens": 77957563.0, "reward": 0.875, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6007554531097412, "sampling/importance_sampling_ratio/mean": 1.0002281665802002, "sampling/importance_sampling_ratio/min": 0.7043696641921997, "sampling/sampling_logp_difference/max": 0.4704756736755371, "sampling/sampling_logp_difference/mean": 0.01767636463046074, "step": 1818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1143.0, "completions/max_terminated_length": 1143.0, "completions/mean_length": 309.234375, "completions/mean_terminated_length": 309.234375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.35850587487220764, "epoch": 2.2291666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.013631515646856692, "kl": 0.018051637336611748, "learning_rate": 1.8992882334090188e-07, "loss": 0.0002, "num_tokens": 77992042.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6023966073989868, "sampling/importance_sampling_ratio/mean": 1.0003914833068848, "sampling/importance_sampling_ratio/min": 0.6254510879516602, "sampling/sampling_logp_difference/max": 0.4715003967285156, "sampling/sampling_logp_difference/mean": 0.01314765028655529, "step": 1819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 426.171875, "completions/mean_terminated_length": 426.171875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.40699896216392517, "epoch": 2.230392156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.014528670781013501, "kl": 0.016173139214515686, "learning_rate": 1.893702843146623e-07, "loss": 0.0002, "num_tokens": 78039541.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000300407409668, "sampling/importance_sampling_ratio/min": 0.612627387046814, "sampling/sampling_logp_difference/max": 0.7695990800857544, "sampling/sampling_logp_difference/mean": 0.012432975694537163, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1185.0, "completions/max_terminated_length": 1185.0, "completions/mean_length": 305.109375, "completions/mean_terminated_length": 305.109375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.4268977642059326, "epoch": 2.2316176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.013542892714786266, "kl": 0.01863497495651245, "learning_rate": 1.8881237584706632e-07, "loss": 0.0002, "num_tokens": 78075484.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5071556568145752, "sampling/importance_sampling_ratio/mean": 1.0005778074264526, "sampling/importance_sampling_ratio/min": 0.6745736598968506, "sampling/sampling_logp_difference/max": 0.41022419929504395, "sampling/sampling_logp_difference/mean": 0.01498476043343544, "step": 1821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1270.0, "completions/max_terminated_length": 1270.0, "completions/mean_length": 366.5625, "completions/mean_terminated_length": 366.5625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.5170689225196838, "epoch": 2.232843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.6726280191077447, "kl": 0.024042444303631783, "learning_rate": 1.8825509907063326e-07, "loss": 0.0815, "num_tokens": 78114592.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001697540283203, "sampling/importance_sampling_ratio/min": 0.6246797442436218, "sampling/sampling_logp_difference/max": 0.6973974704742432, "sampling/sampling_logp_difference/mean": 0.015440784394741058, "step": 1822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 963.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 437.515625, "completions/mean_terminated_length": 437.515625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.5320929288864136, "epoch": 2.2340686274509802, "frac_reward_zero_std": 0.5, "grad_norm": 0.7442936016089974, "kl": 0.02673964947462082, "learning_rate": 1.8769845511659927e-07, "loss": -0.0073, "num_tokens": 78160849.0, "reward": 0.0, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3671332597732544, "sampling/importance_sampling_ratio/mean": 0.999408483505249, "sampling/importance_sampling_ratio/min": 0.6916012167930603, "sampling/sampling_logp_difference/max": 0.3687458038330078, "sampling/sampling_logp_difference/mean": 0.016456158831715584, "step": 1823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1184.0, "completions/max_terminated_length": 1184.0, "completions/mean_length": 407.015625, "completions/mean_terminated_length": 407.015625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.45794880390167236, "epoch": 2.235294117647059, "frac_reward_zero_std": 0.75, "grad_norm": 0.6125091514359265, "kl": 0.020363878458738327, "learning_rate": 1.871424451149169e-07, "loss": -0.061, "num_tokens": 78204690.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5926154851913452, "sampling/importance_sampling_ratio/mean": 0.9995171427726746, "sampling/importance_sampling_ratio/min": 0.5997016429901123, "sampling/sampling_logp_difference/max": 0.5113229751586914, "sampling/sampling_logp_difference/mean": 0.013621710240840912, "step": 1824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1048.0, "completions/max_terminated_length": 1048.0, "completions/mean_length": 528.546875, "completions/mean_terminated_length": 528.546875, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "entropy": 0.5617486238479614, "epoch": 2.236519607843137, "frac_reward_zero_std": 0.25, "grad_norm": 0.8483819973160336, "kl": 0.026536012068390846, "learning_rate": 1.865870701942504e-07, "loss": 0.0487, "num_tokens": 78261573.0, "reward": 0.46875, "reward_std": 0.6413977742195129, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6485744714736938, "sampling/importance_sampling_ratio/mean": 0.9999451637268066, "sampling/importance_sampling_ratio/min": 0.23401731252670288, "sampling/sampling_logp_difference/max": 1.4523601531982422, "sampling/sampling_logp_difference/mean": 0.015000881627202034, "step": 1825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 418.140625, "completions/mean_terminated_length": 418.140625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.45768770575523376, "epoch": 2.2377450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.4953519143126773, "kl": 0.02575334906578064, "learning_rate": 1.8603233148197632e-07, "loss": 0.0166, "num_tokens": 78306926.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.4474334716796875, "sampling/importance_sampling_ratio/mean": 0.9996877908706665, "sampling/importance_sampling_ratio/min": 0.655702531337738, "sampling/sampling_logp_difference/max": 0.4220479726791382, "sampling/sampling_logp_difference/mean": 0.01282263733446598, "step": 1826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 326.109375, "completions/mean_terminated_length": 326.109375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.47683221101760864, "epoch": 2.238970588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.01813322522245654, "kl": 0.028757065534591675, "learning_rate": 1.8547823010417873e-07, "loss": 0.0002, "num_tokens": 78342293.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5545629262924194, "sampling/importance_sampling_ratio/mean": 1.0002126693725586, "sampling/importance_sampling_ratio/min": 0.5947788953781128, "sampling/sampling_logp_difference/max": 0.5195655822753906, "sampling/sampling_logp_difference/mean": 0.014945060946047306, "step": 1827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 364.515625, "completions/mean_terminated_length": 364.515625, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.4864698648452759, "epoch": 2.2401960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.015026819978659303, "kl": 0.026134688407182693, "learning_rate": 1.8492476718564866e-07, "loss": 0.0003, "num_tokens": 78384694.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5277607440948486, "sampling/importance_sampling_ratio/mean": 0.9998325109481812, "sampling/importance_sampling_ratio/min": 0.6952037811279297, "sampling/sampling_logp_difference/max": 0.42380309104919434, "sampling/sampling_logp_difference/mean": 0.014997739344835281, "step": 1828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 864.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 375.203125, "completions/mean_terminated_length": 375.203125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.474571168422699, "epoch": 2.241421568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.7057682849144691, "kl": 0.036596354097127914, "learning_rate": 1.8437194384988058e-07, "loss": 0.0501, "num_tokens": 78426659.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4300943613052368, "sampling/importance_sampling_ratio/mean": 1.0001161098480225, "sampling/importance_sampling_ratio/min": 0.6493422985076904, "sampling/sampling_logp_difference/max": 0.43179523944854736, "sampling/sampling_logp_difference/mean": 0.014439743012189865, "step": 1829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1930.0, "completions/max_terminated_length": 1930.0, "completions/mean_length": 402.671875, "completions/mean_terminated_length": 402.671875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.4075208008289337, "epoch": 2.2426470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.49292166937112764, "kl": 0.02172679826617241, "learning_rate": 1.8381976121907067e-07, "loss": 0.0179, "num_tokens": 78468142.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.506484866142273, "sampling/importance_sampling_ratio/mean": 1.0000433921813965, "sampling/importance_sampling_ratio/min": 0.7054005861282349, "sampling/sampling_logp_difference/max": 0.40977907180786133, "sampling/sampling_logp_difference/mean": 0.014462907798588276, "step": 1830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1492.0, "completions/max_terminated_length": 1492.0, "completions/mean_length": 465.6875, "completions/mean_terminated_length": 465.6875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.32122695446014404, "epoch": 2.2438725490196076, "frac_reward_zero_std": 0.5, "grad_norm": 0.6877840021298376, "kl": 0.015160547569394112, "learning_rate": 1.832682204141152e-07, "loss": 0.019, "num_tokens": 78516458.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6805397272109985, "sampling/importance_sampling_ratio/mean": 0.9999575614929199, "sampling/importance_sampling_ratio/min": 0.6873292922973633, "sampling/sampling_logp_difference/max": 0.5191149711608887, "sampling/sampling_logp_difference/mean": 0.010458738543093204, "step": 1831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 284.4375, "completions/mean_terminated_length": 284.4375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.4195646345615387, "epoch": 2.2450980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.02311901506336226, "kl": 0.04340868070721626, "learning_rate": 1.8271732255460643e-07, "loss": 0.0004, "num_tokens": 78552182.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2854276895523071, "sampling/importance_sampling_ratio/mean": 0.9998728036880493, "sampling/importance_sampling_ratio/min": 0.5658827424049377, "sampling/sampling_logp_difference/max": 0.5693683624267578, "sampling/sampling_logp_difference/mean": 0.01417627651244402, "step": 1832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 425.28125, "completions/mean_terminated_length": 425.28125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.4010704755783081, "epoch": 2.2463235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.011060136700345821, "kl": 0.013865049928426743, "learning_rate": 1.8216706875883252e-07, "loss": 0.0001, "num_tokens": 78596456.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.661383032798767, "sampling/importance_sampling_ratio/mean": 0.9998809695243835, "sampling/importance_sampling_ratio/min": 0.688477635383606, "sampling/sampling_logp_difference/max": 0.5076503753662109, "sampling/sampling_logp_difference/mean": 0.012745695188641548, "step": 1833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 386.71875, "completions/mean_terminated_length": 386.71875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3890659213066101, "epoch": 2.247549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.01340695300740491, "kl": 0.015656478703022003, "learning_rate": 1.816174601437736e-07, "loss": 0.0001, "num_tokens": 78640646.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.475721836090088, "sampling/importance_sampling_ratio/mean": 1.0001487731933594, "sampling/importance_sampling_ratio/min": 0.6675333380699158, "sampling/sampling_logp_difference/max": 0.40416598320007324, "sampling/sampling_logp_difference/mean": 0.013032941147685051, "step": 1834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 790.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 416.625, "completions/mean_terminated_length": 416.625, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "entropy": 0.665643572807312, "epoch": 2.248774509803922, "frac_reward_zero_std": 0.75, "grad_norm": 0.5268437254364967, "kl": 0.036319926381111145, "learning_rate": 1.8106849782510058e-07, "loss": -0.0297, "num_tokens": 78685182.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.372318148612976, "sampling/importance_sampling_ratio/mean": 0.9997155070304871, "sampling/importance_sampling_ratio/min": 0.6932216286659241, "sampling/sampling_logp_difference/max": 0.3664054870605469, "sampling/sampling_logp_difference/mean": 0.018284786492586136, "step": 1835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1825.0, "completions/max_terminated_length": 1825.0, "completions/mean_length": 450.171875, "completions/mean_terminated_length": 450.171875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.5839906930923462, "epoch": 2.25, "frac_reward_zero_std": 0.5, "grad_norm": 0.6369693569203507, "kl": 0.028595414012670517, "learning_rate": 1.8052018291717215e-07, "loss": -0.0287, "num_tokens": 78737337.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.366051197052002, "sampling/importance_sampling_ratio/mean": 0.9997481107711792, "sampling/importance_sampling_ratio/min": 0.394179105758667, "sampling/sampling_logp_difference/max": 0.9309499263763428, "sampling/sampling_logp_difference/mean": 0.017006445676088333, "step": 1836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1425.0, "completions/max_terminated_length": 1425.0, "completions/mean_length": 429.265625, "completions/mean_terminated_length": 429.265625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.39437341690063477, "epoch": 2.251225490196078, "frac_reward_zero_std": 0.75, "grad_norm": 0.5855328560149056, "kl": 0.018869005143642426, "learning_rate": 1.7997251653303247e-07, "loss": -0.041, "num_tokens": 78785402.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.6057831048965454, "sampling/importance_sampling_ratio/mean": 0.999871551990509, "sampling/importance_sampling_ratio/min": 0.14018785953521729, "sampling/sampling_logp_difference/max": 1.9647718667984009, "sampling/sampling_logp_difference/mean": 0.012879624962806702, "step": 1837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 304.375, "completions/mean_terminated_length": 304.375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.39975598454475403, "epoch": 2.252450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.5490632695238515, "kl": 0.02362503856420517, "learning_rate": 1.7942549978441012e-07, "loss": 0.0204, "num_tokens": 78824002.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.4425772428512573, "sampling/importance_sampling_ratio/mean": 1.000264048576355, "sampling/importance_sampling_ratio/min": 0.6171485781669617, "sampling/sampling_logp_difference/max": 0.48264551162719727, "sampling/sampling_logp_difference/mean": 0.013322560116648674, "step": 1838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 373.265625, "completions/mean_terminated_length": 373.265625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.3995002508163452, "epoch": 2.2536764705882355, "frac_reward_zero_std": 0.75, "grad_norm": 0.595653264074915, "kl": 0.018353605642914772, "learning_rate": 1.7887913378171422e-07, "loss": -0.0091, "num_tokens": 78862707.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5300079584121704, "sampling/importance_sampling_ratio/mean": 1.000309944152832, "sampling/importance_sampling_ratio/min": 0.6890257596969604, "sampling/sampling_logp_difference/max": 0.42527294158935547, "sampling/sampling_logp_difference/mean": 0.013491815887391567, "step": 1839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 374.171875, "completions/mean_terminated_length": 374.171875, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.34053659439086914, "epoch": 2.2549019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.009390249273605036, "kl": 0.013118261471390724, "learning_rate": 1.783334196340331e-07, "loss": 0.0001, "num_tokens": 78905230.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5981541872024536, "sampling/importance_sampling_ratio/mean": 0.99994957447052, "sampling/importance_sampling_ratio/min": 0.6557512879371643, "sampling/sampling_logp_difference/max": 0.4688493013381958, "sampling/sampling_logp_difference/mean": 0.012115593999624252, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 828.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 269.84375, "completions/mean_terminated_length": 269.84375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.35178232192993164, "epoch": 2.256127450980392, "frac_reward_zero_std": 0.75, "grad_norm": 0.7388813680084697, "kl": 0.027219243347644806, "learning_rate": 1.777883584491317e-07, "loss": -0.0019, "num_tokens": 78936436.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.9949262142181396, "sampling/importance_sampling_ratio/mean": 1.0002716779708862, "sampling/importance_sampling_ratio/min": 0.6546491980552673, "sampling/sampling_logp_difference/max": 0.6906070709228516, "sampling/sampling_logp_difference/mean": 0.01376100443303585, "step": 1841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 330.53125, "completions/mean_terminated_length": 330.53125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.3753882646560669, "epoch": 2.2573529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.012089614197538682, "kl": 0.019231054931879044, "learning_rate": 1.7724395133345022e-07, "loss": 0.0002, "num_tokens": 78978326.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4387964010238647, "sampling/importance_sampling_ratio/mean": 0.999958872795105, "sampling/importance_sampling_ratio/min": 0.6300273537635803, "sampling/sampling_logp_difference/max": 0.4619920253753662, "sampling/sampling_logp_difference/mean": 0.01341361366212368, "step": 1842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 329.265625, "completions/mean_terminated_length": 329.265625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.45973289012908936, "epoch": 2.258578431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.03815210424440193, "kl": 0.03967190533876419, "learning_rate": 1.7670019939210023e-07, "loss": 0.0003, "num_tokens": 79015495.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5203777551651, "sampling/importance_sampling_ratio/mean": 1.0003726482391357, "sampling/importance_sampling_ratio/min": 0.6814418435096741, "sampling/sampling_logp_difference/max": 0.4189589023590088, "sampling/sampling_logp_difference/mean": 0.015268797054886818, "step": 1843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1137.0, "completions/max_terminated_length": 1137.0, "completions/mean_length": 315.8125, "completions/mean_terminated_length": 315.8125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.46514904499053955, "epoch": 2.2598039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.02162449798701416, "kl": 0.03386976942420006, "learning_rate": 1.761571037288637e-07, "loss": 0.0003, "num_tokens": 79050027.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5876190662384033, "sampling/importance_sampling_ratio/mean": 0.9999237656593323, "sampling/importance_sampling_ratio/min": 0.5784816741943359, "sampling/sampling_logp_difference/max": 0.5473483800888062, "sampling/sampling_logp_difference/mean": 0.01454075239598751, "step": 1844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 867.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 408.46875, "completions/mean_terminated_length": 408.46875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.5049855709075928, "epoch": 2.261029411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.4800062639138308, "kl": 0.02893223613500595, "learning_rate": 1.7561466544619076e-07, "loss": -0.0218, "num_tokens": 79099817.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.4366099834442139, "sampling/importance_sampling_ratio/mean": 0.9996892213821411, "sampling/importance_sampling_ratio/min": 0.6833348870277405, "sampling/sampling_logp_difference/max": 0.380770206451416, "sampling/sampling_logp_difference/mean": 0.015779659152030945, "step": 1845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 264.609375, "completions/mean_terminated_length": 264.609375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.39258676767349243, "epoch": 2.2622549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.014349887412007276, "kl": 0.016227813437581062, "learning_rate": 1.7507288564519646e-07, "loss": 0.0002, "num_tokens": 79130784.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5866132974624634, "sampling/importance_sampling_ratio/mean": 1.000051498413086, "sampling/importance_sampling_ratio/min": 0.634133517742157, "sampling/sampling_logp_difference/max": 0.46160173416137695, "sampling/sampling_logp_difference/mean": 0.01308516412973404, "step": 1846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 970.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 481.65625, "completions/mean_terminated_length": 481.65625, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "entropy": 0.48957353830337524, "epoch": 2.263480392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.42472564061007784, "kl": 0.022334149107336998, "learning_rate": 1.7453176542565956e-07, "loss": 0.0131, "num_tokens": 79186330.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.5064202547073364, "sampling/importance_sampling_ratio/mean": 0.9998606443405151, "sampling/importance_sampling_ratio/min": 0.6773408651351929, "sampling/sampling_logp_difference/max": 0.40973615646362305, "sampling/sampling_logp_difference/mean": 0.013824197463691235, "step": 1847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1708.0, "completions/max_terminated_length": 1708.0, "completions/mean_length": 521.96875, "completions/mean_terminated_length": 521.96875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.567033588886261, "epoch": 2.264705882352941, "frac_reward_zero_std": 0.75, "grad_norm": 0.5272099243399846, "kl": 0.020734522491693497, "learning_rate": 1.7399130588601968e-07, "loss": -0.0079, "num_tokens": 79245096.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.7036032676696777, "sampling/importance_sampling_ratio/mean": 0.9998452663421631, "sampling/importance_sampling_ratio/min": 0.4891483187675476, "sampling/sampling_logp_difference/max": 0.7150895595550537, "sampling/sampling_logp_difference/mean": 0.01653275638818741, "step": 1848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 398.625, "completions/mean_terminated_length": 398.625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.33853015303611755, "epoch": 2.2659313725490198, "frac_reward_zero_std": 1.0, "grad_norm": 0.16389423685985277, "kl": 0.018137875944375992, "learning_rate": 1.7345150812337562e-07, "loss": 0.0002, "num_tokens": 79288960.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4829621315002441, "sampling/importance_sampling_ratio/mean": 1.000073790550232, "sampling/importance_sampling_ratio/min": 0.6808790564537048, "sampling/sampling_logp_difference/max": 0.3940415382385254, "sampling/sampling_logp_difference/mean": 0.010951514355838299, "step": 1849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 974.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 508.65625, "completions/mean_terminated_length": 508.65625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.43531110882759094, "epoch": 2.267156862745098, "frac_reward_zero_std": 0.25, "grad_norm": 0.7202606593472544, "kl": 0.02092622220516205, "learning_rate": 1.7291237323348284e-07, "loss": 0.0799, "num_tokens": 79337626.0, "reward": 0.8125, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.4597969055175781, "sampling/importance_sampling_ratio/mean": 1.0001006126403809, "sampling/importance_sampling_ratio/min": 0.6975887417793274, "sampling/sampling_logp_difference/max": 0.3782973289489746, "sampling/sampling_logp_difference/mean": 0.012409440241754055, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 306.6875, "completions/mean_terminated_length": 306.6875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.495835542678833, "epoch": 2.2683823529411766, "frac_reward_zero_std": 0.75, "grad_norm": 0.6815891727713098, "kl": 0.019778715446591377, "learning_rate": 1.7237390231075055e-07, "loss": 0.0029, "num_tokens": 79380486.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.608889102935791, "sampling/importance_sampling_ratio/mean": 0.9998217225074768, "sampling/importance_sampling_ratio/min": 0.6527426838874817, "sampling/sampling_logp_difference/max": 0.4755439758300781, "sampling/sampling_logp_difference/mean": 0.016442645341157913, "step": 1851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1195.0, "completions/max_terminated_length": 1195.0, "completions/mean_length": 402.1875, "completions/mean_terminated_length": 402.1875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.4828660488128662, "epoch": 2.269607843137255, "frac_reward_zero_std": 0.5, "grad_norm": 0.8406021366438231, "kl": 0.021436717361211777, "learning_rate": 1.7183609644824092e-07, "loss": -0.0081, "num_tokens": 79425106.0, "reward": 0.6875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.551693320274353, "sampling/importance_sampling_ratio/mean": 1.0002994537353516, "sampling/importance_sampling_ratio/min": 0.5470100045204163, "sampling/sampling_logp_difference/max": 0.6032881736755371, "sampling/sampling_logp_difference/mean": 0.015464906580746174, "step": 1852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 807.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 423.515625, "completions/mean_terminated_length": 423.515625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.4393807649612427, "epoch": 2.2708333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 0.38818305697301125, "kl": 0.03784859552979469, "learning_rate": 1.7129895673766575e-07, "loss": 0.0073, "num_tokens": 79469075.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003139972686768, "sampling/importance_sampling_ratio/min": 0.6005309820175171, "sampling/sampling_logp_difference/max": 0.9375696182250977, "sampling/sampling_logp_difference/mean": 0.014375714585185051, "step": 1853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 362.71875, "completions/mean_terminated_length": 362.71875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.4038153886795044, "epoch": 2.2720588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 0.591328884631241, "kl": 0.02259822189807892, "learning_rate": 1.707624842693844e-07, "loss": 0.013, "num_tokens": 79518193.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.5288913249969482, "sampling/importance_sampling_ratio/mean": 1.0000289678573608, "sampling/importance_sampling_ratio/min": 0.6206696629524231, "sampling/sampling_logp_difference/max": 0.4769563674926758, "sampling/sampling_logp_difference/mean": 0.013017506338655949, "step": 1854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1324.0, "completions/max_terminated_length": 1324.0, "completions/mean_length": 462.171875, "completions/mean_terminated_length": 462.171875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.4122299551963806, "epoch": 2.2732843137254903, "frac_reward_zero_std": 0.5, "grad_norm": 0.6650181858935416, "kl": 0.024149810895323753, "learning_rate": 1.7022668013240227e-07, "loss": 0.0717, "num_tokens": 79565244.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5422439575195312, "sampling/importance_sampling_ratio/mean": 0.999788224697113, "sampling/importance_sampling_ratio/min": 0.657778263092041, "sampling/sampling_logp_difference/max": 0.43323850631713867, "sampling/sampling_logp_difference/mean": 0.013101176358759403, "step": 1855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 347.0625, "completions/mean_terminated_length": 347.0625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.5883066654205322, "epoch": 2.2745098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.01664013718753665, "kl": 0.02187223918735981, "learning_rate": 1.696915454143676e-07, "loss": 0.0002, "num_tokens": 79603984.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.601819396018982, "sampling/importance_sampling_ratio/mean": 1.000472068786621, "sampling/importance_sampling_ratio/min": 0.7588745951652527, "sampling/sampling_logp_difference/max": 0.47114014625549316, "sampling/sampling_logp_difference/mean": 0.01659969799220562, "step": 1856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 374.15625, "completions/mean_terminated_length": 374.15625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.5421885251998901, "epoch": 2.275735294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.7154221275282273, "kl": 0.04159776121377945, "learning_rate": 1.691570812015704e-07, "loss": 0.0071, "num_tokens": 79646746.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5262054204940796, "sampling/importance_sampling_ratio/mean": 0.999886155128479, "sampling/importance_sampling_ratio/min": 0.4179096519947052, "sampling/sampling_logp_difference/max": 0.8724900484085083, "sampling/sampling_logp_difference/mean": 0.016840901225805283, "step": 1857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1232.0, "completions/max_terminated_length": 1232.0, "completions/mean_length": 354.9375, "completions/mean_terminated_length": 354.9375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.48874521255493164, "epoch": 2.2769607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 0.6479176069918003, "kl": 0.02058054506778717, "learning_rate": 1.6862328857893855e-07, "loss": -0.0426, "num_tokens": 79686182.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.336474895477295, "sampling/importance_sampling_ratio/mean": 1.0001873970031738, "sampling/importance_sampling_ratio/min": 0.45967480540275574, "sampling/sampling_logp_difference/max": 0.7772359848022461, "sampling/sampling_logp_difference/mean": 0.015158320777118206, "step": 1858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 394.125, "completions/mean_terminated_length": 394.125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.6044921278953552, "epoch": 2.278186274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.6154226911318204, "kl": 0.03354589268565178, "learning_rate": 1.680901686300376e-07, "loss": 0.052, "num_tokens": 79733310.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6339075565338135, "sampling/importance_sampling_ratio/mean": 0.9997779726982117, "sampling/importance_sampling_ratio/min": 0.5171712040901184, "sampling/sampling_logp_difference/max": 0.6593812704086304, "sampling/sampling_logp_difference/mean": 0.0174876619130373, "step": 1859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1136.0, "completions/max_terminated_length": 1136.0, "completions/mean_length": 526.921875, "completions/mean_terminated_length": 526.921875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.6017006039619446, "epoch": 2.2794117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 0.6428140832976054, "kl": 0.023457251489162445, "learning_rate": 1.6755772243706712e-07, "loss": 0.0056, "num_tokens": 79784889.0, "reward": 0.375, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.5853791236877441, "sampling/importance_sampling_ratio/mean": 1.000252366065979, "sampling/importance_sampling_ratio/min": 0.6657382249832153, "sampling/sampling_logp_difference/max": 0.46082353591918945, "sampling/sampling_logp_difference/mean": 0.016281254589557648, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 904.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 415.90625, "completions/mean_terminated_length": 415.90625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.601885199546814, "epoch": 2.280637254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.5610008441875265, "kl": 0.03715939447283745, "learning_rate": 1.6702595108085942e-07, "loss": -0.0032, "num_tokens": 79832643.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001473426818848, "sampling/importance_sampling_ratio/min": 0.5339652299880981, "sampling/sampling_logp_difference/max": 0.7580962181091309, "sampling/sampling_logp_difference/mean": 0.016394129022955894, "step": 1861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1070.0, "completions/max_terminated_length": 1070.0, "completions/mean_length": 388.53125, "completions/mean_terminated_length": 388.53125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.5891439318656921, "epoch": 2.281862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.5136197978501733, "kl": 0.03981625288724899, "learning_rate": 1.6649485564087644e-07, "loss": 0.0178, "num_tokens": 79878325.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.9894694089889526, "sampling/importance_sampling_ratio/mean": 0.9999544024467468, "sampling/importance_sampling_ratio/min": 0.6274318695068359, "sampling/sampling_logp_difference/max": 0.6878679990768433, "sampling/sampling_logp_difference/mean": 0.017296168953180313, "step": 1862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1262.0, "completions/max_terminated_length": 1262.0, "completions/mean_length": 391.40625, "completions/mean_terminated_length": 391.40625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.44842445850372314, "epoch": 2.2830882352941178, "frac_reward_zero_std": 0.5, "grad_norm": 0.7901751638567789, "kl": 0.02250845544040203, "learning_rate": 1.6596443719520826e-07, "loss": -0.0462, "num_tokens": 79919695.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5469281673431396, "sampling/importance_sampling_ratio/mean": 1.0002522468566895, "sampling/importance_sampling_ratio/min": 0.6924613118171692, "sampling/sampling_logp_difference/max": 0.43627119064331055, "sampling/sampling_logp_difference/mean": 0.013811723329126835, "step": 1863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 773.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 355.453125, "completions/mean_terminated_length": 355.453125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.39392364025115967, "epoch": 2.284313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.012259973403249157, "kl": 0.01871407777070999, "learning_rate": 1.6543469682057104e-07, "loss": 0.0002, "num_tokens": 79959212.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3670474290847778, "sampling/importance_sampling_ratio/mean": 1.0003539323806763, "sampling/importance_sampling_ratio/min": 0.48741430044174194, "sampling/sampling_logp_difference/max": 0.7186408042907715, "sampling/sampling_logp_difference/mean": 0.01286897249519825, "step": 1864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 329.828125, "completions/mean_terminated_length": 329.828125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.4575803577899933, "epoch": 2.2855392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 0.5427375842677135, "kl": 0.020671941339969635, "learning_rate": 1.6490563559230357e-07, "loss": 0.002, "num_tokens": 79995185.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.4690696001052856, "sampling/importance_sampling_ratio/mean": 0.999742329120636, "sampling/importance_sampling_ratio/min": 0.5838098526000977, "sampling/sampling_logp_difference/max": 0.538179874420166, "sampling/sampling_logp_difference/mean": 0.015413546934723854, "step": 1865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 437.0, "completions/mean_terminated_length": 437.0, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.5946385860443115, "epoch": 2.286764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.949154353282725, "kl": 0.022330714389681816, "learning_rate": 1.6437725458436725e-07, "loss": -0.0172, "num_tokens": 80040337.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999403953552246, "sampling/importance_sampling_ratio/min": 0.6622447967529297, "sampling/sampling_logp_difference/max": 0.8495306968688965, "sampling/sampling_logp_difference/mean": 0.016430579125881195, "step": 1866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 303.078125, "completions/mean_terminated_length": 303.078125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.36189883947372437, "epoch": 2.2879901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.7590880046042753, "kl": 0.029862381517887115, "learning_rate": 1.6384955486934154e-07, "loss": 0.0081, "num_tokens": 80074998.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3882126808166504, "sampling/importance_sampling_ratio/mean": 1.000244379043579, "sampling/importance_sampling_ratio/min": 0.6833302974700928, "sampling/sampling_logp_difference/max": 0.38077688217163086, "sampling/sampling_logp_difference/mean": 0.012282568961381912, "step": 1867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 354.875, "completions/mean_terminated_length": 354.875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.48507827520370483, "epoch": 2.2892156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.016143328119441913, "kl": 0.019447145983576775, "learning_rate": 1.633225375184239e-07, "loss": 0.0002, "num_tokens": 80114846.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3411262035369873, "sampling/importance_sampling_ratio/mean": 1.0002696514129639, "sampling/importance_sampling_ratio/min": 0.6616789102554321, "sampling/sampling_logp_difference/max": 0.41297483444213867, "sampling/sampling_logp_difference/mean": 0.015502939000725746, "step": 1868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 828.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 358.78125, "completions/mean_terminated_length": 358.78125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.46355995535850525, "epoch": 2.2904411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.015298278703307928, "kl": 0.016273291781544685, "learning_rate": 1.6279620360142594e-07, "loss": 0.0002, "num_tokens": 80151168.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3638160228729248, "sampling/importance_sampling_ratio/mean": 0.9999186992645264, "sampling/importance_sampling_ratio/min": 0.657878577709198, "sampling/sampling_logp_difference/max": 0.41873490810394287, "sampling/sampling_logp_difference/mean": 0.014711827039718628, "step": 1869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 748.0, "completions/max_terminated_length": 748.0, "completions/mean_length": 294.6875, "completions/mean_terminated_length": 294.6875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.4519715905189514, "epoch": 2.2916666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 0.8452560696749052, "kl": 0.017621010541915894, "learning_rate": 1.62270554186772e-07, "loss": -0.07, "num_tokens": 80184252.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5735963582992554, "sampling/importance_sampling_ratio/mean": 1.0002588033676147, "sampling/importance_sampling_ratio/min": 0.6024032235145569, "sampling/sampling_logp_difference/max": 0.5068283081054688, "sampling/sampling_logp_difference/mean": 0.015100234188139439, "step": 1870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1232.0, "completions/max_terminated_length": 1232.0, "completions/mean_length": 434.171875, "completions/mean_terminated_length": 434.171875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.475411593914032, "epoch": 2.292892156862745, "frac_reward_zero_std": 0.75, "grad_norm": 0.6838790761292117, "kl": 0.021699070930480957, "learning_rate": 1.6174559034149737e-07, "loss": 0.0951, "num_tokens": 80230407.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6382114887237549, "sampling/importance_sampling_ratio/mean": 0.9999608993530273, "sampling/importance_sampling_ratio/min": 0.5362951755523682, "sampling/sampling_logp_difference/max": 0.6230705976486206, "sampling/sampling_logp_difference/mean": 0.01389777846634388, "step": 1871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1044.0, "completions/max_terminated_length": 1044.0, "completions/mean_length": 435.0, "completions/mean_terminated_length": 435.0, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.5123493075370789, "epoch": 2.2941176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 0.5990357674598017, "kl": 0.017401328310370445, "learning_rate": 1.6122131313124538e-07, "loss": 0.0468, "num_tokens": 80277015.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.4385300874710083, "sampling/importance_sampling_ratio/mean": 1.0002658367156982, "sampling/importance_sampling_ratio/min": 0.650388240814209, "sampling/sampling_logp_difference/max": 0.43018579483032227, "sampling/sampling_logp_difference/mean": 0.015563370659947395, "step": 1872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1409.0, "completions/max_terminated_length": 1409.0, "completions/mean_length": 381.75, "completions/mean_terminated_length": 381.75, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.6458398103713989, "epoch": 2.295343137254902, "frac_reward_zero_std": 0.5, "grad_norm": 0.8248459627225806, "kl": 0.05016747862100601, "learning_rate": 1.606977236202654e-07, "loss": 0.0367, "num_tokens": 80317431.0, "reward": -0.21875, "reward_std": 0.375, "rewards/decision_reward_func/mean": -0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.330887794494629, "sampling/importance_sampling_ratio/mean": 0.9995974898338318, "sampling/importance_sampling_ratio/min": 0.4416184425354004, "sampling/sampling_logp_difference/max": 0.8173090219497681, "sampling/sampling_logp_difference/mean": 0.017949484288692474, "step": 1873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 301.328125, "completions/mean_terminated_length": 301.328125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.45106837153434753, "epoch": 2.2965686274509802, "frac_reward_zero_std": 0.75, "grad_norm": 0.6495166607796472, "kl": 0.034968167543411255, "learning_rate": 1.6017482287141088e-07, "loss": -0.0063, "num_tokens": 80352108.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3410568237304688, "sampling/importance_sampling_ratio/mean": 1.000089406967163, "sampling/importance_sampling_ratio/min": 0.6386911869049072, "sampling/sampling_logp_difference/max": 0.4483342170715332, "sampling/sampling_logp_difference/mean": 0.014861547388136387, "step": 1874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 909.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 374.296875, "completions/mean_terminated_length": 374.296875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.520684003829956, "epoch": 2.297794117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.01445658933488895, "kl": 0.023891527205705643, "learning_rate": 1.5965261194613755e-07, "loss": 0.0002, "num_tokens": 80391535.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6362160444259644, "sampling/importance_sampling_ratio/mean": 0.9997815489768982, "sampling/importance_sampling_ratio/min": 0.6841519474983215, "sampling/sampling_logp_difference/max": 0.4923863410949707, "sampling/sampling_logp_difference/mean": 0.016332849860191345, "step": 1875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1155.0, "completions/max_terminated_length": 1155.0, "completions/mean_length": 482.375, "completions/mean_terminated_length": 482.375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.689651370048523, "epoch": 2.299019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.6044564054890085, "kl": 0.04657156392931938, "learning_rate": 1.591310919045003e-07, "loss": -0.0492, "num_tokens": 80438055.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.4245145320892334, "sampling/importance_sampling_ratio/mean": 0.9997963309288025, "sampling/importance_sampling_ratio/min": 0.6037488579750061, "sampling/sampling_logp_difference/max": 0.5045969486236572, "sampling/sampling_logp_difference/mean": 0.019084373489022255, "step": 1876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1515.0, "completions/max_terminated_length": 1515.0, "completions/mean_length": 423.96875, "completions/mean_terminated_length": 423.96875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.5766804218292236, "epoch": 2.3002450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.6111858608207722, "kl": 0.020589767023921013, "learning_rate": 1.5861026380515163e-07, "loss": -0.0613, "num_tokens": 80480693.0, "reward": -0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": -0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6418190002441406, "sampling/importance_sampling_ratio/mean": 1.00044584274292, "sampling/importance_sampling_ratio/min": 0.6183424592018127, "sampling/sampling_logp_difference/max": 0.4958047866821289, "sampling/sampling_logp_difference/mean": 0.015879759564995766, "step": 1877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 883.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 510.453125, "completions/mean_terminated_length": 510.453125, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "entropy": 0.5993058681488037, "epoch": 2.301470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.6907423709861038, "kl": 0.024377766996622086, "learning_rate": 1.5809012870533995e-07, "loss": -0.0116, "num_tokens": 80531218.0, "reward": 0.0625, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4596487283706665, "sampling/importance_sampling_ratio/mean": 0.9998010396957397, "sampling/importance_sampling_ratio/min": 0.6117537021636963, "sampling/sampling_logp_difference/max": 0.4914255142211914, "sampling/sampling_logp_difference/mean": 0.016258828341960907, "step": 1878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 410.125, "completions/mean_terminated_length": 410.125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.4153759777545929, "epoch": 2.3026960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.011548341242894571, "kl": 0.018070761114358902, "learning_rate": 1.575706876609063e-07, "loss": 0.0002, "num_tokens": 80579594.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4759619235992432, "sampling/importance_sampling_ratio/mean": 1.000201940536499, "sampling/importance_sampling_ratio/min": 0.5584359169006348, "sampling/sampling_logp_difference/max": 0.5826153755187988, "sampling/sampling_logp_difference/mean": 0.013274921104311943, "step": 1879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1739.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 582.40625, "completions/mean_terminated_length": 582.40625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.5635762214660645, "epoch": 2.303921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.37087045414888514, "kl": 0.023050187155604362, "learning_rate": 1.5705194172628323e-07, "loss": 0.0083, "num_tokens": 80639636.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000190734863281, "sampling/importance_sampling_ratio/min": 0.3339158892631531, "sampling/sampling_logp_difference/max": 1.4865598678588867, "sampling/sampling_logp_difference/mean": 0.015201900154352188, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1780.0, "completions/max_terminated_length": 1780.0, "completions/mean_length": 563.359375, "completions/mean_terminated_length": 563.359375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.5612661242485046, "epoch": 2.3051470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.6450982139335893, "kl": 0.021215718239545822, "learning_rate": 1.565338919544918e-07, "loss": -0.0118, "num_tokens": 80696747.0, "reward": 0.4375, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.653319001197815, "sampling/importance_sampling_ratio/mean": 1.000218391418457, "sampling/importance_sampling_ratio/min": 0.5633549690246582, "sampling/sampling_logp_difference/max": 0.573845386505127, "sampling/sampling_logp_difference/mean": 0.016738198697566986, "step": 1881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 488.84375, "completions/mean_terminated_length": 488.84375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.5484001040458679, "epoch": 2.306372549019608, "frac_reward_zero_std": 0.75, "grad_norm": 0.4912878187171622, "kl": 0.031052574515342712, "learning_rate": 1.5601653939714072e-07, "loss": -0.0203, "num_tokens": 80753297.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.4780699014663696, "sampling/importance_sampling_ratio/mean": 0.9998023509979248, "sampling/importance_sampling_ratio/min": 0.630875289440155, "sampling/sampling_logp_difference/max": 0.4606471061706543, "sampling/sampling_logp_difference/mean": 0.014628357253968716, "step": 1882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 845.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 426.90625, "completions/mean_terminated_length": 426.90625, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "entropy": 0.3969879746437073, "epoch": 2.3075980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.01241418207102746, "kl": 0.013306811451911926, "learning_rate": 1.5549988510442258e-07, "loss": 0.0001, "num_tokens": 80799835.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5771461725234985, "sampling/importance_sampling_ratio/mean": 1.0001152753829956, "sampling/importance_sampling_ratio/min": 0.6633387804031372, "sampling/sampling_logp_difference/max": 0.45561695098876953, "sampling/sampling_logp_difference/mean": 0.013047904707491398, "step": 1883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 358.625, "completions/mean_terminated_length": 358.625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.4845743477344513, "epoch": 2.3088235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.04416113729545382, "kl": 0.022552931681275368, "learning_rate": 1.5498393012511285e-07, "loss": 0.0002, "num_tokens": 80839459.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.7730250358581543, "sampling/importance_sampling_ratio/mean": 1.0003736019134521, "sampling/importance_sampling_ratio/min": 0.47910451889038086, "sampling/sampling_logp_difference/max": 0.7358365058898926, "sampling/sampling_logp_difference/mean": 0.01576259545981884, "step": 1884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 307.734375, "completions/mean_terminated_length": 307.734375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.48535147309303284, "epoch": 2.310049019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.017839079765012863, "kl": 0.0351158007979393, "learning_rate": 1.5446867550656767e-07, "loss": 0.0003, "num_tokens": 80873218.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4402128458023071, "sampling/importance_sampling_ratio/mean": 1.0004422664642334, "sampling/importance_sampling_ratio/min": 0.48007118701934814, "sampling/sampling_logp_difference/max": 0.733820915222168, "sampling/sampling_logp_difference/mean": 0.01565532200038433, "step": 1885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1072.0, "completions/max_terminated_length": 1072.0, "completions/mean_length": 406.09375, "completions/mean_terminated_length": 406.09375, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.4127979278564453, "epoch": 2.311274509803922, "frac_reward_zero_std": 0.75, "grad_norm": 0.5660438858336178, "kl": 0.01779322698712349, "learning_rate": 1.5395412229472103e-07, "loss": -0.0073, "num_tokens": 80923192.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.7522801160812378, "sampling/importance_sampling_ratio/mean": 0.9998814463615417, "sampling/importance_sampling_ratio/min": 0.36878660321235657, "sampling/sampling_logp_difference/max": 0.9975371360778809, "sampling/sampling_logp_difference/mean": 0.013071397319436073, "step": 1886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1050.0, "completions/max_terminated_length": 1050.0, "completions/mean_length": 334.984375, "completions/mean_terminated_length": 334.984375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.3987993597984314, "epoch": 2.3125, "frac_reward_zero_std": 0.75, "grad_norm": 0.8344569357283169, "kl": 0.019239600747823715, "learning_rate": 1.5344027153408374e-07, "loss": 0.0208, "num_tokens": 80972135.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.7896052598953247, "sampling/importance_sampling_ratio/mean": 0.9998780488967896, "sampling/importance_sampling_ratio/min": 0.6552221179008484, "sampling/sampling_logp_difference/max": 0.5819950103759766, "sampling/sampling_logp_difference/mean": 0.01280306838452816, "step": 1887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 360.625, "completions/mean_terminated_length": 360.625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.4199729561805725, "epoch": 2.313725490196078, "frac_reward_zero_std": 0.75, "grad_norm": 0.5455212964079132, "kl": 0.03175146132707596, "learning_rate": 1.5292712426773973e-07, "loss": -0.0334, "num_tokens": 81010863.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.646325945854187, "sampling/importance_sampling_ratio/mean": 0.9997931718826294, "sampling/importance_sampling_ratio/min": 0.6922300457954407, "sampling/sampling_logp_difference/max": 0.49854612350463867, "sampling/sampling_logp_difference/mean": 0.013440591283142567, "step": 1888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 377.859375, "completions/mean_terminated_length": 377.859375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.3041327893733978, "epoch": 2.314950980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.008632103205262488, "kl": 0.014062723144888878, "learning_rate": 1.5241468153734594e-07, "loss": 0.0001, "num_tokens": 81060710.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6976981163024902, "sampling/importance_sampling_ratio/mean": 0.9998505711555481, "sampling/importance_sampling_ratio/min": 0.6290829181671143, "sampling/sampling_logp_difference/max": 0.529273271560669, "sampling/sampling_logp_difference/mean": 0.011163058690726757, "step": 1889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1457.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 320.1875, "completions/mean_terminated_length": 320.1875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.5795514583587646, "epoch": 2.3161764705882355, "frac_reward_zero_std": 0.75, "grad_norm": 0.9021796270518456, "kl": 0.05029181391000748, "learning_rate": 1.5190294438312834e-07, "loss": -0.0287, "num_tokens": 81096674.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.3959200382232666, "sampling/importance_sampling_ratio/mean": 1.000257968902588, "sampling/importance_sampling_ratio/min": 0.6054268479347229, "sampling/sampling_logp_difference/max": 0.5018215179443359, "sampling/sampling_logp_difference/mean": 0.01735682412981987, "step": 1890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 294.71875, "completions/mean_terminated_length": 294.71875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.45377930998802185, "epoch": 2.3174019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.6440749882563453, "kl": 0.024413306266069412, "learning_rate": 1.5139191384388094e-07, "loss": -0.0112, "num_tokens": 81130096.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4701993465423584, "sampling/importance_sampling_ratio/mean": 1.000017762184143, "sampling/importance_sampling_ratio/min": 0.5910096168518066, "sampling/sampling_logp_difference/max": 0.525922954082489, "sampling/sampling_logp_difference/mean": 0.01521704625338316, "step": 1891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 357.328125, "completions/mean_terminated_length": 357.328125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.28020989894866943, "epoch": 2.318627450980392, "frac_reward_zero_std": 1.0, "grad_norm": 0.010564360354703494, "kl": 0.012574242427945137, "learning_rate": 1.5088159095696362e-07, "loss": 0.0001, "num_tokens": 81168005.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4141829013824463, "sampling/importance_sampling_ratio/mean": 1.0001893043518066, "sampling/importance_sampling_ratio/min": 0.6367809772491455, "sampling/sampling_logp_difference/max": 0.4513295292854309, "sampling/sampling_logp_difference/mean": 0.009093918837606907, "step": 1892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 370.171875, "completions/mean_terminated_length": 370.171875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.31127703189849854, "epoch": 2.3198529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.011947908063435528, "kl": 0.01632457599043846, "learning_rate": 1.5037197675829916e-07, "loss": 0.0002, "num_tokens": 81212640.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000098943710327, "sampling/importance_sampling_ratio/min": 0.6874675750732422, "sampling/sampling_logp_difference/max": 0.776710033416748, "sampling/sampling_logp_difference/mean": 0.010845985263586044, "step": 1893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 823.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 517.984375, "completions/mean_terminated_length": 517.984375, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.42932844161987305, "epoch": 2.321078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.011801092606572808, "kl": 0.01900407299399376, "learning_rate": 1.4986307228237267e-07, "loss": 0.0002, "num_tokens": 81269631.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7002314329147339, "sampling/importance_sampling_ratio/mean": 1.0000500679016113, "sampling/importance_sampling_ratio/min": 0.6368654370307922, "sampling/sampling_logp_difference/max": 0.5307643413543701, "sampling/sampling_logp_difference/mean": 0.012999720871448517, "step": 1894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 370.109375, "completions/mean_terminated_length": 370.109375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.5964108109474182, "epoch": 2.3223039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 0.7002544892499892, "kl": 0.0393139086663723, "learning_rate": 1.4935487856222723e-07, "loss": -0.017, "num_tokens": 81313734.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5871822834014893, "sampling/importance_sampling_ratio/mean": 0.9998951554298401, "sampling/importance_sampling_ratio/min": 0.6625707149505615, "sampling/sampling_logp_difference/max": 0.4619603157043457, "sampling/sampling_logp_difference/mean": 0.01718786731362343, "step": 1895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1350.0, "completions/max_terminated_length": 1350.0, "completions/mean_length": 463.078125, "completions/mean_terminated_length": 463.078125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.46597814559936523, "epoch": 2.323529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.4703095927374619, "kl": 0.019366290420293808, "learning_rate": 1.4884739662946445e-07, "loss": 0.0087, "num_tokens": 81364347.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.626587152481079, "sampling/importance_sampling_ratio/mean": 0.9997521042823792, "sampling/importance_sampling_ratio/min": 0.5870952606201172, "sampling/sampling_logp_difference/max": 0.5325682163238525, "sampling/sampling_logp_difference/mean": 0.014171315357089043, "step": 1896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 709.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 372.3125, "completions/mean_terminated_length": 372.3125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.5080406665802002, "epoch": 2.3247549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 0.9190084521418191, "kl": 0.02354089543223381, "learning_rate": 1.4834062751424015e-07, "loss": -0.0092, "num_tokens": 81408495.0, "reward": 0.59375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.4605621099472046, "sampling/importance_sampling_ratio/mean": 0.9999968409538269, "sampling/importance_sampling_ratio/min": 0.6895461082458496, "sampling/sampling_logp_difference/max": 0.37882137298583984, "sampling/sampling_logp_difference/mean": 0.013807653449475765, "step": 1897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1032.0, "completions/max_terminated_length": 1032.0, "completions/mean_length": 433.3125, "completions/mean_terminated_length": 433.3125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.42273449897766113, "epoch": 2.325980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.5066349376637795, "kl": 0.016125258058309555, "learning_rate": 1.478345722452639e-07, "loss": 0.0116, "num_tokens": 81452387.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6368629932403564, "sampling/importance_sampling_ratio/mean": 0.9999611973762512, "sampling/importance_sampling_ratio/min": 0.6627088189125061, "sampling/sampling_logp_difference/max": 0.4927816390991211, "sampling/sampling_logp_difference/mean": 0.013126641511917114, "step": 1898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 338.390625, "completions/mean_terminated_length": 338.390625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.5990315675735474, "epoch": 2.327205882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.8755497149476074, "kl": 0.033962443470954895, "learning_rate": 1.4732923184979562e-07, "loss": 0.0588, "num_tokens": 81491340.0, "reward": 0.46875, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.3651323318481445, "sampling/importance_sampling_ratio/mean": 1.0004661083221436, "sampling/importance_sampling_ratio/min": 0.7270824909210205, "sampling/sampling_logp_difference/max": 0.31871533393859863, "sampling/sampling_logp_difference/mean": 0.015933841466903687, "step": 1899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 319.96875, "completions/mean_terminated_length": 319.96875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.42200833559036255, "epoch": 2.3284313725490198, "frac_reward_zero_std": 1.0, "grad_norm": 0.01273842118837621, "kl": 0.017037106677889824, "learning_rate": 1.4682460735364422e-07, "loss": 0.0002, "num_tokens": 81528314.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4352003335952759, "sampling/importance_sampling_ratio/mean": 0.9999521970748901, "sampling/importance_sampling_ratio/min": 0.656382143497467, "sampling/sampling_logp_difference/max": 0.42101216316223145, "sampling/sampling_logp_difference/mean": 0.01339042093604803, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 362.5, "completions/mean_terminated_length": 362.5, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.3876264691352844, "epoch": 2.329656862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.7778714713488962, "kl": 0.014745159074664116, "learning_rate": 1.4632069978116584e-07, "loss": 0.0741, "num_tokens": 81568538.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6273435354232788, "sampling/importance_sampling_ratio/mean": 1.0000070333480835, "sampling/importance_sampling_ratio/min": 0.7111874222755432, "sampling/sampling_logp_difference/max": 0.48694896697998047, "sampling/sampling_logp_difference/mean": 0.012377627193927765, "step": 1901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 353.59375, "completions/mean_terminated_length": 353.59375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.47989633679389954, "epoch": 2.3308823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.05305492071103238, "kl": 0.02151230350136757, "learning_rate": 1.4581751015526033e-07, "loss": 0.0002, "num_tokens": 81608176.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.548877239227295, "sampling/importance_sampling_ratio/mean": 0.9999960660934448, "sampling/importance_sampling_ratio/min": 0.6249411702156067, "sampling/sampling_logp_difference/max": 0.47009778022766113, "sampling/sampling_logp_difference/mean": 0.016015376895666122, "step": 1902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 342.5, "completions/mean_terminated_length": 342.5, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.5457208156585693, "epoch": 2.332107843137255, "frac_reward_zero_std": 0.5, "grad_norm": 0.9840966394616979, "kl": 0.03484099358320236, "learning_rate": 1.4531503949737106e-07, "loss": 0.0163, "num_tokens": 81650384.0, "reward": 0.65625, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.805199384689331, "sampling/importance_sampling_ratio/mean": 1.0001747608184814, "sampling/importance_sampling_ratio/min": 0.6707562804222107, "sampling/sampling_logp_difference/max": 0.5906710624694824, "sampling/sampling_logp_difference/mean": 0.015776898711919785, "step": 1903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 323.390625, "completions/mean_terminated_length": 323.390625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.38706842064857483, "epoch": 2.3333333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 0.6376376819749032, "kl": 0.024471405893564224, "learning_rate": 1.4481328882748184e-07, "loss": 0.0173, "num_tokens": 81686297.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4044983386993408, "sampling/importance_sampling_ratio/mean": 1.00010085105896, "sampling/importance_sampling_ratio/min": 0.5881037712097168, "sampling/sampling_logp_difference/max": 0.5308518409729004, "sampling/sampling_logp_difference/mean": 0.013258093036711216, "step": 1904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1985.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 501.3125, "completions/mean_terminated_length": 501.3125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.40238529443740845, "epoch": 2.3345588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 0.6348946168597556, "kl": 0.016115091741085052, "learning_rate": 1.4431225916411455e-07, "loss": -0.1293, "num_tokens": 81733229.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4515106678009033, "sampling/importance_sampling_ratio/mean": 1.0002467632293701, "sampling/importance_sampling_ratio/min": 0.5325195789337158, "sampling/sampling_logp_difference/max": 0.6301356554031372, "sampling/sampling_logp_difference/mean": 0.014102410525083542, "step": 1905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 949.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 443.4375, "completions/mean_terminated_length": 443.4375, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.5406038761138916, "epoch": 2.3357843137254903, "frac_reward_zero_std": 0.5, "grad_norm": 0.7619431390043069, "kl": 0.041395582258701324, "learning_rate": 1.4381195152432769e-07, "loss": -0.0352, "num_tokens": 81783737.0, "reward": 0.34375, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.430187702178955, "sampling/importance_sampling_ratio/mean": 0.9999947547912598, "sampling/importance_sampling_ratio/min": 0.6791586875915527, "sampling/sampling_logp_difference/max": 0.3869004249572754, "sampling/sampling_logp_difference/mean": 0.014999499544501305, "step": 1906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 313.828125, "completions/mean_terminated_length": 313.828125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.4056278169155121, "epoch": 2.3370098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 0.6589249416381466, "kl": 0.02160082757472992, "learning_rate": 1.4331236692371384e-07, "loss": 0.0197, "num_tokens": 81817934.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.4135538339614868, "sampling/importance_sampling_ratio/mean": 1.001006841659546, "sampling/importance_sampling_ratio/min": 0.6748111248016357, "sampling/sampling_logp_difference/max": 0.3933224678039551, "sampling/sampling_logp_difference/mean": 0.013541575521230698, "step": 1907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 358.34375, "completions/mean_terminated_length": 358.34375, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.5621259808540344, "epoch": 2.338235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 0.9423367875603321, "kl": 0.028794050216674805, "learning_rate": 1.428135063763985e-07, "loss": -0.0171, "num_tokens": 81862260.0, "reward": 0.125, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.4547728300094604, "sampling/importance_sampling_ratio/mean": 1.0002079010009766, "sampling/importance_sampling_ratio/min": 0.6622359156608582, "sampling/sampling_logp_difference/max": 0.41213345527648926, "sampling/sampling_logp_difference/mean": 0.017942264676094055, "step": 1908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1075.0, "completions/max_terminated_length": 1075.0, "completions/mean_length": 441.578125, "completions/mean_terminated_length": 441.578125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.436422735452652, "epoch": 2.3394607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 0.5065812653857245, "kl": 0.017633922398090363, "learning_rate": 1.4231537089503675e-07, "loss": 0.0442, "num_tokens": 81909065.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.6054160594940186, "sampling/importance_sampling_ratio/mean": 1.0000746250152588, "sampling/importance_sampling_ratio/min": 0.6547437906265259, "sampling/sampling_logp_difference/max": 0.47338294982910156, "sampling/sampling_logp_difference/mean": 0.01333607081323862, "step": 1909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 426.890625, "completions/mean_terminated_length": 426.890625, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "entropy": 0.5196318626403809, "epoch": 2.340686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.5365061377330679, "kl": 0.024488627910614014, "learning_rate": 1.4181796149081194e-07, "loss": -0.0242, "num_tokens": 81957234.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.3032093048095703, "sampling/importance_sampling_ratio/mean": 1.0001232624053955, "sampling/importance_sampling_ratio/min": 0.5648259520530701, "sampling/sampling_logp_difference/max": 0.5712375640869141, "sampling/sampling_logp_difference/mean": 0.013844381086528301, "step": 1910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1119.0, "completions/max_terminated_length": 1119.0, "completions/mean_length": 342.4375, "completions/mean_terminated_length": 342.4375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.5264867544174194, "epoch": 2.3419117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 0.930684623336506, "kl": 0.03306477516889572, "learning_rate": 1.4132127917343394e-07, "loss": -0.0318, "num_tokens": 81995534.0, "reward": 0.28125, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.496412992477417, "sampling/importance_sampling_ratio/mean": 1.0001342296600342, "sampling/importance_sampling_ratio/min": 0.654703676700592, "sampling/sampling_logp_difference/max": 0.4235725402832031, "sampling/sampling_logp_difference/mean": 0.015924477949738503, "step": 1911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 396.359375, "completions/mean_terminated_length": 396.359375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.5733599662780762, "epoch": 2.343137254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.04809722180060566, "kl": 0.024001918733119965, "learning_rate": 1.4082532495113624e-07, "loss": 0.0003, "num_tokens": 82037845.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4813957214355469, "sampling/importance_sampling_ratio/mean": 1.0002244710922241, "sampling/importance_sampling_ratio/min": 0.36578041315078735, "sampling/sampling_logp_difference/max": 1.0057220458984375, "sampling/sampling_logp_difference/mean": 0.015295792371034622, "step": 1912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1226.0, "completions/max_terminated_length": 1226.0, "completions/mean_length": 436.34375, "completions/mean_terminated_length": 436.34375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.35125815868377686, "epoch": 2.344362745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.6336314664720251, "kl": 0.02051752805709839, "learning_rate": 1.4033009983067452e-07, "loss": -0.0161, "num_tokens": 82085291.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.4311373233795166, "sampling/importance_sampling_ratio/mean": 1.0002989768981934, "sampling/importance_sampling_ratio/min": 0.5685528516769409, "sampling/sampling_logp_difference/max": 0.5646609663963318, "sampling/sampling_logp_difference/mean": 0.010609139688313007, "step": 1913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1904.0, "completions/max_terminated_length": 1904.0, "completions/mean_length": 477.609375, "completions/mean_terminated_length": 477.609375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.5550570487976074, "epoch": 2.3455882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 0.5634146153350101, "kl": 0.02110566943883896, "learning_rate": 1.398356048173242e-07, "loss": -0.0517, "num_tokens": 82136066.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.435990571975708, "sampling/importance_sampling_ratio/mean": 0.9998867511749268, "sampling/importance_sampling_ratio/min": 0.039463676512241364, "sampling/sampling_logp_difference/max": 3.232374668121338, "sampling/sampling_logp_difference/mean": 0.015318287536501884, "step": 1914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1195.0, "completions/max_terminated_length": 1195.0, "completions/mean_length": 429.0, "completions/mean_terminated_length": 429.0, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.4633127450942993, "epoch": 2.346813725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.5415831445014825, "kl": 0.01862948015332222, "learning_rate": 1.3934184091487915e-07, "loss": -0.0132, "num_tokens": 82177938.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.6356689929962158, "sampling/importance_sampling_ratio/mean": 1.0001739263534546, "sampling/importance_sampling_ratio/min": 0.6622073650360107, "sampling/sampling_logp_difference/max": 0.49205195903778076, "sampling/sampling_logp_difference/mean": 0.013646750710904598, "step": 1915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 345.46875, "completions/mean_terminated_length": 345.46875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.337214857339859, "epoch": 2.3480392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.01030635417434807, "kl": 0.014944124966859818, "learning_rate": 1.3884880912564873e-07, "loss": 0.0001, "num_tokens": 82216048.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.54752516746521, "sampling/importance_sampling_ratio/mean": 0.9995877742767334, "sampling/importance_sampling_ratio/min": 0.6822071075439453, "sampling/sampling_logp_difference/max": 0.4366569519042969, "sampling/sampling_logp_difference/mean": 0.012185954488813877, "step": 1916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1408.0, "completions/max_terminated_length": 1408.0, "completions/mean_length": 502.765625, "completions/mean_terminated_length": 502.765625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.4186125099658966, "epoch": 2.349264705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.4209537753494363, "kl": 0.026717543601989746, "learning_rate": 1.3835651045045598e-07, "loss": 0.0216, "num_tokens": 82262337.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6125074625015259, "sampling/importance_sampling_ratio/mean": 1.0000169277191162, "sampling/importance_sampling_ratio/min": 0.5952411890029907, "sampling/sampling_logp_difference/max": 0.5187886953353882, "sampling/sampling_logp_difference/mean": 0.01331581175327301, "step": 1917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 373.609375, "completions/mean_terminated_length": 373.609375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.47760245203971863, "epoch": 2.3504901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.5895712864710998, "kl": 0.0241969283670187, "learning_rate": 1.3786494588863633e-07, "loss": 0.002, "num_tokens": 82313224.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.316162347793579, "sampling/importance_sampling_ratio/mean": 0.999843955039978, "sampling/importance_sampling_ratio/min": 0.6407266855239868, "sampling/sampling_logp_difference/max": 0.44515228271484375, "sampling/sampling_logp_difference/mean": 0.014207405969500542, "step": 1918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1382.0, "completions/max_terminated_length": 1382.0, "completions/mean_length": 470.875, "completions/mean_terminated_length": 470.875, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "entropy": 0.44518813490867615, "epoch": 2.3517156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.015449140473401037, "kl": 0.01825772598385811, "learning_rate": 1.3737411643803448e-07, "loss": 0.0002, "num_tokens": 82361376.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6273078918457031, "sampling/importance_sampling_ratio/mean": 0.9998447895050049, "sampling/importance_sampling_ratio/min": 0.7175968289375305, "sampling/sampling_logp_difference/max": 0.4869270324707031, "sampling/sampling_logp_difference/mean": 0.01415756531059742, "step": 1919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1196.0, "completions/max_terminated_length": 1196.0, "completions/mean_length": 536.8125, "completions/mean_terminated_length": 536.8125, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "entropy": 0.4904722273349762, "epoch": 2.3529411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.01334837107179096, "kl": 0.016029274091124535, "learning_rate": 1.368840230950035e-07, "loss": 0.0002, "num_tokens": 82415636.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5277618169784546, "sampling/importance_sampling_ratio/mean": 1.0001848936080933, "sampling/importance_sampling_ratio/min": 0.6455166339874268, "sampling/sampling_logp_difference/max": 0.43770432472229004, "sampling/sampling_logp_difference/mean": 0.01455366425216198, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2585.0, "completions/max_terminated_length": 2585.0, "completions/mean_length": 564.96875, "completions/mean_terminated_length": 564.96875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.42360424995422363, "epoch": 2.3541666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 0.3369087179417857, "kl": 0.01597588136792183, "learning_rate": 1.3639466685440132e-07, "loss": -0.0025, "num_tokens": 82469442.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4340269565582275, "sampling/importance_sampling_ratio/mean": 0.9999595880508423, "sampling/importance_sampling_ratio/min": 0.464097261428833, "sampling/sampling_logp_difference/max": 0.7676610946655273, "sampling/sampling_logp_difference/mean": 0.01366984099149704, "step": 1921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 773.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 336.15625, "completions/mean_terminated_length": 336.15625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.42670902609825134, "epoch": 2.355392156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.010481769784497313, "kl": 0.017240509390830994, "learning_rate": 1.3590604870959043e-07, "loss": 0.0002, "num_tokens": 82507052.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7753360271453857, "sampling/importance_sampling_ratio/mean": 1.0000429153442383, "sampling/importance_sampling_ratio/min": 0.4967246353626251, "sampling/sampling_logp_difference/max": 0.6997194290161133, "sampling/sampling_logp_difference/mean": 0.014858750626444817, "step": 1922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1454.0, "completions/max_terminated_length": 1454.0, "completions/mean_length": 439.9375, "completions/mean_terminated_length": 439.9375, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.49666106700897217, "epoch": 2.3566176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 0.7889949382999659, "kl": 0.023360323160886765, "learning_rate": 1.3541816965243462e-07, "loss": -0.0239, "num_tokens": 82556952.0, "reward": 0.125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.633039116859436, "sampling/importance_sampling_ratio/mean": 0.9998592138290405, "sampling/importance_sampling_ratio/min": 0.6721773743629456, "sampling/sampling_logp_difference/max": 0.49044275283813477, "sampling/sampling_logp_difference/mean": 0.014301173388957977, "step": 1923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 302.421875, "completions/mean_terminated_length": 302.421875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.43797069787979126, "epoch": 2.357843137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.016207693037122065, "kl": 0.021593241021037102, "learning_rate": 1.3493103067329737e-07, "loss": 0.0002, "num_tokens": 82594611.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995329976081848, "sampling/importance_sampling_ratio/min": 0.48236799240112305, "sampling/sampling_logp_difference/max": 0.7290480136871338, "sampling/sampling_logp_difference/mean": 0.015570458024740219, "step": 1924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1155.0, "completions/max_terminated_length": 1155.0, "completions/mean_length": 449.0, "completions/mean_terminated_length": 449.0, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.4141286611557007, "epoch": 2.3590686274509802, "frac_reward_zero_std": 0.75, "grad_norm": 0.5364210338855576, "kl": 0.016784466803073883, "learning_rate": 1.3444463276104012e-07, "loss": 0.0232, "num_tokens": 82642323.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.625780701637268, "sampling/importance_sampling_ratio/mean": 1.000063180923462, "sampling/importance_sampling_ratio/min": 0.6876872181892395, "sampling/sampling_logp_difference/max": 0.48598814010620117, "sampling/sampling_logp_difference/mean": 0.013419031165540218, "step": 1925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1140.0, "completions/max_terminated_length": 1140.0, "completions/mean_length": 497.359375, "completions/mean_terminated_length": 497.359375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.4604421555995941, "epoch": 2.360294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 0.6721180162934672, "kl": 0.021408379077911377, "learning_rate": 1.3395897690301966e-07, "loss": 0.0524, "num_tokens": 82694650.0, "reward": 0.71875, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.8545819520950317, "sampling/importance_sampling_ratio/mean": 0.9998385906219482, "sampling/importance_sampling_ratio/min": 0.5843457579612732, "sampling/sampling_logp_difference/max": 0.617659330368042, "sampling/sampling_logp_difference/mean": 0.012493420392274857, "step": 1926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 882.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 407.375, "completions/mean_terminated_length": 407.375, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "entropy": 0.39005160331726074, "epoch": 2.361519607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.014609835171513187, "kl": 0.014927485957741737, "learning_rate": 1.3347406408508694e-07, "loss": 0.0001, "num_tokens": 82736738.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4414186477661133, "sampling/importance_sampling_ratio/mean": 1.0002464056015015, "sampling/importance_sampling_ratio/min": 0.6262653470039368, "sampling/sampling_logp_difference/max": 0.46798110008239746, "sampling/sampling_logp_difference/mean": 0.012397952377796173, "step": 1927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 441.34375, "completions/mean_terminated_length": 441.34375, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "entropy": 0.4327707886695862, "epoch": 2.3627450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.40515521330838095, "kl": 0.019305048510432243, "learning_rate": 1.3298989529158378e-07, "loss": 0.0079, "num_tokens": 82785816.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4754092693328857, "sampling/importance_sampling_ratio/mean": 0.9996547698974609, "sampling/importance_sampling_ratio/min": 0.5128060579299927, "sampling/sampling_logp_difference/max": 0.6678575277328491, "sampling/sampling_logp_difference/mean": 0.012377699837088585, "step": 1928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 783.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 309.421875, "completions/mean_terminated_length": 309.421875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.4763091206550598, "epoch": 2.363970588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.5645741585361078, "kl": 0.03080044314265251, "learning_rate": 1.325064715053425e-07, "loss": 0.0133, "num_tokens": 82819715.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5989562273025513, "sampling/importance_sampling_ratio/mean": 1.000138282775879, "sampling/importance_sampling_ratio/min": 0.7028736472129822, "sampling/sampling_logp_difference/max": 0.46935105323791504, "sampling/sampling_logp_difference/mean": 0.015919024124741554, "step": 1929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1129.0, "completions/max_terminated_length": 1129.0, "completions/mean_length": 469.796875, "completions/mean_terminated_length": 469.796875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.567400336265564, "epoch": 2.3651960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 0.5306044361395967, "kl": 0.022124525159597397, "learning_rate": 1.320237937076825e-07, "loss": 0.0517, "num_tokens": 82868230.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.671268105506897, "sampling/importance_sampling_ratio/mean": 1.0002734661102295, "sampling/importance_sampling_ratio/min": 0.453867107629776, "sampling/sampling_logp_difference/max": 0.7899508476257324, "sampling/sampling_logp_difference/mean": 0.016789911314845085, "step": 1930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/max_terminated_length": 610.0, "completions/mean_length": 334.34375, "completions/mean_terminated_length": 334.34375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.5247470140457153, "epoch": 2.366421568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.6266037354082921, "kl": 0.035015106201171875, "learning_rate": 1.3154186287840946e-07, "loss": -0.0082, "num_tokens": 82908764.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.47061288356781, "sampling/importance_sampling_ratio/mean": 0.9999035596847534, "sampling/importance_sampling_ratio/min": 0.546299397945404, "sampling/sampling_logp_difference/max": 0.6045880317687988, "sampling/sampling_logp_difference/mean": 0.017248084768652916, "step": 1931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 299.140625, "completions/mean_terminated_length": 299.140625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.5438364744186401, "epoch": 2.3676470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.8874004347344296, "kl": 0.029977945610880852, "learning_rate": 1.310606799958122e-07, "loss": 0.024, "num_tokens": 82949717.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5047268867492676, "sampling/importance_sampling_ratio/mean": 0.9995934963226318, "sampling/importance_sampling_ratio/min": 0.6620468497276306, "sampling/sampling_logp_difference/max": 0.4124189615249634, "sampling/sampling_logp_difference/mean": 0.01792212948203087, "step": 1932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1107.0, "completions/max_terminated_length": 1107.0, "completions/mean_length": 463.3125, "completions/mean_terminated_length": 463.3125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.44134005904197693, "epoch": 2.368872549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.013188091860439677, "kl": 0.01885836385190487, "learning_rate": 1.305802460366615e-07, "loss": 0.0002, "num_tokens": 83004105.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.697698950767517, "sampling/importance_sampling_ratio/mean": 1.000068187713623, "sampling/importance_sampling_ratio/min": 0.536325216293335, "sampling/sampling_logp_difference/max": 0.6230145692825317, "sampling/sampling_logp_difference/mean": 0.013347489759325981, "step": 1933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 940.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 325.453125, "completions/mean_terminated_length": 325.453125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.5035533905029297, "epoch": 2.3700980392156863, "frac_reward_zero_std": 0.5, "grad_norm": 0.9629218026003786, "kl": 0.026623355224728584, "learning_rate": 1.3010056197620812e-07, "loss": 0.0633, "num_tokens": 83044710.0, "reward": 0.625, "reward_std": 0.5, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003809928894043, "sampling/importance_sampling_ratio/min": 0.6176416873931885, "sampling/sampling_logp_difference/max": 0.8088681697845459, "sampling/sampling_logp_difference/mean": 0.014561382122337818, "step": 1934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 427.890625, "completions/mean_terminated_length": 427.890625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.6198945045471191, "epoch": 2.3713235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 0.6265942600248017, "kl": 0.03964344784617424, "learning_rate": 1.2962162878817985e-07, "loss": 0.0225, "num_tokens": 83094031.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6095978021621704, "sampling/importance_sampling_ratio/mean": 1.0002694129943848, "sampling/importance_sampling_ratio/min": 0.7216180562973022, "sampling/sampling_logp_difference/max": 0.4759843349456787, "sampling/sampling_logp_difference/mean": 0.01716119796037674, "step": 1935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1143.0, "completions/max_terminated_length": 1143.0, "completions/mean_length": 470.53125, "completions/mean_terminated_length": 470.53125, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.4696858823299408, "epoch": 2.372549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 0.6610694754965053, "kl": 0.026524493470788002, "learning_rate": 1.2914344744478112e-07, "loss": -0.046, "num_tokens": 83143185.0, "reward": 0.09375, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.588683843612671, "sampling/importance_sampling_ratio/mean": 0.9998884201049805, "sampling/importance_sampling_ratio/min": 0.5266839861869812, "sampling/sampling_logp_difference/max": 0.6411545276641846, "sampling/sampling_logp_difference/mean": 0.014217715710401535, "step": 1936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 939.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 414.375, "completions/mean_terminated_length": 414.375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.37194323539733887, "epoch": 2.373774509803922, "frac_reward_zero_std": 0.75, "grad_norm": 0.5860548811892626, "kl": 0.01606004685163498, "learning_rate": 1.2866601891668942e-07, "loss": 0.0156, "num_tokens": 83185977.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5593074560165405, "sampling/importance_sampling_ratio/mean": 1.0000301599502563, "sampling/importance_sampling_ratio/min": 0.10829772800207138, "sampling/sampling_logp_difference/max": 2.2228710651397705, "sampling/sampling_logp_difference/mean": 0.012457143515348434, "step": 1937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 298.9375, "completions/mean_terminated_length": 298.9375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.38066744804382324, "epoch": 2.375, "frac_reward_zero_std": 1.0, "grad_norm": 0.012453541951508211, "kl": 0.01765270344913006, "learning_rate": 1.2818934417305477e-07, "loss": 0.0002, "num_tokens": 83222341.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.561669111251831, "sampling/importance_sampling_ratio/mean": 0.9997491836547852, "sampling/importance_sampling_ratio/min": 0.4807349145412445, "sampling/sampling_logp_difference/max": 0.7324392795562744, "sampling/sampling_logp_difference/mean": 0.013048812747001648, "step": 1938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1108.0, "completions/max_terminated_length": 1108.0, "completions/mean_length": 525.0625, "completions/mean_terminated_length": 525.0625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.4148012399673462, "epoch": 2.376225490196078, "frac_reward_zero_std": 0.75, "grad_norm": 0.5226129774004293, "kl": 0.013392365537583828, "learning_rate": 1.2771342418149656e-07, "loss": 0.0205, "num_tokens": 83277001.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.643626093864441, "sampling/importance_sampling_ratio/mean": 0.9999764561653137, "sampling/importance_sampling_ratio/min": 0.5069830417633057, "sampling/sampling_logp_difference/max": 0.6792776584625244, "sampling/sampling_logp_difference/mean": 0.013566777110099792, "step": 1939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 483.515625, "completions/mean_terminated_length": 483.515625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.5535456538200378, "epoch": 2.377450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.012950137363427339, "kl": 0.023586241528391838, "learning_rate": 1.2723825990810204e-07, "loss": 0.0002, "num_tokens": 83326650.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4781988859176636, "sampling/importance_sampling_ratio/mean": 0.9995971918106079, "sampling/importance_sampling_ratio/min": 0.7274662256240845, "sampling/sampling_logp_difference/max": 0.39082443714141846, "sampling/sampling_logp_difference/mean": 0.015115633606910706, "step": 1940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 291.0, "completions/mean_terminated_length": 291.0, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.35967642068862915, "epoch": 2.3786764705882355, "frac_reward_zero_std": 0.75, "grad_norm": 0.6565935649722928, "kl": 0.016120584681630135, "learning_rate": 1.2676385231742494e-07, "loss": 0.0048, "num_tokens": 83361818.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4351998567581177, "sampling/importance_sampling_ratio/mean": 0.9997794032096863, "sampling/importance_sampling_ratio/min": 0.6482198238372803, "sampling/sampling_logp_difference/max": 0.4335254430770874, "sampling/sampling_logp_difference/mean": 0.012509386986494064, "step": 1941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 882.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 421.8125, "completions/mean_terminated_length": 421.8125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.40179866552352905, "epoch": 2.3799019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.015531550218540309, "kl": 0.019553551450371742, "learning_rate": 1.262902023724824e-07, "loss": 0.0002, "num_tokens": 83405486.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5020666122436523, "sampling/importance_sampling_ratio/mean": 1.0003321170806885, "sampling/importance_sampling_ratio/min": 0.7190918922424316, "sampling/sampling_logp_difference/max": 0.40684187412261963, "sampling/sampling_logp_difference/mean": 0.012800442054867744, "step": 1942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 481.21875, "completions/mean_terminated_length": 481.21875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.6042367815971375, "epoch": 2.381127450980392, "frac_reward_zero_std": 0.25, "grad_norm": 0.7920806867499003, "kl": 0.03523586690425873, "learning_rate": 1.258173110347538e-07, "loss": -0.0084, "num_tokens": 83465452.0, "reward": 0.15625, "reward_std": 0.5539814233779907, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.8818029165267944, "sampling/importance_sampling_ratio/mean": 0.9999896287918091, "sampling/importance_sampling_ratio/min": 0.473527729511261, "sampling/sampling_logp_difference/max": 0.7475447654724121, "sampling/sampling_logp_difference/mean": 0.01622064970433712, "step": 1943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 495.21875, "completions/mean_terminated_length": 495.21875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.48733392357826233, "epoch": 2.3823529411764706, "frac_reward_zero_std": 0.25, "grad_norm": 0.7384312200452469, "kl": 0.024198034778237343, "learning_rate": 1.253451792641785e-07, "loss": -0.0366, "num_tokens": 83516442.0, "reward": 0.0625, "reward_std": 0.5501632690429688, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4676287174224854, "sampling/importance_sampling_ratio/mean": 0.9999654293060303, "sampling/importance_sampling_ratio/min": 0.6392839550971985, "sampling/sampling_logp_difference/max": 0.4474066495895386, "sampling/sampling_logp_difference/mean": 0.013512559235095978, "step": 1944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 322.5, "completions/mean_terminated_length": 322.5, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.5885299444198608, "epoch": 2.383578431372549, "frac_reward_zero_std": 0.5, "grad_norm": 0.8746252032611431, "kl": 0.039881352335214615, "learning_rate": 1.248738080191543e-07, "loss": 0.0039, "num_tokens": 83551978.0, "reward": 0.59375, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.479042410850525, "sampling/importance_sampling_ratio/mean": 1.000295639038086, "sampling/importance_sampling_ratio/min": 0.6970458626747131, "sampling/sampling_logp_difference/max": 0.39139485359191895, "sampling/sampling_logp_difference/mean": 0.0175015926361084, "step": 1945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1108.0, "completions/max_terminated_length": 1108.0, "completions/mean_length": 425.25, "completions/mean_terminated_length": 425.25, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.39189720153808594, "epoch": 2.3848039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.010365245168953072, "kl": 0.013703888282179832, "learning_rate": 1.244031982565349e-07, "loss": 0.0001, "num_tokens": 83593786.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002193450927734, "sampling/importance_sampling_ratio/min": 0.7028541564941406, "sampling/sampling_logp_difference/max": 0.8234677314758301, "sampling/sampling_logp_difference/mean": 0.012265820987522602, "step": 1946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 373.0, "completions/mean_terminated_length": 373.0, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.6489561796188354, "epoch": 2.386029411764706, "frac_reward_zero_std": 0.25, "grad_norm": 1.0724185541324023, "kl": 0.05380932241678238, "learning_rate": 1.239333509316281e-07, "loss": -0.0135, "num_tokens": 83638314.0, "reward": 0.59375, "reward_std": 0.6305378675460815, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.443424940109253, "sampling/importance_sampling_ratio/mean": 0.999824583530426, "sampling/importance_sampling_ratio/min": 0.6058589220046997, "sampling/sampling_logp_difference/max": 0.5011081695556641, "sampling/sampling_logp_difference/mean": 0.01842574216425419, "step": 1947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1073.0, "completions/max_terminated_length": 1073.0, "completions/mean_length": 395.484375, "completions/mean_terminated_length": 395.484375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.48375093936920166, "epoch": 2.3872549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 0.7712001375342862, "kl": 0.024104923009872437, "learning_rate": 1.2346426699819456e-07, "loss": -0.0049, "num_tokens": 83682137.0, "reward": 0.625, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.4239355325698853, "sampling/importance_sampling_ratio/mean": 1.000038504600525, "sampling/importance_sampling_ratio/min": 0.615952730178833, "sampling/sampling_logp_difference/max": 0.4845850467681885, "sampling/sampling_logp_difference/mean": 0.014453334733843803, "step": 1948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 345.9375, "completions/mean_terminated_length": 345.9375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.3828203082084656, "epoch": 2.388480392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.012213791155909695, "kl": 0.015690211206674576, "learning_rate": 1.2299594740844476e-07, "loss": 0.0002, "num_tokens": 83721157.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8598827123641968, "sampling/importance_sampling_ratio/mean": 1.0002186298370361, "sampling/importance_sampling_ratio/min": 0.6275500059127808, "sampling/sampling_logp_difference/max": 0.6205134391784668, "sampling/sampling_logp_difference/mean": 0.012958644889295101, "step": 1949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 309.703125, "completions/mean_terminated_length": 309.703125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.44220462441444397, "epoch": 2.389705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.015123848642791428, "kl": 0.01829487457871437, "learning_rate": 1.225283931130378e-07, "loss": 0.0002, "num_tokens": 83755858.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5203416347503662, "sampling/importance_sampling_ratio/mean": 0.999682605266571, "sampling/importance_sampling_ratio/min": 0.6525866389274597, "sampling/sampling_logp_difference/max": 0.4268113374710083, "sampling/sampling_logp_difference/mean": 0.014723929576575756, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 293.234375, "completions/mean_terminated_length": 293.234375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.4920468330383301, "epoch": 2.3909313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 0.6066248198650801, "kl": 0.03059178963303566, "learning_rate": 1.220616050610791e-07, "loss": 0.0225, "num_tokens": 83791553.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6088899374008179, "sampling/importance_sampling_ratio/mean": 1.0000323057174683, "sampling/importance_sampling_ratio/min": 0.481278657913208, "sampling/sampling_logp_difference/max": 0.7313089370727539, "sampling/sampling_logp_difference/mean": 0.015592854470014572, "step": 1951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 352.859375, "completions/mean_terminated_length": 352.859375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.48866981267929077, "epoch": 2.392156862745098, "frac_reward_zero_std": 0.5, "grad_norm": 0.9622418320397272, "kl": 0.020563479512929916, "learning_rate": 1.2159558420011905e-07, "loss": 0.0162, "num_tokens": 83832296.0, "reward": 0.28125, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.6021250486373901, "sampling/importance_sampling_ratio/mean": 0.9997910261154175, "sampling/importance_sampling_ratio/min": 0.5527560114860535, "sampling/sampling_logp_difference/max": 0.5928385257720947, "sampling/sampling_logp_difference/mean": 0.015054437331855297, "step": 1952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 480.796875, "completions/mean_terminated_length": 480.796875, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "entropy": 0.40421730279922485, "epoch": 2.3933823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.011012953710840927, "kl": 0.01527397707104683, "learning_rate": 1.2113033147615071e-07, "loss": 0.0001, "num_tokens": 83877867.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001039505004883, "sampling/importance_sampling_ratio/min": 0.6383181214332581, "sampling/sampling_logp_difference/max": 0.709916353225708, "sampling/sampling_logp_difference/mean": 0.012514806352555752, "step": 1953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 341.515625, "completions/mean_terminated_length": 341.515625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.33605849742889404, "epoch": 2.394607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 0.5888600478080172, "kl": 0.015921292826533318, "learning_rate": 1.206658478336071e-07, "loss": -0.0034, "num_tokens": 83916572.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.2960000038146973, "sampling/importance_sampling_ratio/mean": 0.9999550580978394, "sampling/importance_sampling_ratio/min": 0.7630901336669922, "sampling/sampling_logp_difference/max": 0.27037906646728516, "sampling/sampling_logp_difference/mean": 0.010849649086594582, "step": 1954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 357.96875, "completions/mean_terminated_length": 357.96875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.4200955629348755, "epoch": 2.3958333333333335, "frac_reward_zero_std": 1.0, "grad_norm": 0.011576873204748752, "kl": 0.017061855643987656, "learning_rate": 1.2020213421536103e-07, "loss": 0.0002, "num_tokens": 83956442.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4019628763198853, "sampling/importance_sampling_ratio/mean": 1.0002577304840088, "sampling/importance_sampling_ratio/min": 0.6298449039459229, "sampling/sampling_logp_difference/max": 0.4622817039489746, "sampling/sampling_logp_difference/mean": 0.013865157030522823, "step": 1955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 332.015625, "completions/mean_terminated_length": 332.015625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.5905846953392029, "epoch": 2.3970588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 0.6496519697082195, "kl": 0.029236093163490295, "learning_rate": 1.1973919156272138e-07, "loss": -0.0042, "num_tokens": 84000971.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.9626052379608154, "sampling/importance_sampling_ratio/mean": 1.0003597736358643, "sampling/importance_sampling_ratio/min": 0.6558524966239929, "sampling/sampling_logp_difference/max": 0.6742727756500244, "sampling/sampling_logp_difference/mean": 0.018735207617282867, "step": 1956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 402.171875, "completions/mean_terminated_length": 402.171875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.40596336126327515, "epoch": 2.3982843137254903, "frac_reward_zero_std": 1.0, "grad_norm": 0.010825437716148145, "kl": 0.015355302020907402, "learning_rate": 1.1927702081543278e-07, "loss": 0.0001, "num_tokens": 84045062.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7731739282608032, "sampling/importance_sampling_ratio/mean": 0.9997348785400391, "sampling/importance_sampling_ratio/min": 0.6547006964683533, "sampling/sampling_logp_difference/max": 0.5727710723876953, "sampling/sampling_logp_difference/mean": 0.013082662597298622, "step": 1957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1117.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 458.375, "completions/mean_terminated_length": 458.375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.47841325402259827, "epoch": 2.3995098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.01242717453210583, "kl": 0.018083244562149048, "learning_rate": 1.188156229116724e-07, "loss": 0.0002, "num_tokens": 84102478.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6092060804367065, "sampling/importance_sampling_ratio/mean": 0.9998587369918823, "sampling/importance_sampling_ratio/min": 0.6359979510307312, "sampling/sampling_logp_difference/max": 0.475740909576416, "sampling/sampling_logp_difference/mean": 0.014743911102414131, "step": 1958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1131.0, "completions/max_terminated_length": 1131.0, "completions/mean_length": 378.703125, "completions/mean_terminated_length": 378.703125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.4250997006893158, "epoch": 2.400735294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.5053975455450029, "kl": 0.018992707133293152, "learning_rate": 1.1835499878804861e-07, "loss": -0.0231, "num_tokens": 84144971.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5670037269592285, "sampling/importance_sampling_ratio/mean": 1.0002152919769287, "sampling/importance_sampling_ratio/min": 0.6132686734199524, "sampling/sampling_logp_difference/max": 0.4889521598815918, "sampling/sampling_logp_difference/mean": 0.013721524737775326, "step": 1959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 339.515625, "completions/mean_terminated_length": 339.515625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.5941302180290222, "epoch": 2.4019607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 0.6270882527407999, "kl": 0.0241215992718935, "learning_rate": 1.1789514937959965e-07, "loss": 0.0585, "num_tokens": 84181996.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5971468687057495, "sampling/importance_sampling_ratio/mean": 0.9996210336685181, "sampling/importance_sampling_ratio/min": 0.6610053181648254, "sampling/sampling_logp_difference/max": 0.4682188034057617, "sampling/sampling_logp_difference/mean": 0.01830323226749897, "step": 1960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 355.65625, "completions/mean_terminated_length": 355.65625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.5715056657791138, "epoch": 2.403186274509804, "frac_reward_zero_std": 0.5, "grad_norm": 0.9029189495177907, "kl": 0.02521122619509697, "learning_rate": 1.1743607561979013e-07, "loss": 0.0577, "num_tokens": 84224182.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.529356598854065, "sampling/importance_sampling_ratio/mean": 1.0000426769256592, "sampling/importance_sampling_ratio/min": 0.561181366443634, "sampling/sampling_logp_difference/max": 0.5777111053466797, "sampling/sampling_logp_difference/mean": 0.01561686396598816, "step": 1961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 309.46875, "completions/mean_terminated_length": 309.46875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.4311138391494751, "epoch": 2.4044117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.015260444012059646, "kl": 0.02365969493985176, "learning_rate": 1.1697777844051104e-07, "loss": 0.0002, "num_tokens": 84260852.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.507345199584961, "sampling/importance_sampling_ratio/mean": 1.0000296831130981, "sampling/importance_sampling_ratio/min": 0.586681604385376, "sampling/sampling_logp_difference/max": 0.5332729816436768, "sampling/sampling_logp_difference/mean": 0.014585047960281372, "step": 1962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 757.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 426.359375, "completions/mean_terminated_length": 426.359375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.45570623874664307, "epoch": 2.405637254901961, "frac_reward_zero_std": 0.5, "grad_norm": 0.7203673187630456, "kl": 0.018121860921382904, "learning_rate": 1.1652025877207644e-07, "loss": -0.0051, "num_tokens": 84304859.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003349781036377, "sampling/importance_sampling_ratio/min": 0.6385962963104248, "sampling/sampling_logp_difference/max": 0.7132596969604492, "sampling/sampling_logp_difference/mean": 0.014654138125479221, "step": 1963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 277.90625, "completions/mean_terminated_length": 277.90625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.4025183320045471, "epoch": 2.406862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.01345736418747929, "kl": 0.018178176134824753, "learning_rate": 1.1606351754322247e-07, "loss": 0.0002, "num_tokens": 84337733.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7738994359970093, "sampling/importance_sampling_ratio/mean": 0.999103307723999, "sampling/importance_sampling_ratio/min": 0.6182993650436401, "sampling/sampling_logp_difference/max": 0.5731801986694336, "sampling/sampling_logp_difference/mean": 0.013351919129490852, "step": 1964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 447.296875, "completions/mean_terminated_length": 447.296875, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.6502946615219116, "epoch": 2.4080882352941178, "frac_reward_zero_std": 0.5, "grad_norm": 0.6887547321126138, "kl": 0.02872980386018753, "learning_rate": 1.156075556811048e-07, "loss": 0.0416, "num_tokens": 84385816.0, "reward": 0.28125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.4204468727111816, "sampling/importance_sampling_ratio/mean": 1.0000734329223633, "sampling/importance_sampling_ratio/min": 0.6132689714431763, "sampling/sampling_logp_difference/max": 0.4889516830444336, "sampling/sampling_logp_difference/mean": 0.016781125217676163, "step": 1965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 836.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 358.296875, "completions/mean_terminated_length": 358.296875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.4580095708370209, "epoch": 2.409313725490196, "frac_reward_zero_std": 0.5, "grad_norm": 0.9455486460015117, "kl": 0.026967685669660568, "learning_rate": 1.1515237411129697e-07, "loss": 0.0837, "num_tokens": 84429003.0, "reward": -0.0625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.5654895305633545, "sampling/importance_sampling_ratio/mean": 0.9999490976333618, "sampling/importance_sampling_ratio/min": 0.6179013848304749, "sampling/sampling_logp_difference/max": 0.481426477432251, "sampling/sampling_logp_difference/mean": 0.015200131572782993, "step": 1966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 282.6875, "completions/mean_terminated_length": 282.6875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.47256535291671753, "epoch": 2.4105392156862746, "frac_reward_zero_std": 0.5, "grad_norm": 1.1367573312147081, "kl": 0.04878377914428711, "learning_rate": 1.1469797375778901e-07, "loss": 0.0422, "num_tokens": 84459895.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6531935930252075, "sampling/importance_sampling_ratio/mean": 1.0002590417861938, "sampling/importance_sampling_ratio/min": 0.7786315083503723, "sampling/sampling_logp_difference/max": 0.502708911895752, "sampling/sampling_logp_difference/mean": 0.015278123319149017, "step": 1967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 445.46875, "completions/mean_terminated_length": 445.46875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.5364172458648682, "epoch": 2.411764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.4655726908467831, "kl": 0.021212035790085793, "learning_rate": 1.1424435554298473e-07, "loss": -0.014, "num_tokens": 84509861.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.461927056312561, "sampling/importance_sampling_ratio/mean": 0.9999554753303528, "sampling/importance_sampling_ratio/min": 0.6268531680107117, "sampling/sampling_logp_difference/max": 0.4670429229736328, "sampling/sampling_logp_difference/mean": 0.015546740964055061, "step": 1968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 431.6875, "completions/mean_terminated_length": 431.6875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.4576515555381775, "epoch": 2.4129901960784315, "frac_reward_zero_std": 0.5, "grad_norm": 0.641737247695371, "kl": 0.024225780740380287, "learning_rate": 1.1379152038770029e-07, "loss": -0.0171, "num_tokens": 84558737.0, "reward": 0.6875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.5594680309295654, "sampling/importance_sampling_ratio/mean": 0.9996718168258667, "sampling/importance_sampling_ratio/min": 0.30077919363975525, "sampling/sampling_logp_difference/max": 1.2013788223266602, "sampling/sampling_logp_difference/mean": 0.014101961627602577, "step": 1969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 318.9375, "completions/mean_terminated_length": 318.9375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.5023967027664185, "epoch": 2.4142156862745097, "frac_reward_zero_std": 0.75, "grad_norm": 0.6871604805762855, "kl": 0.05308526009321213, "learning_rate": 1.1333946921116234e-07, "loss": 0.0119, "num_tokens": 84595245.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5062084197998047, "sampling/importance_sampling_ratio/mean": 1.0007381439208984, "sampling/importance_sampling_ratio/min": 0.6940051317214966, "sampling/sampling_logp_difference/max": 0.4095954895019531, "sampling/sampling_logp_difference/mean": 0.015241207554936409, "step": 1970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1312.0, "completions/max_terminated_length": 1312.0, "completions/mean_length": 389.3125, "completions/mean_terminated_length": 389.3125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.533998966217041, "epoch": 2.4154411764705883, "frac_reward_zero_std": 0.25, "grad_norm": 1.0518993984942302, "kl": 0.029050540179014206, "learning_rate": 1.1288820293100637e-07, "loss": -0.0463, "num_tokens": 84639569.0, "reward": 0.625, "reward_std": 0.5997638702392578, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.3829379081726074, "sampling/importance_sampling_ratio/mean": 0.9998953342437744, "sampling/importance_sampling_ratio/min": 0.6733343601226807, "sampling/sampling_logp_difference/max": 0.3955133557319641, "sampling/sampling_logp_difference/mean": 0.014808299951255322, "step": 1971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 378.484375, "completions/mean_terminated_length": 378.484375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.487685889005661, "epoch": 2.4166666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 0.6940188115523186, "kl": 0.01919841393828392, "learning_rate": 1.1243772246327415e-07, "loss": 0.0404, "num_tokens": 84683152.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000375747680664, "sampling/importance_sampling_ratio/min": 0.6794466376304626, "sampling/sampling_logp_difference/max": 0.8285856246948242, "sampling/sampling_logp_difference/mean": 0.015777362510561943, "step": 1972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1170.0, "completions/max_terminated_length": 1170.0, "completions/mean_length": 508.484375, "completions/mean_terminated_length": 508.484375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.5331392884254456, "epoch": 2.417892156862745, "frac_reward_zero_std": 0.5, "grad_norm": 0.648780240236084, "kl": 0.029774140566587448, "learning_rate": 1.1198802872241242e-07, "loss": 0.0327, "num_tokens": 84735055.0, "reward": 0.78125, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.4775913953781128, "sampling/importance_sampling_ratio/mean": 0.9999371767044067, "sampling/importance_sampling_ratio/min": 0.6656587719917297, "sampling/sampling_logp_difference/max": 0.40697813034057617, "sampling/sampling_logp_difference/mean": 0.015564629808068275, "step": 1973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 239.84375, "completions/mean_terminated_length": 239.84375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.4706956744194031, "epoch": 2.4191176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.020918720823953896, "kl": 0.024563461542129517, "learning_rate": 1.1153912262127119e-07, "loss": 0.0002, "num_tokens": 84770485.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3793964385986328, "sampling/importance_sampling_ratio/mean": 1.0000771284103394, "sampling/importance_sampling_ratio/min": 0.6155158281326294, "sampling/sampling_logp_difference/max": 0.4852945804595947, "sampling/sampling_logp_difference/mean": 0.017223425209522247, "step": 1974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 331.515625, "completions/mean_terminated_length": 331.515625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.4195324778556824, "epoch": 2.420343137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.6283506339940682, "kl": 0.02260478213429451, "learning_rate": 1.1109100507110131e-07, "loss": -0.011, "num_tokens": 84805494.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.405151128768921, "sampling/importance_sampling_ratio/mean": 0.9998661279678345, "sampling/importance_sampling_ratio/min": 0.6427701115608215, "sampling/sampling_logp_difference/max": 0.4419682025909424, "sampling/sampling_logp_difference/mean": 0.013175224885344505, "step": 1975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1327.0, "completions/max_terminated_length": 1327.0, "completions/mean_length": 472.21875, "completions/mean_terminated_length": 472.21875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.5490034818649292, "epoch": 2.4215686274509802, "frac_reward_zero_std": 0.5, "grad_norm": 0.6514932112785615, "kl": 0.026515310630202293, "learning_rate": 1.1064367698155303e-07, "loss": -0.0474, "num_tokens": 84857636.0, "reward": 0.8125, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.4070497751235962, "sampling/importance_sampling_ratio/mean": 1.0001485347747803, "sampling/importance_sampling_ratio/min": 0.6861670613288879, "sampling/sampling_logp_difference/max": 0.3766341209411621, "sampling/sampling_logp_difference/mean": 0.014824079349637032, "step": 1976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 360.921875, "completions/mean_terminated_length": 360.921875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.4926827549934387, "epoch": 2.422794117647059, "frac_reward_zero_std": 0.75, "grad_norm": 0.6560712465641688, "kl": 0.03343046456575394, "learning_rate": 1.1019713926067392e-07, "loss": 0.0212, "num_tokens": 84898607.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.636217713356018, "sampling/importance_sampling_ratio/mean": 0.9998933672904968, "sampling/importance_sampling_ratio/min": 0.4851599633693695, "sampling/sampling_logp_difference/max": 0.7232766151428223, "sampling/sampling_logp_difference/mean": 0.01542038843035698, "step": 1977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1308.0, "completions/max_terminated_length": 1308.0, "completions/mean_length": 509.71875, "completions/mean_terminated_length": 509.71875, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "entropy": 0.44261425733566284, "epoch": 2.424019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.4922697364634103, "kl": 0.01535057369619608, "learning_rate": 1.0975139281490747e-07, "loss": 0.0416, "num_tokens": 84950061.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000272274017334, "sampling/importance_sampling_ratio/min": 0.677406907081604, "sampling/sampling_logp_difference/max": 0.6979579925537109, "sampling/sampling_logp_difference/mean": 0.013310705311596394, "step": 1978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 313.078125, "completions/mean_terminated_length": 313.078125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.4785287380218506, "epoch": 2.4252450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.01861313240601166, "kl": 0.041793063282966614, "learning_rate": 1.093064385490906e-07, "loss": 0.0003, "num_tokens": 84984450.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5744246244430542, "sampling/importance_sampling_ratio/mean": 1.0002551078796387, "sampling/importance_sampling_ratio/min": 0.7007641196250916, "sampling/sampling_logp_difference/max": 0.4538898468017578, "sampling/sampling_logp_difference/mean": 0.01603473350405693, "step": 1979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 807.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 433.390625, "completions/mean_terminated_length": 433.390625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.5196254849433899, "epoch": 2.426470588235294, "frac_reward_zero_std": 0.25, "grad_norm": 1.0220309062021433, "kl": 0.029817555099725723, "learning_rate": 1.0886227736645215e-07, "loss": 0.0942, "num_tokens": 85034907.0, "reward": 0.4375, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6293957233428955, "sampling/importance_sampling_ratio/mean": 1.0004093647003174, "sampling/importance_sampling_ratio/min": 0.4055778682231903, "sampling/sampling_logp_difference/max": 0.902442455291748, "sampling/sampling_logp_difference/mean": 0.015421854332089424, "step": 1980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 320.015625, "completions/mean_terminated_length": 320.015625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.585490345954895, "epoch": 2.4276960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 0.7461472978009546, "kl": 0.03756077587604523, "learning_rate": 1.0841891016861155e-07, "loss": -0.033, "num_tokens": 85075980.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.4916131496429443, "sampling/importance_sampling_ratio/mean": 0.9997145533561707, "sampling/importance_sampling_ratio/min": 0.46694234013557434, "sampling/sampling_logp_difference/max": 0.7615494728088379, "sampling/sampling_logp_difference/mean": 0.017761684954166412, "step": 1981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 369.140625, "completions/mean_terminated_length": 369.140625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.501172661781311, "epoch": 2.428921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 0.9405912990699224, "kl": 0.021085578948259354, "learning_rate": 1.0797633785557581e-07, "loss": 0.0814, "num_tokens": 85121157.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.998932123184204, "sampling/importance_sampling_ratio/mean": 1.0000585317611694, "sampling/importance_sampling_ratio/min": 0.6178486347198486, "sampling/sampling_logp_difference/max": 0.6926131248474121, "sampling/sampling_logp_difference/mean": 0.014894163236021996, "step": 1982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 357.203125, "completions/mean_terminated_length": 357.203125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.46825286746025085, "epoch": 2.4301470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.8730786584142741, "kl": 0.021637093275785446, "learning_rate": 1.0753456132573885e-07, "loss": -0.0112, "num_tokens": 85164434.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.456916093826294, "sampling/importance_sampling_ratio/mean": 1.0002673864364624, "sampling/importance_sampling_ratio/min": 0.6589239835739136, "sampling/sampling_logp_difference/max": 0.41714704036712646, "sampling/sampling_logp_difference/mean": 0.014391673728823662, "step": 1983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1239.0, "completions/max_terminated_length": 1239.0, "completions/mean_length": 450.296875, "completions/mean_terminated_length": 450.296875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.49084752798080444, "epoch": 2.431372549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.01802119355913802, "kl": 0.024190880358219147, "learning_rate": 1.0709358147587883e-07, "loss": 0.0002, "num_tokens": 85212917.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000331401824951, "sampling/importance_sampling_ratio/min": 0.6096634268760681, "sampling/sampling_logp_difference/max": 0.8758214712142944, "sampling/sampling_logp_difference/mean": 0.01589093543589115, "step": 1984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 918.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 457.4375, "completions/mean_terminated_length": 457.4375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.4673923850059509, "epoch": 2.4325980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.42948706017886756, "kl": 0.019798021763563156, "learning_rate": 1.0665339920115718e-07, "loss": -0.0024, "num_tokens": 85259393.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5906530618667603, "sampling/importance_sampling_ratio/mean": 0.9998549818992615, "sampling/importance_sampling_ratio/min": 0.4619346261024475, "sampling/sampling_logp_difference/max": 0.772331953048706, "sampling/sampling_logp_difference/mean": 0.014150416478514671, "step": 1985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 372.828125, "completions/mean_terminated_length": 372.828125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.6488343477249146, "epoch": 2.4338235294117645, "frac_reward_zero_std": 0.25, "grad_norm": 1.0415791833924524, "kl": 0.027483340352773666, "learning_rate": 1.0621401539511587e-07, "loss": -0.0358, "num_tokens": 85306102.0, "reward": 0.3125, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.9107699394226074, "sampling/importance_sampling_ratio/mean": 1.0000032186508179, "sampling/importance_sampling_ratio/min": 0.40564581751823425, "sampling/sampling_logp_difference/max": 0.9022748470306396, "sampling/sampling_logp_difference/mean": 0.018678754568099976, "step": 1986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 304.09375, "completions/mean_terminated_length": 304.09375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.43980756402015686, "epoch": 2.435049019607843, "frac_reward_zero_std": 0.5, "grad_norm": 1.1526108269668827, "kl": 0.02129153534770012, "learning_rate": 1.0577543094967611e-07, "loss": 0.0475, "num_tokens": 85343084.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.3902510404586792, "sampling/importance_sampling_ratio/mean": 0.9998450875282288, "sampling/importance_sampling_ratio/min": 0.6954975724220276, "sampling/sampling_logp_difference/max": 0.3631277084350586, "sampling/sampling_logp_difference/mean": 0.014378875494003296, "step": 1987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 313.59375, "completions/mean_terminated_length": 313.59375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.4850468635559082, "epoch": 2.436274509803922, "frac_reward_zero_std": 0.75, "grad_norm": 0.5763412422824403, "kl": 0.022260908037424088, "learning_rate": 1.053376467551368e-07, "loss": 0.0027, "num_tokens": 85380258.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.613181471824646, "sampling/importance_sampling_ratio/mean": 0.9997431635856628, "sampling/importance_sampling_ratio/min": 0.4194561243057251, "sampling/sampling_logp_difference/max": 0.8687963485717773, "sampling/sampling_logp_difference/mean": 0.016065046191215515, "step": 1988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 317.203125, "completions/mean_terminated_length": 317.203125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.41928476095199585, "epoch": 2.4375, "frac_reward_zero_std": 0.75, "grad_norm": 0.62613805792808, "kl": 0.022574733942747116, "learning_rate": 1.0490066370017181e-07, "loss": -0.0155, "num_tokens": 85417439.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.291561245918274, "sampling/importance_sampling_ratio/mean": 0.9999145269393921, "sampling/importance_sampling_ratio/min": 0.5894016027450562, "sampling/sampling_logp_difference/max": 0.5286475419998169, "sampling/sampling_logp_difference/mean": 0.012769601307809353, "step": 1989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 330.46875, "completions/mean_terminated_length": 330.46875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.29442059993743896, "epoch": 2.438725490196078, "frac_reward_zero_std": 1.0, "grad_norm": 0.01139014393870863, "kl": 0.01789858564734459, "learning_rate": 1.044644826718295e-07, "loss": 0.0001, "num_tokens": 85461277.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4176517724990845, "sampling/importance_sampling_ratio/mean": 0.999839723110199, "sampling/importance_sampling_ratio/min": 0.11942283809185028, "sampling/sampling_logp_difference/max": 2.12508487701416, "sampling/sampling_logp_difference/mean": 0.011954793706536293, "step": 1990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 845.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 385.734375, "completions/mean_terminated_length": 385.734375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.4688987731933594, "epoch": 2.439950980392157, "frac_reward_zero_std": 0.5, "grad_norm": 0.6870107744221775, "kl": 0.02938472293317318, "learning_rate": 1.0402910455552916e-07, "loss": -0.0119, "num_tokens": 85504332.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.437279462814331, "sampling/importance_sampling_ratio/mean": 0.999961256980896, "sampling/importance_sampling_ratio/min": 0.626309871673584, "sampling/sampling_logp_difference/max": 0.4679100513458252, "sampling/sampling_logp_difference/mean": 0.014665831811726093, "step": 1991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 333.1875, "completions/mean_terminated_length": 333.1875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.5642496347427368, "epoch": 2.4411764705882355, "frac_reward_zero_std": 0.25, "grad_norm": 0.9858199980146067, "kl": 0.05451158806681633, "learning_rate": 1.0359453023506121e-07, "loss": 0.0194, "num_tokens": 85541400.0, "reward": 0.375, "reward_std": 0.690913200378418, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.3438199758529663, "sampling/importance_sampling_ratio/mean": 1.000030755996704, "sampling/importance_sampling_ratio/min": 0.7312501668930054, "sampling/sampling_logp_difference/max": 0.3129997253417969, "sampling/sampling_logp_difference/mean": 0.015983374789357185, "step": 1992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 432.09375, "completions/mean_terminated_length": 432.09375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.4295406937599182, "epoch": 2.4424019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.4653592175155032, "kl": 0.016976531594991684, "learning_rate": 1.0316076059258389e-07, "loss": 0.0148, "num_tokens": 85586558.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998765587806702, "sampling/importance_sampling_ratio/min": 0.6993035674095154, "sampling/sampling_logp_difference/max": 0.8511209487915039, "sampling/sampling_logp_difference/mean": 0.012966765090823174, "step": 1993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 316.375, "completions/mean_terminated_length": 316.375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.3935540020465851, "epoch": 2.443627450980392, "frac_reward_zero_std": 1.0, "grad_norm": 0.015278082743988924, "kl": 0.016327252611517906, "learning_rate": 1.0272779650862185e-07, "loss": 0.0002, "num_tokens": 85626454.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6078556776046753, "sampling/importance_sampling_ratio/mean": 0.9999628067016602, "sampling/importance_sampling_ratio/min": 0.6182855367660522, "sampling/sampling_logp_difference/max": 0.4808049201965332, "sampling/sampling_logp_difference/mean": 0.013829727657139301, "step": 1994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1247.0, "completions/max_terminated_length": 1247.0, "completions/mean_length": 448.75, "completions/mean_terminated_length": 448.75, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.6225670576095581, "epoch": 2.4448529411764706, "frac_reward_zero_std": 0.25, "grad_norm": 1.0167980014035065, "kl": 0.022273242473602295, "learning_rate": 1.0229563886206516e-07, "loss": 0.0469, "num_tokens": 85674518.0, "reward": 0.84375, "reward_std": 0.46656501293182373, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.516396403312683, "sampling/importance_sampling_ratio/mean": 1.0002257823944092, "sampling/importance_sampling_ratio/min": 0.6300704479217529, "sampling/sampling_logp_difference/max": 0.46192359924316406, "sampling/sampling_logp_difference/mean": 0.01818186044692993, "step": 1995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 330.953125, "completions/mean_terminated_length": 330.953125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.3701726198196411, "epoch": 2.446078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.5641071571559076, "kl": 0.016050096601247787, "learning_rate": 1.0186428853016604e-07, "loss": -0.0004, "num_tokens": 85717075.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4000227451324463, "sampling/importance_sampling_ratio/mean": 1.0003139972686768, "sampling/importance_sampling_ratio/min": 0.5325406789779663, "sampling/sampling_logp_difference/max": 0.6300959587097168, "sampling/sampling_logp_difference/mean": 0.012161776423454285, "step": 1996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 286.640625, "completions/mean_terminated_length": 286.640625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.4820958375930786, "epoch": 2.4473039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.018436112938405358, "kl": 0.024329040199518204, "learning_rate": 1.0143374638853891e-07, "loss": 0.0002, "num_tokens": 85751420.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.473806381225586, "sampling/importance_sampling_ratio/mean": 1.0001040697097778, "sampling/importance_sampling_ratio/min": 0.6437638401985168, "sampling/sampling_logp_difference/max": 0.44042330980300903, "sampling/sampling_logp_difference/mean": 0.016319390386343002, "step": 1997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 379.28125, "completions/mean_terminated_length": 379.28125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.5071786046028137, "epoch": 2.448529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.7865527058288164, "kl": 0.03031349368393421, "learning_rate": 1.0100401331115638e-07, "loss": -0.0113, "num_tokens": 85793646.0, "reward": 0.53125, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5977063179016113, "sampling/importance_sampling_ratio/mean": 0.9996448755264282, "sampling/importance_sampling_ratio/min": 0.47493472695350647, "sampling/sampling_logp_difference/max": 0.7445778846740723, "sampling/sampling_logp_difference/mean": 0.015055624768137932, "step": 1998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 289.875, "completions/mean_terminated_length": 289.875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.5452965497970581, "epoch": 2.4497549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.020548579088536342, "kl": 0.03412887454032898, "learning_rate": 1.0057509017034977e-07, "loss": 0.0003, "num_tokens": 85830246.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4040255546569824, "sampling/importance_sampling_ratio/mean": 0.9999415278434753, "sampling/importance_sampling_ratio/min": 0.6098306179046631, "sampling/sampling_logp_difference/max": 0.49457406997680664, "sampling/sampling_logp_difference/mean": 0.016989685595035553, "step": 1999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 353.796875, "completions/mean_terminated_length": 353.796875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.42378807067871094, "epoch": 2.450980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.016716804667639713, "kl": 0.01980084553360939, "learning_rate": 1.001469778368057e-07, "loss": 0.0002, "num_tokens": 85868377.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.696582555770874, "sampling/importance_sampling_ratio/mean": 1.0001220703125, "sampling/importance_sampling_ratio/min": 0.4982512891292572, "sampling/sampling_logp_difference/max": 0.6966507434844971, "sampling/sampling_logp_difference/mean": 0.01371418870985508, "step": 2000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 376.3125, "completions/mean_terminated_length": 376.3125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.4694760739803314, "epoch": 2.452205882352941, "frac_reward_zero_std": 0.75, "grad_norm": 0.6557356544551449, "kl": 0.024077005684375763, "learning_rate": 9.971967717956531e-08, "loss": -0.0069, "num_tokens": 85920029.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4907902479171753, "sampling/importance_sampling_ratio/mean": 1.0001784563064575, "sampling/importance_sampling_ratio/min": 0.7196665406227112, "sampling/sampling_logp_difference/max": 0.3993062973022461, "sampling/sampling_logp_difference/mean": 0.01402963325381279, "step": 2001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 413.453125, "completions/mean_terminated_length": 413.453125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.5380818843841553, "epoch": 2.4534313725490198, "frac_reward_zero_std": 0.5, "grad_norm": 0.7517089446000808, "kl": 0.019972864538431168, "learning_rate": 9.929318906602174e-08, "loss": 0.016, "num_tokens": 85962714.0, "reward": 0.65625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.371732234954834, "sampling/importance_sampling_ratio/mean": 0.9998036623001099, "sampling/importance_sampling_ratio/min": 0.6546944975852966, "sampling/sampling_logp_difference/max": 0.4235866069793701, "sampling/sampling_logp_difference/mean": 0.014928067103028297, "step": 2002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 444.796875, "completions/mean_terminated_length": 444.796875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.4962307810783386, "epoch": 2.454656862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.5303655553596743, "kl": 0.018403185531497, "learning_rate": 9.886751436191871e-08, "loss": 0.0129, "num_tokens": 86010973.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.5796191692352295, "sampling/importance_sampling_ratio/mean": 1.0003600120544434, "sampling/importance_sampling_ratio/min": 0.6775802373886108, "sampling/sampling_logp_difference/max": 0.457183837890625, "sampling/sampling_logp_difference/mean": 0.013720303773880005, "step": 2003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 318.96875, "completions/mean_terminated_length": 318.96875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.4439535439014435, "epoch": 2.4558823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 0.8373497801250678, "kl": 0.024745387956500053, "learning_rate": 9.844265393134926e-08, "loss": 0.0053, "num_tokens": 86050955.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7894337177276611, "sampling/importance_sampling_ratio/mean": 1.0000526905059814, "sampling/importance_sampling_ratio/min": 0.5988035798072815, "sampling/sampling_logp_difference/max": 0.5818991661071777, "sampling/sampling_logp_difference/mean": 0.015376938506960869, "step": 2004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1068.0, "completions/max_terminated_length": 1068.0, "completions/mean_length": 409.328125, "completions/mean_terminated_length": 409.328125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.5542207360267639, "epoch": 2.457107843137255, "frac_reward_zero_std": 0.5, "grad_norm": 0.8433198570866792, "kl": 0.029978584498167038, "learning_rate": 9.801860863675266e-08, "loss": -0.0836, "num_tokens": 86097056.0, "reward": 0.40625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5598045587539673, "sampling/importance_sampling_ratio/mean": 1.000291109085083, "sampling/importance_sampling_ratio/min": 0.6248463988304138, "sampling/sampling_logp_difference/max": 0.4702494144439697, "sampling/sampling_logp_difference/mean": 0.015643076971173286, "step": 2005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1047.0, "completions/max_terminated_length": 1047.0, "completions/mean_length": 365.640625, "completions/mean_terminated_length": 365.640625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.5193289518356323, "epoch": 2.4583333333333335, "frac_reward_zero_std": 0.5, "grad_norm": 0.9419861723494732, "kl": 0.02031654119491577, "learning_rate": 9.759537933891421e-08, "loss": 0.0003, "num_tokens": 86136649.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997302293777466, "sampling/importance_sampling_ratio/min": 0.34090930223464966, "sampling/sampling_logp_difference/max": 1.0761388540267944, "sampling/sampling_logp_difference/mean": 0.016678113490343094, "step": 2006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 758.0, "completions/max_terminated_length": 758.0, "completions/mean_length": 329.90625, "completions/mean_terminated_length": 329.90625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.6356875896453857, "epoch": 2.4595588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 0.47303073986640765, "kl": 0.058628521859645844, "learning_rate": 9.71729668969628e-08, "loss": 0.0112, "num_tokens": 86175731.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.62908935546875, "sampling/importance_sampling_ratio/mean": 0.9995545148849487, "sampling/importance_sampling_ratio/min": 0.6177362203598022, "sampling/sampling_logp_difference/max": 0.4880211353302002, "sampling/sampling_logp_difference/mean": 0.01814216561615467, "step": 2007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 339.75, "completions/mean_terminated_length": 339.75, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.49132227897644043, "epoch": 2.4607843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 0.9937923130962172, "kl": 0.020931560546159744, "learning_rate": 9.67513721683687e-08, "loss": 0.0321, "num_tokens": 86212915.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4570294618606567, "sampling/importance_sampling_ratio/mean": 0.9996726512908936, "sampling/importance_sampling_ratio/min": 0.6199607849121094, "sampling/sampling_logp_difference/max": 0.47809910774230957, "sampling/sampling_logp_difference/mean": 0.015716366469860077, "step": 2008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1242.0, "completions/max_terminated_length": 1242.0, "completions/mean_length": 396.625, "completions/mean_terminated_length": 396.625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.44943007826805115, "epoch": 2.4620098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 0.4066835415933353, "kl": 0.018481794744729996, "learning_rate": 9.633059600894256e-08, "loss": 0.003, "num_tokens": 86263355.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5980671644210815, "sampling/importance_sampling_ratio/mean": 1.0000193119049072, "sampling/importance_sampling_ratio/min": 0.4835300147533417, "sampling/sampling_logp_difference/max": 0.7266418933868408, "sampling/sampling_logp_difference/mean": 0.014273881912231445, "step": 2009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 367.703125, "completions/mean_terminated_length": 367.703125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.39827412366867065, "epoch": 2.463235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.020434433094970776, "kl": 0.014825995080173016, "learning_rate": 9.59106392728331e-08, "loss": 0.0001, "num_tokens": 86306824.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5400842428207397, "sampling/importance_sampling_ratio/mean": 0.9999579191207886, "sampling/importance_sampling_ratio/min": 0.6974018216133118, "sampling/sampling_logp_difference/max": 0.4318370819091797, "sampling/sampling_logp_difference/mean": 0.012138715013861656, "step": 2010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 350.40625, "completions/mean_terminated_length": 350.40625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.44719359278678894, "epoch": 2.4644607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 0.6254866489912777, "kl": 0.020384060218930244, "learning_rate": 9.549150281252632e-08, "loss": 0.0316, "num_tokens": 86351410.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5814623832702637, "sampling/importance_sampling_ratio/mean": 1.0003173351287842, "sampling/importance_sampling_ratio/min": 0.5096350908279419, "sampling/sampling_logp_difference/max": 0.6740604043006897, "sampling/sampling_logp_difference/mean": 0.014734165742993355, "step": 2011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 337.84375, "completions/mean_terminated_length": 337.84375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.5456031560897827, "epoch": 2.465686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.5809893743691794, "kl": 0.030397456139326096, "learning_rate": 9.507318747884241e-08, "loss": 0.0144, "num_tokens": 86393128.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.5805305242538452, "sampling/importance_sampling_ratio/mean": 0.9998569488525391, "sampling/importance_sampling_ratio/min": 0.6933793425559998, "sampling/sampling_logp_difference/max": 0.4577605724334717, "sampling/sampling_logp_difference/mean": 0.01505983155220747, "step": 2012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 299.78125, "completions/mean_terminated_length": 299.78125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.45628994703292847, "epoch": 2.4669117647058822, "frac_reward_zero_std": 0.75, "grad_norm": 0.660090400005466, "kl": 0.028354834765195847, "learning_rate": 9.465569412093488e-08, "loss": 0.0214, "num_tokens": 86427882.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.6207854747772217, "sampling/importance_sampling_ratio/mean": 0.9999902844429016, "sampling/importance_sampling_ratio/min": 0.6005659103393555, "sampling/sampling_logp_difference/max": 0.509882926940918, "sampling/sampling_logp_difference/mean": 0.015328515321016312, "step": 2013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1303.0, "completions/max_terminated_length": 1303.0, "completions/mean_length": 525.390625, "completions/mean_terminated_length": 525.390625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.4515731632709503, "epoch": 2.468137254901961, "frac_reward_zero_std": 0.5, "grad_norm": 0.6367387910686444, "kl": 0.02471562847495079, "learning_rate": 9.423902358628916e-08, "loss": 0.0057, "num_tokens": 86486419.0, "reward": 0.40625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5216177701950073, "sampling/importance_sampling_ratio/mean": 0.9998372793197632, "sampling/importance_sampling_ratio/min": 0.34379613399505615, "sampling/sampling_logp_difference/max": 1.0677063465118408, "sampling/sampling_logp_difference/mean": 0.013594096526503563, "step": 2014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 907.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 350.203125, "completions/mean_terminated_length": 350.203125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.5142543315887451, "epoch": 2.469362745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.6491632217855435, "kl": 0.029163051396608353, "learning_rate": 9.382317672071966e-08, "loss": 0.057, "num_tokens": 86522320.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.421674132347107, "sampling/importance_sampling_ratio/mean": 0.9997380375862122, "sampling/importance_sampling_ratio/min": 0.6368654370307922, "sampling/sampling_logp_difference/max": 0.45119690895080566, "sampling/sampling_logp_difference/mean": 0.015587743371725082, "step": 2015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 364.21875, "completions/mean_terminated_length": 364.21875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.5109447240829468, "epoch": 2.4705882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.016047827356925344, "kl": 0.04043231159448624, "learning_rate": 9.340815436836963e-08, "loss": 0.0003, "num_tokens": 86564174.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6353731155395508, "sampling/importance_sampling_ratio/mean": 1.000037431716919, "sampling/importance_sampling_ratio/min": 0.6237004995346069, "sampling/sampling_logp_difference/max": 0.4918709993362427, "sampling/sampling_logp_difference/mean": 0.015851834788918495, "step": 2016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1255.0, "completions/max_terminated_length": 1255.0, "completions/mean_length": 498.328125, "completions/mean_terminated_length": 498.328125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.43960005044937134, "epoch": 2.471813725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.4786356747623876, "kl": 0.01450418308377266, "learning_rate": 9.299395737170757e-08, "loss": -0.0226, "num_tokens": 86613283.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.438014268875122, "sampling/importance_sampling_ratio/mean": 0.9997552037239075, "sampling/importance_sampling_ratio/min": 0.6056475043296814, "sampling/sampling_logp_difference/max": 0.5014572143554688, "sampling/sampling_logp_difference/mean": 0.012551134452223778, "step": 2017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 805.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 339.078125, "completions/mean_terminated_length": 339.078125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.36390385031700134, "epoch": 2.4730392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.010240107313606223, "kl": 0.016955044120550156, "learning_rate": 9.258058657152761e-08, "loss": 0.0002, "num_tokens": 86652296.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4973394870758057, "sampling/importance_sampling_ratio/mean": 0.9998985528945923, "sampling/importance_sampling_ratio/min": 0.6111265420913696, "sampling/sampling_logp_difference/max": 0.49245119094848633, "sampling/sampling_logp_difference/mean": 0.013622431084513664, "step": 2018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 963.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 362.140625, "completions/mean_terminated_length": 362.140625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.6089727878570557, "epoch": 2.474264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.8565716894949492, "kl": 0.035560283809900284, "learning_rate": 9.216804280694612e-08, "loss": 0.0171, "num_tokens": 86691905.0, "reward": 0.5, "reward_std": 0.4472135901451111, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.630021333694458, "sampling/importance_sampling_ratio/mean": 0.9999361038208008, "sampling/importance_sampling_ratio/min": 0.6929814219474792, "sampling/sampling_logp_difference/max": 0.48859310150146484, "sampling/sampling_logp_difference/mean": 0.016544349491596222, "step": 2019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1572.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 431.78125, "completions/mean_terminated_length": 431.78125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.4160088002681732, "epoch": 2.4754901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.6631210437738546, "kl": 0.02134557068347931, "learning_rate": 9.175632691540064e-08, "loss": 0.1171, "num_tokens": 86740659.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5508496761322021, "sampling/importance_sampling_ratio/mean": 1.000300645828247, "sampling/importance_sampling_ratio/min": 0.617247998714447, "sampling/sampling_logp_difference/max": 0.4824843406677246, "sampling/sampling_logp_difference/mean": 0.013543687760829926, "step": 2020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 204.1875, "completions/mean_terminated_length": 204.1875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.4063154458999634, "epoch": 2.4767156862745097, "frac_reward_zero_std": 0.75, "grad_norm": 0.9214348865823414, "kl": 0.027350233867764473, "learning_rate": 9.134543973264868e-08, "loss": 0.0095, "num_tokens": 86765167.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.7505618333816528, "sampling/importance_sampling_ratio/mean": 1.0002621412277222, "sampling/importance_sampling_ratio/min": 0.3617531359195709, "sampling/sampling_logp_difference/max": 1.0167932510375977, "sampling/sampling_logp_difference/mean": 0.01597100868821144, "step": 2021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 363.703125, "completions/mean_terminated_length": 363.703125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.5204824209213257, "epoch": 2.4779411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 0.6330342213646245, "kl": 0.028159357607364655, "learning_rate": 9.093538209276486e-08, "loss": 0.0061, "num_tokens": 86804172.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.3600409030914307, "sampling/importance_sampling_ratio/mean": 1.0001919269561768, "sampling/importance_sampling_ratio/min": 0.6167065501213074, "sampling/sampling_logp_difference/max": 0.48336195945739746, "sampling/sampling_logp_difference/mean": 0.015442202799022198, "step": 2022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 451.921875, "completions/mean_terminated_length": 451.921875, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "entropy": 0.39672720432281494, "epoch": 2.4791666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 0.4247053759631333, "kl": 0.01817684806883335, "learning_rate": 9.052615482814069e-08, "loss": -0.0053, "num_tokens": 86855639.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4088422060012817, "sampling/importance_sampling_ratio/mean": 0.999949038028717, "sampling/importance_sampling_ratio/min": 0.04446224868297577, "sampling/sampling_logp_difference/max": 3.113114833831787, "sampling/sampling_logp_difference/mean": 0.01228392869234085, "step": 2023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 411.609375, "completions/mean_terminated_length": 411.609375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.5559807419776917, "epoch": 2.480392156862745, "frac_reward_zero_std": 0.75, "grad_norm": 0.4731682313934922, "kl": 0.06053575128316879, "learning_rate": 9.011775876948096e-08, "loss": -0.0065, "num_tokens": 86897054.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6008760929107666, "sampling/importance_sampling_ratio/mean": 0.9995759129524231, "sampling/importance_sampling_ratio/min": 0.6097817420959473, "sampling/sampling_logp_difference/max": 0.49465417861938477, "sampling/sampling_logp_difference/mean": 0.01604539528489113, "step": 2024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 293.5, "completions/mean_terminated_length": 293.5, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.5029458403587341, "epoch": 2.4816176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 0.6989123224744958, "kl": 0.046888481825590134, "learning_rate": 8.971019474580427e-08, "loss": 0.0379, "num_tokens": 86929326.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6366273164749146, "sampling/importance_sampling_ratio/mean": 0.9995224475860596, "sampling/importance_sampling_ratio/min": 0.5858862996101379, "sampling/sampling_logp_difference/max": 0.5346295833587646, "sampling/sampling_logp_difference/mean": 0.016162948682904243, "step": 2025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1077.0, "completions/max_terminated_length": 1077.0, "completions/mean_length": 274.25, "completions/mean_terminated_length": 274.25, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.5471726059913635, "epoch": 2.482843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.7878903685936279, "kl": 0.03652939200401306, "learning_rate": 8.930346358443953e-08, "loss": 0.0142, "num_tokens": 86961550.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.624337911605835, "sampling/importance_sampling_ratio/mean": 1.000370740890503, "sampling/importance_sampling_ratio/min": 0.6373802423477173, "sampling/sampling_logp_difference/max": 0.48510026931762695, "sampling/sampling_logp_difference/mean": 0.017099354416131973, "step": 2026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 944.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 355.78125, "completions/mean_terminated_length": 355.78125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.5780984163284302, "epoch": 2.4840686274509802, "frac_reward_zero_std": 0.75, "grad_norm": 0.45016711179702085, "kl": 0.05154428631067276, "learning_rate": 8.889756611102539e-08, "loss": -0.0024, "num_tokens": 86998848.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4657893180847168, "sampling/importance_sampling_ratio/mean": 0.9996787905693054, "sampling/importance_sampling_ratio/min": 0.621524453163147, "sampling/sampling_logp_difference/max": 0.47557997703552246, "sampling/sampling_logp_difference/mean": 0.0173359215259552, "step": 2027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 323.5625, "completions/mean_terminated_length": 323.5625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.4153844714164734, "epoch": 2.485294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.013418917541488872, "kl": 0.023444563150405884, "learning_rate": 8.84925031495079e-08, "loss": 0.0002, "num_tokens": 87036756.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4129561185836792, "sampling/importance_sampling_ratio/mean": 1.000279188156128, "sampling/importance_sampling_ratio/min": 0.7018581628799438, "sampling/sampling_logp_difference/max": 0.35402393341064453, "sampling/sampling_logp_difference/mean": 0.013536701910197735, "step": 2028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 916.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 315.734375, "completions/mean_terminated_length": 315.734375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.5552902221679688, "epoch": 2.486519607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.7437886437418242, "kl": 0.027055341750383377, "learning_rate": 8.808827552213916e-08, "loss": 0.0584, "num_tokens": 87071347.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.3593207597732544, "sampling/importance_sampling_ratio/mean": 0.9996039867401123, "sampling/importance_sampling_ratio/min": 0.6067765355110168, "sampling/sampling_logp_difference/max": 0.49959468841552734, "sampling/sampling_logp_difference/mean": 0.018462490290403366, "step": 2029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1784.0, "completions/max_terminated_length": 1784.0, "completions/mean_length": 356.671875, "completions/mean_terminated_length": 356.671875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.4716532230377197, "epoch": 2.4877450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.8210343312107994, "kl": 0.020505931228399277, "learning_rate": 8.768488404947593e-08, "loss": 0.0187, "num_tokens": 87110558.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4435145854949951, "sampling/importance_sampling_ratio/mean": 0.9997718334197998, "sampling/importance_sampling_ratio/min": 0.6062548160552979, "sampling/sampling_logp_difference/max": 0.5004549026489258, "sampling/sampling_logp_difference/mean": 0.015678737312555313, "step": 2030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 406.34375, "completions/mean_terminated_length": 406.34375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.42823708057403564, "epoch": 2.488970588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.010895686063040338, "kl": 0.02187976986169815, "learning_rate": 8.728232955037696e-08, "loss": 0.0002, "num_tokens": 87153972.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5292953252792358, "sampling/importance_sampling_ratio/mean": 1.0002131462097168, "sampling/importance_sampling_ratio/min": 0.7123538851737976, "sampling/sampling_logp_difference/max": 0.424807071685791, "sampling/sampling_logp_difference/mean": 0.014209944754838943, "step": 2031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1027.0, "completions/max_terminated_length": 1027.0, "completions/mean_length": 382.234375, "completions/mean_terminated_length": 382.234375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.37749287486076355, "epoch": 2.4901960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 0.5097961108542397, "kl": 0.017999451607465744, "learning_rate": 8.688061284200265e-08, "loss": -0.0074, "num_tokens": 87197315.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.3670469522476196, "sampling/importance_sampling_ratio/mean": 0.9993981122970581, "sampling/importance_sampling_ratio/min": 0.5352298021316528, "sampling/sampling_logp_difference/max": 0.6250591278076172, "sampling/sampling_logp_difference/mean": 0.011993631720542908, "step": 2032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 340.5625, "completions/mean_terminated_length": 340.5625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.550229012966156, "epoch": 2.491421568627451, "frac_reward_zero_std": 0.5, "grad_norm": 0.7861460620885643, "kl": 0.038982708007097244, "learning_rate": 8.647973473981224e-08, "loss": -0.0236, "num_tokens": 87238759.0, "reward": 0.5, "reward_std": 0.5123475193977356, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4236258268356323, "sampling/importance_sampling_ratio/mean": 0.999970018863678, "sampling/importance_sampling_ratio/min": 0.6839099526405334, "sampling/sampling_logp_difference/max": 0.3799290657043457, "sampling/sampling_logp_difference/mean": 0.01614437624812126, "step": 2033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 269.234375, "completions/mean_terminated_length": 269.234375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.4905725121498108, "epoch": 2.4926470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.015265652470960693, "kl": 0.020448841154575348, "learning_rate": 8.607969605756315e-08, "loss": 0.0002, "num_tokens": 87274358.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.465522289276123, "sampling/importance_sampling_ratio/mean": 0.9999524354934692, "sampling/importance_sampling_ratio/min": 0.4325583577156067, "sampling/sampling_logp_difference/max": 0.8380380868911743, "sampling/sampling_logp_difference/mean": 0.01728798635303974, "step": 2034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 490.5625, "completions/mean_terminated_length": 490.5625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.4333847165107727, "epoch": 2.493872549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.00884990564412107, "kl": 0.014128869399428368, "learning_rate": 8.568049760730838e-08, "loss": 0.0001, "num_tokens": 87326906.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5987240076065063, "sampling/importance_sampling_ratio/mean": 1.00033438205719, "sampling/importance_sampling_ratio/min": 0.6275395750999451, "sampling/sampling_logp_difference/max": 0.4692058563232422, "sampling/sampling_logp_difference/mean": 0.014160026796162128, "step": 2035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1130.0, "completions/max_terminated_length": 1130.0, "completions/mean_length": 438.296875, "completions/mean_terminated_length": 438.296875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.5176718235015869, "epoch": 2.4950980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.5381725957440366, "kl": 0.02820497378706932, "learning_rate": 8.52821401993955e-08, "loss": 0.0141, "num_tokens": 87373837.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5457035303115845, "sampling/importance_sampling_ratio/mean": 0.999646008014679, "sampling/importance_sampling_ratio/min": 0.6466166377067566, "sampling/sampling_logp_difference/max": 0.4360017776489258, "sampling/sampling_logp_difference/mean": 0.01600518450140953, "step": 2036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 274.828125, "completions/mean_terminated_length": 274.828125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.4334583282470703, "epoch": 2.4963235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.0140140521032286, "kl": 0.019586438313126564, "learning_rate": 8.488462464246493e-08, "loss": 0.0002, "num_tokens": 87411714.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.753572940826416, "sampling/importance_sampling_ratio/mean": 1.0000697374343872, "sampling/importance_sampling_ratio/min": 0.6163353323936462, "sampling/sampling_logp_difference/max": 0.5616554021835327, "sampling/sampling_logp_difference/mean": 0.01596229150891304, "step": 2037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 265.21875, "completions/mean_terminated_length": 265.21875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.3989444375038147, "epoch": 2.497549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.015136612962161391, "kl": 0.02094714716076851, "learning_rate": 8.448795174344803e-08, "loss": 0.0002, "num_tokens": 87446192.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6439807415008545, "sampling/importance_sampling_ratio/mean": 0.999544620513916, "sampling/importance_sampling_ratio/min": 0.6035577058792114, "sampling/sampling_logp_difference/max": 0.5049135684967041, "sampling/sampling_logp_difference/mean": 0.014722573570907116, "step": 2038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 308.078125, "completions/mean_terminated_length": 308.078125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.482771635055542, "epoch": 2.498774509803922, "frac_reward_zero_std": 0.5, "grad_norm": 0.9680148845316057, "kl": 0.02789677493274212, "learning_rate": 8.409212230756563e-08, "loss": -0.0113, "num_tokens": 87480421.0, "reward": 0.15625, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.9701077938079834, "sampling/importance_sampling_ratio/mean": 1.0000520944595337, "sampling/importance_sampling_ratio/min": 0.6438773274421692, "sampling/sampling_logp_difference/max": 0.6780883073806763, "sampling/sampling_logp_difference/mean": 0.016131922602653503, "step": 2039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 395.0625, "completions/mean_terminated_length": 395.0625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.45095521211624146, "epoch": 2.5, "frac_reward_zero_std": 0.75, "grad_norm": 0.5670073393275998, "kl": 0.018941303715109825, "learning_rate": 8.369713713832622e-08, "loss": 0.0186, "num_tokens": 87525161.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.627264380455017, "sampling/importance_sampling_ratio/mean": 1.0004199743270874, "sampling/importance_sampling_ratio/min": 0.6629416942596436, "sampling/sampling_logp_difference/max": 0.48690032958984375, "sampling/sampling_logp_difference/mean": 0.013713760301470757, "step": 2040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 342.453125, "completions/mean_terminated_length": 342.453125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.5389746427536011, "epoch": 2.501225490196078, "frac_reward_zero_std": 0.75, "grad_norm": 0.5837901034452154, "kl": 0.059862080961465836, "learning_rate": 8.330299703752497e-08, "loss": -0.0087, "num_tokens": 87567958.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5519194602966309, "sampling/importance_sampling_ratio/mean": 0.9999290108680725, "sampling/importance_sampling_ratio/min": 0.5517097115516663, "sampling/sampling_logp_difference/max": 0.5947332382202148, "sampling/sampling_logp_difference/mean": 0.016790762543678284, "step": 2041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1152.0, "completions/max_terminated_length": 1152.0, "completions/mean_length": 431.234375, "completions/mean_terminated_length": 431.234375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.5475429892539978, "epoch": 2.502450980392157, "frac_reward_zero_std": 0.5, "grad_norm": 0.8776328767466868, "kl": 0.023793064057826996, "learning_rate": 8.290970280524124e-08, "loss": -0.0092, "num_tokens": 87610389.0, "reward": 0.0625, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4971222877502441, "sampling/importance_sampling_ratio/mean": 1.0001389980316162, "sampling/importance_sampling_ratio/min": 0.6191501021385193, "sampling/sampling_logp_difference/max": 0.47940754890441895, "sampling/sampling_logp_difference/mean": 0.016927681863307953, "step": 2042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 266.359375, "completions/mean_terminated_length": 266.359375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.3347460925579071, "epoch": 2.5036764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.011835677115041802, "kl": 0.014383486472070217, "learning_rate": 8.251725523983722e-08, "loss": 0.0001, "num_tokens": 87642972.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5071555376052856, "sampling/importance_sampling_ratio/mean": 0.9998353719711304, "sampling/importance_sampling_ratio/min": 0.6160424947738647, "sampling/sampling_logp_difference/max": 0.4844393730163574, "sampling/sampling_logp_difference/mean": 0.01316148042678833, "step": 2043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 354.34375, "completions/mean_terminated_length": 354.34375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.5239408612251282, "epoch": 2.5049019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.016010368308215964, "kl": 0.03826243802905083, "learning_rate": 8.212565513795683e-08, "loss": 0.0003, "num_tokens": 87685874.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.655517578125, "sampling/importance_sampling_ratio/mean": 1.000596046447754, "sampling/importance_sampling_ratio/min": 0.689724326133728, "sampling/sampling_logp_difference/max": 0.5041136741638184, "sampling/sampling_logp_difference/mean": 0.016394753009080887, "step": 2044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1263.0, "completions/max_terminated_length": 1263.0, "completions/mean_length": 382.6875, "completions/mean_terminated_length": 382.6875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.3677459955215454, "epoch": 2.506127450980392, "frac_reward_zero_std": 0.75, "grad_norm": 0.5754066431928363, "kl": 0.015317828394472599, "learning_rate": 8.173490329452343e-08, "loss": 0.0434, "num_tokens": 87728302.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4765123128890991, "sampling/importance_sampling_ratio/mean": 0.9997751712799072, "sampling/importance_sampling_ratio/min": 0.6086212396621704, "sampling/sampling_logp_difference/max": 0.49655914306640625, "sampling/sampling_logp_difference/mean": 0.011800028383731842, "step": 2045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2731.0, "completions/max_terminated_length": 2731.0, "completions/mean_length": 531.3125, "completions/mean_terminated_length": 531.3125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.5310277938842773, "epoch": 2.5073529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.677413830508819, "kl": 0.018100161105394363, "learning_rate": 8.13450005027384e-08, "loss": 0.1022, "num_tokens": 87779618.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.4389132261276245, "sampling/importance_sampling_ratio/mean": 1.0001132488250732, "sampling/importance_sampling_ratio/min": 0.6734963655471802, "sampling/sampling_logp_difference/max": 0.39527273178100586, "sampling/sampling_logp_difference/mean": 0.015197496861219406, "step": 2046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1294.0, "completions/max_terminated_length": 1294.0, "completions/mean_length": 427.703125, "completions/mean_terminated_length": 427.703125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.5487755537033081, "epoch": 2.508578431372549, "frac_reward_zero_std": 0.5, "grad_norm": 0.6767858313458931, "kl": 0.03194589167833328, "learning_rate": 8.09559475540797e-08, "loss": -0.0041, "num_tokens": 87825183.0, "reward": 0.09375, "reward_std": 0.497555673122406, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.561215877532959, "sampling/importance_sampling_ratio/mean": 0.9998584389686584, "sampling/importance_sampling_ratio/min": 0.4954179525375366, "sampling/sampling_logp_difference/max": 0.7023534774780273, "sampling/sampling_logp_difference/mean": 0.01574910432100296, "step": 2047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 276.546875, "completions/mean_terminated_length": 276.546875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.410917729139328, "epoch": 2.5098039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.01216430418676119, "kl": 0.013962316326797009, "learning_rate": 8.056774523830029e-08, "loss": 0.0001, "num_tokens": 87856194.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4370096921920776, "sampling/importance_sampling_ratio/mean": 0.9996622204780579, "sampling/importance_sampling_ratio/min": 0.6911449432373047, "sampling/sampling_logp_difference/max": 0.36940574645996094, "sampling/sampling_logp_difference/mean": 0.013986590318381786, "step": 2048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1235.0, "completions/max_terminated_length": 1235.0, "completions/mean_length": 420.71875, "completions/mean_terminated_length": 420.71875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.45769256353378296, "epoch": 2.5110294117647056, "frac_reward_zero_std": 1.0, "grad_norm": 0.01570795501799553, "kl": 0.019054913893342018, "learning_rate": 8.018039434342627e-08, "loss": 0.0002, "num_tokens": 87900576.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.881325602531433, "sampling/importance_sampling_ratio/mean": 0.9999811053276062, "sampling/importance_sampling_ratio/min": 0.539108157157898, "sampling/sampling_logp_difference/max": 0.6319766044616699, "sampling/sampling_logp_difference/mean": 0.014634935185313225, "step": 2049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 903.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 393.4375, "completions/mean_terminated_length": 393.4375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.4762817621231079, "epoch": 2.5122549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 0.6373027534618516, "kl": 0.022292546927928925, "learning_rate": 7.979389565575522e-08, "loss": 0.0165, "num_tokens": 87947740.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.559706449508667, "sampling/importance_sampling_ratio/mean": 0.9998424053192139, "sampling/importance_sampling_ratio/min": 0.4970056414604187, "sampling/sampling_logp_difference/max": 0.6991539001464844, "sampling/sampling_logp_difference/mean": 0.014698117971420288, "step": 2050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 291.296875, "completions/mean_terminated_length": 291.296875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.5661441087722778, "epoch": 2.513480392156863, "frac_reward_zero_std": 0.5, "grad_norm": 0.9728829473966704, "kl": 0.04979410395026207, "learning_rate": 7.940824995985528e-08, "loss": -0.0041, "num_tokens": 87982671.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6301367282867432, "sampling/importance_sampling_ratio/mean": 1.0000660419464111, "sampling/importance_sampling_ratio/min": 0.6788964867591858, "sampling/sampling_logp_difference/max": 0.488663911819458, "sampling/sampling_logp_difference/mean": 0.016453657299280167, "step": 2051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 357.1875, "completions/mean_terminated_length": 357.1875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.4110541343688965, "epoch": 2.514705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.01291588078966231, "kl": 0.014849510043859482, "learning_rate": 7.902345803856264e-08, "loss": 0.0001, "num_tokens": 88025291.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6182366609573364, "sampling/importance_sampling_ratio/mean": 1.0001564025878906, "sampling/importance_sampling_ratio/min": 0.5685044527053833, "sampling/sampling_logp_difference/max": 0.5647461414337158, "sampling/sampling_logp_difference/mean": 0.012734738178551197, "step": 2052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 292.046875, "completions/mean_terminated_length": 292.046875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.3838595151901245, "epoch": 2.5159313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 0.7457389157388754, "kl": 0.01861615851521492, "learning_rate": 7.863952067298041e-08, "loss": 0.0071, "num_tokens": 88062254.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6085432767868042, "sampling/importance_sampling_ratio/mean": 0.9999039173126221, "sampling/importance_sampling_ratio/min": 0.6993058919906616, "sampling/sampling_logp_difference/max": 0.4753289222717285, "sampling/sampling_logp_difference/mean": 0.013692962937057018, "step": 2053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1134.0, "completions/max_terminated_length": 1134.0, "completions/mean_length": 370.546875, "completions/mean_terminated_length": 370.546875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.5914913415908813, "epoch": 2.517156862745098, "frac_reward_zero_std": 0.5, "grad_norm": 0.8915009611468424, "kl": 0.03682635352015495, "learning_rate": 7.825643864247733e-08, "loss": 0.0322, "num_tokens": 88108993.0, "reward": 0.5625, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.612030029296875, "sampling/importance_sampling_ratio/mean": 0.9997950792312622, "sampling/importance_sampling_ratio/min": 0.4368430972099304, "sampling/sampling_logp_difference/max": 0.828181266784668, "sampling/sampling_logp_difference/mean": 0.01747285947203636, "step": 2054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 846.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 376.453125, "completions/mean_terminated_length": 376.453125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.44362589716911316, "epoch": 2.5183823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 0.8804987475100029, "kl": 0.032579682767391205, "learning_rate": 7.787421272468547e-08, "loss": -0.011, "num_tokens": 88152894.0, "reward": 0.65625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.4827368259429932, "sampling/importance_sampling_ratio/mean": 0.9996652603149414, "sampling/importance_sampling_ratio/min": 0.4982425272464752, "sampling/sampling_logp_difference/max": 0.6966683864593506, "sampling/sampling_logp_difference/mean": 0.014565473422408104, "step": 2055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 790.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 391.53125, "completions/mean_terminated_length": 391.53125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.6241872310638428, "epoch": 2.519607843137255, "frac_reward_zero_std": 0.5, "grad_norm": 0.7771715837691645, "kl": 0.02881479822099209, "learning_rate": 7.749284369549952e-08, "loss": 0.0025, "num_tokens": 88193296.0, "reward": 0.25, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.5275804996490479, "sampling/importance_sampling_ratio/mean": 1.000152349472046, "sampling/importance_sampling_ratio/min": 0.6647953391075134, "sampling/sampling_logp_difference/max": 0.42368507385253906, "sampling/sampling_logp_difference/mean": 0.016801338642835617, "step": 2056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1154.0, "completions/max_terminated_length": 1154.0, "completions/mean_length": 399.203125, "completions/mean_terminated_length": 399.203125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.49759817123413086, "epoch": 2.5208333333333335, "frac_reward_zero_std": 0.5, "grad_norm": 0.8457035047246126, "kl": 0.019957244396209717, "learning_rate": 7.711233232907399e-08, "loss": 0.0111, "num_tokens": 88236781.0, "reward": 0.375, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.3665165901184082, "sampling/importance_sampling_ratio/mean": 1.0003321170806885, "sampling/importance_sampling_ratio/min": 0.6336930990219116, "sampling/sampling_logp_difference/max": 0.4561905860900879, "sampling/sampling_logp_difference/mean": 0.014767213724553585, "step": 2057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 313.6875, "completions/mean_terminated_length": 313.6875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.5625600814819336, "epoch": 2.5220588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 0.5018475280490287, "kl": 0.0364031083881855, "learning_rate": 7.673267939782324e-08, "loss": 0.0055, "num_tokens": 88278793.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5562366247177124, "sampling/importance_sampling_ratio/mean": 1.0001705884933472, "sampling/importance_sampling_ratio/min": 0.7187730073928833, "sampling/sampling_logp_difference/max": 0.44227051734924316, "sampling/sampling_logp_difference/mean": 0.01586228795349598, "step": 2058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 897.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 376.1875, "completions/mean_terminated_length": 376.1875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.40572962164878845, "epoch": 2.5232843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 0.5558911402897152, "kl": 0.016937313601374626, "learning_rate": 7.63538856724184e-08, "loss": 0.0663, "num_tokens": 88324229.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.4468421936035156, "sampling/importance_sampling_ratio/mean": 0.9997506141662598, "sampling/importance_sampling_ratio/min": 0.6061469912528992, "sampling/sampling_logp_difference/max": 0.5006327629089355, "sampling/sampling_logp_difference/mean": 0.01404731348156929, "step": 2059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 269.609375, "completions/mean_terminated_length": 269.609375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.49775516986846924, "epoch": 2.5245098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 0.8058217735069053, "kl": 0.01767593063414097, "learning_rate": 7.597595192178702e-08, "loss": -0.0046, "num_tokens": 88357820.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.283595085144043, "sampling/importance_sampling_ratio/mean": 0.9997279644012451, "sampling/importance_sampling_ratio/min": 0.6671778559684753, "sampling/sampling_logp_difference/max": 0.40469861030578613, "sampling/sampling_logp_difference/mean": 0.01710563711822033, "step": 2060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 258.40625, "completions/mean_terminated_length": 258.40625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.4481734037399292, "epoch": 2.525735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.026705169831133618, "kl": 0.030527615919709206, "learning_rate": 7.559887891311046e-08, "loss": 0.0004, "num_tokens": 88389638.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4325790405273438, "sampling/importance_sampling_ratio/mean": 1.0006053447723389, "sampling/importance_sampling_ratio/min": 0.621135950088501, "sampling/sampling_logp_difference/max": 0.47620534896850586, "sampling/sampling_logp_difference/mean": 0.014307933859527111, "step": 2061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 378.875, "completions/mean_terminated_length": 378.875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.5545972585678101, "epoch": 2.5269607843137254, "frac_reward_zero_std": 0.5, "grad_norm": 0.9421285797371809, "kl": 0.026122838258743286, "learning_rate": 7.522266741182303e-08, "loss": 0.0026, "num_tokens": 88438494.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5043588876724243, "sampling/importance_sampling_ratio/mean": 1.0004637241363525, "sampling/importance_sampling_ratio/min": 0.6977726817131042, "sampling/sampling_logp_difference/max": 0.408366858959198, "sampling/sampling_logp_difference/mean": 0.015840861946344376, "step": 2062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 312.78125, "completions/mean_terminated_length": 312.78125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.5668543577194214, "epoch": 2.528186274509804, "frac_reward_zero_std": 0.5, "grad_norm": 1.0568842686537108, "kl": 0.04118448495864868, "learning_rate": 7.484731818161049e-08, "loss": 0.0122, "num_tokens": 88473040.0, "reward": 0.71875, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.493909478187561, "sampling/importance_sampling_ratio/mean": 0.9999381303787231, "sampling/importance_sampling_ratio/min": 0.6741206049919128, "sampling/sampling_logp_difference/max": 0.4013965129852295, "sampling/sampling_logp_difference/mean": 0.016723167151212692, "step": 2063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 296.625, "completions/mean_terminated_length": 296.625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.5061111450195312, "epoch": 2.5294117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.01578296786730947, "kl": 0.024869056418538094, "learning_rate": 7.447283198440763e-08, "loss": 0.0002, "num_tokens": 88506632.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4465924501419067, "sampling/importance_sampling_ratio/mean": 1.0004689693450928, "sampling/importance_sampling_ratio/min": 0.5865954160690308, "sampling/sampling_logp_difference/max": 0.5334199070930481, "sampling/sampling_logp_difference/mean": 0.01597938872873783, "step": 2064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 328.046875, "completions/mean_terminated_length": 328.046875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.3913782238960266, "epoch": 2.530637254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.7715366686395772, "kl": 0.01972559094429016, "learning_rate": 7.409920958039794e-08, "loss": 0.0219, "num_tokens": 88552235.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999821126461029, "sampling/importance_sampling_ratio/min": 0.3305003046989441, "sampling/sampling_logp_difference/max": 1.1071476936340332, "sampling/sampling_logp_difference/mean": 0.014163661748170853, "step": 2065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 348.734375, "completions/mean_terminated_length": 348.734375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.5040494203567505, "epoch": 2.531862745098039, "frac_reward_zero_std": 0.25, "grad_norm": 1.054881901214661, "kl": 0.025158971548080444, "learning_rate": 7.372645172801112e-08, "loss": 0.0461, "num_tokens": 88591594.0, "reward": -0.21875, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": -0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.6275174617767334, "sampling/importance_sampling_ratio/mean": 0.9999942183494568, "sampling/importance_sampling_ratio/min": 0.6531445980072021, "sampling/sampling_logp_difference/max": 0.48705577850341797, "sampling/sampling_logp_difference/mean": 0.016496503725647926, "step": 2066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1061.0, "completions/max_terminated_length": 1061.0, "completions/mean_length": 406.546875, "completions/mean_terminated_length": 406.546875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.5173803567886353, "epoch": 2.5330882352941178, "frac_reward_zero_std": 0.5, "grad_norm": 0.7617807076010857, "kl": 0.02209978550672531, "learning_rate": 7.335455918392219e-08, "loss": 0.0143, "num_tokens": 88638317.0, "reward": 0.4375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6622133255004883, "sampling/importance_sampling_ratio/mean": 1.000338077545166, "sampling/importance_sampling_ratio/min": 0.6369378566741943, "sampling/sampling_logp_difference/max": 0.5081501007080078, "sampling/sampling_logp_difference/mean": 0.01602158322930336, "step": 2067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 313.078125, "completions/mean_terminated_length": 313.078125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.45411181449890137, "epoch": 2.534313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.7033334319514152, "kl": 0.020312298089265823, "learning_rate": 7.29835327030493e-08, "loss": -0.0108, "num_tokens": 88672930.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3980697393417358, "sampling/importance_sampling_ratio/mean": 0.999660849571228, "sampling/importance_sampling_ratio/min": 0.6618714332580566, "sampling/sampling_logp_difference/max": 0.41268396377563477, "sampling/sampling_logp_difference/mean": 0.015630224719643593, "step": 2068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 847.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 381.5625, "completions/mean_terminated_length": 381.5625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.5530791282653809, "epoch": 2.5355392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 0.6911597457444283, "kl": 0.022144759073853493, "learning_rate": 7.261337303855258e-08, "loss": 0.0198, "num_tokens": 88716630.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4574356079101562, "sampling/importance_sampling_ratio/mean": 1.000207543373108, "sampling/importance_sampling_ratio/min": 0.6190770268440247, "sampling/sampling_logp_difference/max": 0.4795255661010742, "sampling/sampling_logp_difference/mean": 0.01677640527486801, "step": 2069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 288.375, "completions/mean_terminated_length": 288.375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.545372724533081, "epoch": 2.536764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.684994354349687, "kl": 0.030420534312725067, "learning_rate": 7.224408094183299e-08, "loss": 0.0257, "num_tokens": 88749422.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5246001482009888, "sampling/importance_sampling_ratio/mean": 1.0002045631408691, "sampling/importance_sampling_ratio/min": 0.7168774008750916, "sampling/sampling_logp_difference/max": 0.42173218727111816, "sampling/sampling_logp_difference/mean": 0.017986111342906952, "step": 2070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 333.96875, "completions/mean_terminated_length": 333.96875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.620752215385437, "epoch": 2.5379901960784315, "frac_reward_zero_std": 0.5, "grad_norm": 0.9114800440367166, "kl": 0.028290722519159317, "learning_rate": 7.187565716252991e-08, "loss": 0.0224, "num_tokens": 88785964.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.3670614957809448, "sampling/importance_sampling_ratio/mean": 1.0000383853912354, "sampling/importance_sampling_ratio/min": 0.6597129702568054, "sampling/sampling_logp_difference/max": 0.4159504175186157, "sampling/sampling_logp_difference/mean": 0.017340410500764847, "step": 2071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 341.703125, "completions/mean_terminated_length": 341.703125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.4335407316684723, "epoch": 2.5392156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.02806912982755116, "kl": 0.01620986871421337, "learning_rate": 7.150810244852035e-08, "loss": 0.0002, "num_tokens": 88824121.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6239738464355469, "sampling/importance_sampling_ratio/mean": 0.9999967813491821, "sampling/importance_sampling_ratio/min": 0.3858293294906616, "sampling/sampling_logp_difference/max": 0.9523601531982422, "sampling/sampling_logp_difference/mean": 0.015636665746569633, "step": 2072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 283.421875, "completions/mean_terminated_length": 283.421875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.4844023585319519, "epoch": 2.5404411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 0.6428191927846141, "kl": 0.022393204271793365, "learning_rate": 7.114141754591691e-08, "loss": 0.0096, "num_tokens": 88861220.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4636178016662598, "sampling/importance_sampling_ratio/mean": 0.9996892213821411, "sampling/importance_sampling_ratio/min": 0.6264181137084961, "sampling/sampling_logp_difference/max": 0.46773719787597656, "sampling/sampling_logp_difference/mean": 0.01575683057308197, "step": 2073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 833.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 289.5, "completions/mean_terminated_length": 289.5, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.3120833933353424, "epoch": 2.5416666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 0.6262400876567887, "kl": 0.014613564126193523, "learning_rate": 7.077560319906694e-08, "loss": 0.0286, "num_tokens": 88896900.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6089869737625122, "sampling/importance_sampling_ratio/mean": 1.0001229047775269, "sampling/importance_sampling_ratio/min": 0.612409234046936, "sampling/sampling_logp_difference/max": 0.4903545379638672, "sampling/sampling_logp_difference/mean": 0.011091912165284157, "step": 2074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 428.890625, "completions/mean_terminated_length": 428.890625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.4078092575073242, "epoch": 2.542892156862745, "frac_reward_zero_std": 0.75, "grad_norm": 0.6012915048134511, "kl": 0.01895270124077797, "learning_rate": 7.041066015055036e-08, "loss": -0.0535, "num_tokens": 88944077.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.4445346593856812, "sampling/importance_sampling_ratio/mean": 1.0001063346862793, "sampling/importance_sampling_ratio/min": 0.6705991625785828, "sampling/sampling_logp_difference/max": 0.39958375692367554, "sampling/sampling_logp_difference/mean": 0.01248204056173563, "step": 2075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 285.0, "completions/mean_terminated_length": 285.0, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.44286519289016724, "epoch": 2.5441176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 0.6463804318894905, "kl": 0.021181363612413406, "learning_rate": 7.004658914117822e-08, "loss": -0.0276, "num_tokens": 88978813.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6287914514541626, "sampling/importance_sampling_ratio/mean": 1.00014066696167, "sampling/importance_sampling_ratio/min": 0.6163480877876282, "sampling/sampling_logp_difference/max": 0.4878382682800293, "sampling/sampling_logp_difference/mean": 0.01583913341164589, "step": 2076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 872.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 305.765625, "completions/mean_terminated_length": 305.765625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.4061392545700073, "epoch": 2.545343137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.6457507596791521, "kl": 0.023992544040083885, "learning_rate": 6.968339090999186e-08, "loss": -0.019, "num_tokens": 89017246.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4230939149856567, "sampling/importance_sampling_ratio/mean": 0.99977707862854, "sampling/importance_sampling_ratio/min": 0.5812164545059204, "sampling/sampling_logp_difference/max": 0.5426321029663086, "sampling/sampling_logp_difference/mean": 0.01504246611148119, "step": 2077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1062.0, "completions/max_terminated_length": 1062.0, "completions/mean_length": 351.90625, "completions/mean_terminated_length": 351.90625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.5995556116104126, "epoch": 2.5465686274509802, "frac_reward_zero_std": 0.5, "grad_norm": 0.837098298595801, "kl": 0.03202180564403534, "learning_rate": 6.932106619426064e-08, "loss": -0.0128, "num_tokens": 89059752.0, "reward": -0.40625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": -0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4125325679779053, "sampling/importance_sampling_ratio/mean": 1.0002164840698242, "sampling/importance_sampling_ratio/min": 0.7136092185974121, "sampling/sampling_logp_difference/max": 0.34538424015045166, "sampling/sampling_logp_difference/mean": 0.0166948139667511, "step": 2078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1199.0, "completions/max_terminated_length": 1199.0, "completions/mean_length": 303.296875, "completions/mean_terminated_length": 303.296875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.43847018480300903, "epoch": 2.547794117647059, "frac_reward_zero_std": 0.75, "grad_norm": 0.6581301651152035, "kl": 0.024305567145347595, "learning_rate": 6.895961572948067e-08, "loss": 0.0092, "num_tokens": 89096331.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.566723346710205, "sampling/importance_sampling_ratio/mean": 0.9998667240142822, "sampling/importance_sampling_ratio/min": 0.6308279037475586, "sampling/sampling_logp_difference/max": 0.4607222080230713, "sampling/sampling_logp_difference/mean": 0.01578931137919426, "step": 2079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1069.0, "completions/max_terminated_length": 1069.0, "completions/mean_length": 420.578125, "completions/mean_terminated_length": 420.578125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.5263601541519165, "epoch": 2.549019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.011491192642062727, "kl": 0.017666911706328392, "learning_rate": 6.859904024937347e-08, "loss": 0.0002, "num_tokens": 89141088.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4011369943618774, "sampling/importance_sampling_ratio/mean": 1.0001323223114014, "sampling/importance_sampling_ratio/min": 0.6242122650146484, "sampling/sampling_logp_difference/max": 0.4712648391723633, "sampling/sampling_logp_difference/mean": 0.014453459531068802, "step": 2080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1142.0, "completions/max_terminated_length": 1142.0, "completions/mean_length": 428.25, "completions/mean_terminated_length": 428.25, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.6440922021865845, "epoch": 2.5502450980392157, "frac_reward_zero_std": 0.5, "grad_norm": 0.7600908113697502, "kl": 0.035944871604442596, "learning_rate": 6.823934048588459e-08, "loss": -0.018, "num_tokens": 89184928.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.7662986516952515, "sampling/importance_sampling_ratio/mean": 1.0001152753829956, "sampling/importance_sampling_ratio/min": 0.5109188556671143, "sampling/sampling_logp_difference/max": 0.6715445518493652, "sampling/sampling_logp_difference/mean": 0.017576124519109726, "step": 2081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 935.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 376.84375, "completions/mean_terminated_length": 376.84375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.6715270280838013, "epoch": 2.5514705882352944, "frac_reward_zero_std": 0.5, "grad_norm": 0.6896504935210157, "kl": 0.0397479273378849, "learning_rate": 6.78805171691817e-08, "loss": -0.0104, "num_tokens": 89229718.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6118876934051514, "sampling/importance_sampling_ratio/mean": 1.0003901720046997, "sampling/importance_sampling_ratio/min": 0.7099385857582092, "sampling/sampling_logp_difference/max": 0.47740602493286133, "sampling/sampling_logp_difference/mean": 0.018353113904595375, "step": 2082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1095.0, "completions/max_terminated_length": 1095.0, "completions/mean_length": 365.453125, "completions/mean_terminated_length": 365.453125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.6706279516220093, "epoch": 2.5526960784313726, "frac_reward_zero_std": 0.5, "grad_norm": 0.8559257005370421, "kl": 0.05388794466853142, "learning_rate": 6.752257102765324e-08, "loss": 0.0221, "num_tokens": 89278723.0, "reward": 0.78125, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.547874093055725, "sampling/importance_sampling_ratio/mean": 0.9994400143623352, "sampling/importance_sampling_ratio/min": 0.5910111665725708, "sampling/sampling_logp_difference/max": 0.5259203910827637, "sampling/sampling_logp_difference/mean": 0.019235458225011826, "step": 2083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 346.625, "completions/mean_terminated_length": 346.625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.5077131986618042, "epoch": 2.553921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 0.9916099646614178, "kl": 0.027958132326602936, "learning_rate": 6.716550278790739e-08, "loss": -0.0146, "num_tokens": 89322283.0, "reward": 0.40625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4032665491104126, "sampling/importance_sampling_ratio/mean": 0.9996911883354187, "sampling/importance_sampling_ratio/min": 0.6245012879371643, "sampling/sampling_logp_difference/max": 0.47080183029174805, "sampling/sampling_logp_difference/mean": 0.016115956008434296, "step": 2084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 950.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 383.59375, "completions/mean_terminated_length": 383.59375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.42092761397361755, "epoch": 2.5551470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.564074033911951, "kl": 0.02166523039340973, "learning_rate": 6.680931317476996e-08, "loss": 0.0125, "num_tokens": 89361409.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.337085485458374, "sampling/importance_sampling_ratio/mean": 0.9994177222251892, "sampling/importance_sampling_ratio/min": 0.44557034969329834, "sampling/sampling_logp_difference/max": 0.8084001541137695, "sampling/sampling_logp_difference/mean": 0.013026129454374313, "step": 2085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 328.453125, "completions/mean_terminated_length": 328.453125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.7176576256752014, "epoch": 2.556372549019608, "frac_reward_zero_std": 0.5, "grad_norm": 1.0142432814762525, "kl": 0.03617919236421585, "learning_rate": 6.645400291128356e-08, "loss": 0.004, "num_tokens": 89407054.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.463692545890808, "sampling/importance_sampling_ratio/mean": 1.000481128692627, "sampling/importance_sampling_ratio/min": 0.6271799802780151, "sampling/sampling_logp_difference/max": 0.4665217399597168, "sampling/sampling_logp_difference/mean": 0.02028723619878292, "step": 2086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1699.0, "completions/max_terminated_length": 1699.0, "completions/mean_length": 480.8125, "completions/mean_terminated_length": 480.8125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.5787152051925659, "epoch": 2.5575980392156863, "frac_reward_zero_std": 0.25, "grad_norm": 0.7029314743427733, "kl": 0.027982600033283234, "learning_rate": 6.609957271870503e-08, "loss": -0.0649, "num_tokens": 89457922.0, "reward": 0.75, "reward_std": 0.5, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.5097479820251465, "sampling/importance_sampling_ratio/mean": 0.9999877214431763, "sampling/importance_sampling_ratio/min": 0.6988462209701538, "sampling/sampling_logp_difference/max": 0.411942720413208, "sampling/sampling_logp_difference/mean": 0.01573704183101654, "step": 2087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 389.40625, "completions/mean_terminated_length": 389.40625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.3571021556854248, "epoch": 2.5588235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.008598701584566455, "kl": 0.011506263166666031, "learning_rate": 6.574602331650559e-08, "loss": 0.0001, "num_tokens": 89499068.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5911074876785278, "sampling/importance_sampling_ratio/mean": 1.0004396438598633, "sampling/importance_sampling_ratio/min": 0.6597685813903809, "sampling/sampling_logp_difference/max": 0.4644303321838379, "sampling/sampling_logp_difference/mean": 0.012549884617328644, "step": 2088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 369.4375, "completions/mean_terminated_length": 369.4375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.46665000915527344, "epoch": 2.560049019607843, "frac_reward_zero_std": 0.75, "grad_norm": 0.5852479456198927, "kl": 0.02848290652036667, "learning_rate": 6.539335542236802e-08, "loss": -0.014, "num_tokens": 89541624.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4219739437103271, "sampling/importance_sampling_ratio/mean": 1.0001485347747803, "sampling/importance_sampling_ratio/min": 0.6647613048553467, "sampling/sampling_logp_difference/max": 0.40832722187042236, "sampling/sampling_logp_difference/mean": 0.013437395915389061, "step": 2089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 348.28125, "completions/mean_terminated_length": 348.28125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.3714780807495117, "epoch": 2.561274509803922, "frac_reward_zero_std": 1.0, "grad_norm": 0.010830208927057945, "kl": 0.012608101591467857, "learning_rate": 6.504156975218567e-08, "loss": 0.0001, "num_tokens": 89578154.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6586670875549316, "sampling/importance_sampling_ratio/mean": 0.9995500445365906, "sampling/importance_sampling_ratio/min": 0.6056269407272339, "sampling/sampling_logp_difference/max": 0.506014347076416, "sampling/sampling_logp_difference/mean": 0.013256524689495564, "step": 2090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1804.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 446.484375, "completions/mean_terminated_length": 446.484375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.48718804121017456, "epoch": 2.5625, "frac_reward_zero_std": 0.5, "grad_norm": 0.912532021767399, "kl": 0.022470438852906227, "learning_rate": 6.469066702006137e-08, "loss": 0.0561, "num_tokens": 89623145.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.5135613679885864, "sampling/importance_sampling_ratio/mean": 1.0004732608795166, "sampling/importance_sampling_ratio/min": 0.454216867685318, "sampling/sampling_logp_difference/max": 0.7891805171966553, "sampling/sampling_logp_difference/mean": 0.015215839259326458, "step": 2091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 413.53125, "completions/mean_terminated_length": 413.53125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.49360746145248413, "epoch": 2.563725490196078, "frac_reward_zero_std": 0.75, "grad_norm": 0.49866645872955456, "kl": 0.02388349547982216, "learning_rate": 6.43406479383053e-08, "loss": 0.0154, "num_tokens": 89664955.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000770092010498, "sampling/importance_sampling_ratio/min": 0.6771528720855713, "sampling/sampling_logp_difference/max": 0.9039700031280518, "sampling/sampling_logp_difference/mean": 0.015676971524953842, "step": 2092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 229.84375, "completions/mean_terminated_length": 229.84375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.4340648949146271, "epoch": 2.564950980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.8809272782812709, "kl": 0.023864315822720528, "learning_rate": 6.399151321743423e-08, "loss": 0.0233, "num_tokens": 89690929.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.417021632194519, "sampling/importance_sampling_ratio/mean": 0.9999493360519409, "sampling/importance_sampling_ratio/min": 0.5809086561203003, "sampling/sampling_logp_difference/max": 0.5431617498397827, "sampling/sampling_logp_difference/mean": 0.016514714807271957, "step": 2093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1258.0, "completions/max_terminated_length": 1258.0, "completions/mean_length": 457.5625, "completions/mean_terminated_length": 457.5625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.39418524503707886, "epoch": 2.5661764705882355, "frac_reward_zero_std": 0.75, "grad_norm": 0.4522953245284341, "kl": 0.016179226338863373, "learning_rate": 6.364326356616917e-08, "loss": -0.0112, "num_tokens": 89745413.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999662041664124, "sampling/importance_sampling_ratio/min": 0.5044505596160889, "sampling/sampling_logp_difference/max": 1.4494602680206299, "sampling/sampling_logp_difference/mean": 0.012144520878791809, "step": 2094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 298.328125, "completions/mean_terminated_length": 298.328125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.3735736608505249, "epoch": 2.5674019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.010561875615943244, "kl": 0.01559720654040575, "learning_rate": 6.329589969143517e-08, "loss": 0.0002, "num_tokens": 89782522.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.616909146308899, "sampling/importance_sampling_ratio/mean": 0.9999896883964539, "sampling/importance_sampling_ratio/min": 0.6140319108963013, "sampling/sampling_logp_difference/max": 0.48770833015441895, "sampling/sampling_logp_difference/mean": 0.01347808912396431, "step": 2095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 410.828125, "completions/mean_terminated_length": 410.828125, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "entropy": 0.5203307867050171, "epoch": 2.568627450980392, "frac_reward_zero_std": 0.5, "grad_norm": 0.785365218744053, "kl": 0.02159891463816166, "learning_rate": 6.29494222983587e-08, "loss": 0.0509, "num_tokens": 89834143.0, "reward": 0.71875, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000190734863281, "sampling/importance_sampling_ratio/min": 0.6440069675445557, "sampling/sampling_logp_difference/max": 1.1824228763580322, "sampling/sampling_logp_difference/mean": 0.015096180140972137, "step": 2096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 450.40625, "completions/mean_terminated_length": 450.40625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.4898986220359802, "epoch": 2.5698529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.46756842262542553, "kl": 0.02045193687081337, "learning_rate": 6.260383209026704e-08, "loss": 0.0221, "num_tokens": 89883417.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.867648720741272, "sampling/importance_sampling_ratio/mean": 0.9999666213989258, "sampling/importance_sampling_ratio/min": 0.6535195708274841, "sampling/sampling_logp_difference/max": 0.6246802806854248, "sampling/sampling_logp_difference/mean": 0.014319321140646935, "step": 2097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1052.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 449.28125, "completions/mean_terminated_length": 449.28125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.6353087425231934, "epoch": 2.571078431372549, "frac_reward_zero_std": 0.5, "grad_norm": 0.7192355044909258, "kl": 0.044782981276512146, "learning_rate": 6.225912976868636e-08, "loss": -0.0361, "num_tokens": 89932059.0, "reward": 0.625, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.528887391090393, "sampling/importance_sampling_ratio/mean": 0.9997761249542236, "sampling/importance_sampling_ratio/min": 0.40525558590888977, "sampling/sampling_logp_difference/max": 0.9032373428344727, "sampling/sampling_logp_difference/mean": 0.017338404431939125, "step": 2098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 401.640625, "completions/mean_terminated_length": 401.640625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.5470843315124512, "epoch": 2.5723039215686274, "frac_reward_zero_std": 0.5, "grad_norm": 0.7761378172165889, "kl": 0.0217285193502903, "learning_rate": 6.191531603334044e-08, "loss": 0.0437, "num_tokens": 89972756.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4450666904449463, "sampling/importance_sampling_ratio/mean": 0.9996083974838257, "sampling/importance_sampling_ratio/min": 0.6158308386802673, "sampling/sampling_logp_difference/max": 0.4847829341888428, "sampling/sampling_logp_difference/mean": 0.016358759254217148, "step": 2099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 388.765625, "completions/mean_terminated_length": 388.765625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.37956660985946655, "epoch": 2.5735294117647056, "frac_reward_zero_std": 0.5, "grad_norm": 0.855262452882262, "kl": 0.013482652604579926, "learning_rate": 6.157239158214966e-08, "loss": -0.0335, "num_tokens": 90020741.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.439874291419983, "sampling/importance_sampling_ratio/mean": 0.9998517036437988, "sampling/importance_sampling_ratio/min": 0.6368911266326904, "sampling/sampling_logp_difference/max": 0.4511566162109375, "sampling/sampling_logp_difference/mean": 0.011903383769094944, "step": 2100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 317.3125, "completions/mean_terminated_length": 317.3125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.4259551465511322, "epoch": 2.5747549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.012041793076076576, "kl": 0.01634800247848034, "learning_rate": 6.123035711122859e-08, "loss": 0.0002, "num_tokens": 90058937.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6355787515640259, "sampling/importance_sampling_ratio/mean": 0.999879002571106, "sampling/importance_sampling_ratio/min": 0.6056220531463623, "sampling/sampling_logp_difference/max": 0.5014991760253906, "sampling/sampling_logp_difference/mean": 0.014577758498489857, "step": 2101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1697.0, "completions/max_terminated_length": 1697.0, "completions/mean_length": 455.875, "completions/mean_terminated_length": 455.875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.5551818609237671, "epoch": 2.575980392156863, "frac_reward_zero_std": 0.5, "grad_norm": 0.8607121786020685, "kl": 0.027501298114657402, "learning_rate": 6.088921331488566e-08, "loss": 0.1561, "num_tokens": 90104929.0, "reward": 0.71875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.4888958930969238, "sampling/importance_sampling_ratio/mean": 1.0004150867462158, "sampling/importance_sampling_ratio/min": 0.6520918011665344, "sampling/sampling_logp_difference/max": 0.4275698661804199, "sampling/sampling_logp_difference/mean": 0.0162846390157938, "step": 2102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 244.75, "completions/mean_terminated_length": 244.75, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.444794237613678, "epoch": 2.577205882352941, "frac_reward_zero_std": 0.75, "grad_norm": 0.8208645905080031, "kl": 0.028334520757198334, "learning_rate": 6.05489608856214e-08, "loss": 0.0022, "num_tokens": 90137489.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.3658896684646606, "sampling/importance_sampling_ratio/mean": 0.9994814395904541, "sampling/importance_sampling_ratio/min": 0.650577962398529, "sampling/sampling_logp_difference/max": 0.42989420890808105, "sampling/sampling_logp_difference/mean": 0.014448881149291992, "step": 2103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 408.546875, "completions/mean_terminated_length": 408.546875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.49564099311828613, "epoch": 2.5784313725490198, "frac_reward_zero_std": 1.0, "grad_norm": 0.019762105727426347, "kl": 0.026996394619345665, "learning_rate": 6.020960051412638e-08, "loss": 0.0003, "num_tokens": 90180628.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.557730793952942, "sampling/importance_sampling_ratio/mean": 0.9998835325241089, "sampling/importance_sampling_ratio/min": 0.5096350908279419, "sampling/sampling_logp_difference/max": 0.6740604043006897, "sampling/sampling_logp_difference/mean": 0.014405216090381145, "step": 2104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1310.0, "completions/max_terminated_length": 1310.0, "completions/mean_length": 495.4375, "completions/mean_terminated_length": 495.4375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "entropy": 0.43148869276046753, "epoch": 2.579656862745098, "frac_reward_zero_std": 0.5, "grad_norm": 0.6025705933031795, "kl": 0.0157481636852026, "learning_rate": 5.98711328892808e-08, "loss": -0.0006, "num_tokens": 90231312.0, "reward": 0.21875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.5956149101257324, "sampling/importance_sampling_ratio/mean": 1.000415563583374, "sampling/importance_sampling_ratio/min": 0.6764503121376038, "sampling/sampling_logp_difference/max": 0.46725916862487793, "sampling/sampling_logp_difference/mean": 0.013328293338418007, "step": 2105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 268.5, "completions/mean_terminated_length": 268.5, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.6039770245552063, "epoch": 2.5808823529411766, "frac_reward_zero_std": 0.75, "grad_norm": 0.7488954104852983, "kl": 0.04668772220611572, "learning_rate": 5.9533558698152355e-08, "loss": 0.0153, "num_tokens": 90267008.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.4106940031051636, "sampling/importance_sampling_ratio/mean": 0.9991574883460999, "sampling/importance_sampling_ratio/min": 0.6107810735702515, "sampling/sampling_logp_difference/max": 0.49301671981811523, "sampling/sampling_logp_difference/mean": 0.018121013417840004, "step": 2106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1466.0, "completions/max_terminated_length": 1466.0, "completions/mean_length": 488.234375, "completions/mean_terminated_length": 488.234375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.489930123090744, "epoch": 2.582107843137255, "frac_reward_zero_std": 0.5, "grad_norm": 0.7599779659666287, "kl": 0.028739502653479576, "learning_rate": 5.919687862599548e-08, "loss": -0.001, "num_tokens": 90317087.0, "reward": -0.03125, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.3748443126678467, "sampling/importance_sampling_ratio/mean": 0.9996616244316101, "sampling/importance_sampling_ratio/min": 0.22495901584625244, "sampling/sampling_logp_difference/max": 1.4918370246887207, "sampling/sampling_logp_difference/mean": 0.014397673308849335, "step": 2107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 289.796875, "completions/mean_terminated_length": 289.796875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.4151555299758911, "epoch": 2.5833333333333335, "frac_reward_zero_std": 1.0, "grad_norm": 0.015146775720912959, "kl": 0.021516963839530945, "learning_rate": 5.886109335624928e-08, "loss": 0.0002, "num_tokens": 90355570.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997149705886841, "sampling/importance_sampling_ratio/min": 0.6339288949966431, "sampling/sampling_logp_difference/max": 1.3870060443878174, "sampling/sampling_logp_difference/mean": 0.015324673615396023, "step": 2108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 214.28125, "completions/mean_terminated_length": 214.28125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.3772292137145996, "epoch": 2.5845588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.02223513549041636, "kl": 0.028380488976836205, "learning_rate": 5.8526203570536504e-08, "loss": 0.0003, "num_tokens": 90381716.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3093703985214233, "sampling/importance_sampling_ratio/mean": 0.9999834299087524, "sampling/importance_sampling_ratio/min": 0.6546944975852966, "sampling/sampling_logp_difference/max": 0.4235866069793701, "sampling/sampling_logp_difference/mean": 0.015297825448215008, "step": 2109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 258.421875, "completions/mean_terminated_length": 258.421875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.4325995445251465, "epoch": 2.5857843137254903, "frac_reward_zero_std": 1.0, "grad_norm": 0.0155610742300939, "kl": 0.0226141307502985, "learning_rate": 5.819220994866236e-08, "loss": 0.0002, "num_tokens": 90413343.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6101850271224976, "sampling/importance_sampling_ratio/mean": 1.0004189014434814, "sampling/importance_sampling_ratio/min": 0.6270315647125244, "sampling/sampling_logp_difference/max": 0.4763491153717041, "sampling/sampling_logp_difference/mean": 0.013906922191381454, "step": 2110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 388.421875, "completions/mean_terminated_length": 388.421875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.4957129955291748, "epoch": 2.5870098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 0.5849668987456379, "kl": 0.022076185792684555, "learning_rate": 5.7859113168612696e-08, "loss": 0.0401, "num_tokens": 90456570.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6089891195297241, "sampling/importance_sampling_ratio/mean": 0.9999948143959045, "sampling/importance_sampling_ratio/min": 0.6482276916503906, "sampling/sampling_logp_difference/max": 0.4756060838699341, "sampling/sampling_logp_difference/mean": 0.015076762065291405, "step": 2111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 318.6875, "completions/mean_terminated_length": 318.6875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.49327605962753296, "epoch": 2.588235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.7439118289868772, "kl": 0.02233012393116951, "learning_rate": 5.7526913906552786e-08, "loss": 0.0258, "num_tokens": 90502486.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.7807080745697021, "sampling/importance_sampling_ratio/mean": 1.0002117156982422, "sampling/importance_sampling_ratio/min": 0.47249066829681396, "sampling/sampling_logp_difference/max": 0.7497372627258301, "sampling/sampling_logp_difference/mean": 0.015780411660671234, "step": 2112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 412.84375, "completions/mean_terminated_length": 412.84375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.5075137615203857, "epoch": 2.5894607843137254, "frac_reward_zero_std": 0.25, "grad_norm": 0.9709772923345232, "kl": 0.02982671745121479, "learning_rate": 5.7195612836826055e-08, "loss": 0.0419, "num_tokens": 90546284.0, "reward": 0.625, "reward_std": 0.5997638702392578, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.4072990417480469, "sampling/importance_sampling_ratio/mean": 1.0004467964172363, "sampling/importance_sampling_ratio/min": 0.6897897720336914, "sampling/sampling_logp_difference/max": 0.371368408203125, "sampling/sampling_logp_difference/mean": 0.014777952805161476, "step": 2113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 381.828125, "completions/mean_terminated_length": 381.828125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.4964631199836731, "epoch": 2.590686274509804, "frac_reward_zero_std": 0.25, "grad_norm": 0.9879639461647947, "kl": 0.02062797173857689, "learning_rate": 5.686521063195287e-08, "loss": -0.0124, "num_tokens": 90589185.0, "reward": 0.71875, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.4271409511566162, "sampling/importance_sampling_ratio/mean": 0.999788761138916, "sampling/importance_sampling_ratio/min": 0.5837689638137817, "sampling/sampling_logp_difference/max": 0.5382499694824219, "sampling/sampling_logp_difference/mean": 0.015141002833843231, "step": 2114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1265.0, "completions/max_terminated_length": 1265.0, "completions/mean_length": 459.875, "completions/mean_terminated_length": 459.875, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "entropy": 0.41580215096473694, "epoch": 2.5919117647058822, "frac_reward_zero_std": 0.25, "grad_norm": 0.8481752790574674, "kl": 0.020156599581241608, "learning_rate": 5.6535707962628685e-08, "loss": -0.0313, "num_tokens": 90642105.0, "reward": 0.3125, "reward_std": 0.5, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.9877210855484009, "sampling/importance_sampling_ratio/mean": 0.9999071955680847, "sampling/importance_sampling_ratio/min": 0.6263484358787537, "sampling/sampling_logp_difference/max": 0.6869888305664062, "sampling/sampling_logp_difference/mean": 0.012416569516062737, "step": 2115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1070.0, "completions/max_terminated_length": 1070.0, "completions/mean_length": 342.46875, "completions/mean_terminated_length": 342.46875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.6721670627593994, "epoch": 2.593137254901961, "frac_reward_zero_std": 0.5, "grad_norm": 1.010925851676007, "kl": 0.0322243869304657, "learning_rate": 5.620710549772295e-08, "loss": 0.0631, "num_tokens": 90687047.0, "reward": 0.8125, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.4344782829284668, "sampling/importance_sampling_ratio/mean": 1.0000838041305542, "sampling/importance_sampling_ratio/min": 0.44376158714294434, "sampling/sampling_logp_difference/max": 0.8124678134918213, "sampling/sampling_logp_difference/mean": 0.01929238811135292, "step": 2116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 332.1875, "completions/mean_terminated_length": 332.1875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.47742369771003723, "epoch": 2.594362745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.01211462647566223, "kl": 0.018548965454101562, "learning_rate": 5.5879403904278034e-08, "loss": 0.0002, "num_tokens": 90723123.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6355795860290527, "sampling/importance_sampling_ratio/mean": 1.0003294944763184, "sampling/importance_sampling_ratio/min": 0.6625093817710876, "sampling/sampling_logp_difference/max": 0.49199724197387695, "sampling/sampling_logp_difference/mean": 0.01522801909595728, "step": 2117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 234.953125, "completions/mean_terminated_length": 234.953125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.5630340576171875, "epoch": 2.5955882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 0.7826008563616963, "kl": 0.05161953717470169, "learning_rate": 5.555260384750721e-08, "loss": -0.0033, "num_tokens": 90753968.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5612678527832031, "sampling/importance_sampling_ratio/mean": 1.0000886917114258, "sampling/importance_sampling_ratio/min": 0.616589367389679, "sampling/sampling_logp_difference/max": 0.4835519790649414, "sampling/sampling_logp_difference/mean": 0.017568130046129227, "step": 2118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 352.265625, "completions/mean_terminated_length": 352.265625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.44268399477005005, "epoch": 2.596813725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.011078298744758304, "kl": 0.01568310707807541, "learning_rate": 5.5226705990794156e-08, "loss": 0.0002, "num_tokens": 90800865.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.473733901977539, "sampling/importance_sampling_ratio/mean": 0.9998778700828552, "sampling/importance_sampling_ratio/min": 0.6109293699264526, "sampling/sampling_logp_difference/max": 0.49277400970458984, "sampling/sampling_logp_difference/mean": 0.014024605974555016, "step": 2119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 897.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 510.1875, "completions/mean_terminated_length": 510.1875, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.5399607419967651, "epoch": 2.5980392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 0.5212744154214, "kl": 0.018121816217899323, "learning_rate": 5.4901710995690576e-08, "loss": -0.0149, "num_tokens": 90851517.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.4371111392974854, "sampling/importance_sampling_ratio/mean": 0.999916672706604, "sampling/importance_sampling_ratio/min": 0.05140678212046623, "sampling/sampling_logp_difference/max": 2.967985153198242, "sampling/sampling_logp_difference/mean": 0.015247681178152561, "step": 2120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 318.59375, "completions/mean_terminated_length": 318.59375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.5234070420265198, "epoch": 2.599264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.02617606038040671, "kl": 0.033122122287750244, "learning_rate": 5.4577619521915916e-08, "loss": 0.0003, "num_tokens": 90890419.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3929363489151, "sampling/importance_sampling_ratio/mean": 0.9997227191925049, "sampling/importance_sampling_ratio/min": 0.6141110062599182, "sampling/sampling_logp_difference/max": 0.4875795841217041, "sampling/sampling_logp_difference/mean": 0.01586868241429329, "step": 2121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 845.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 416.203125, "completions/mean_terminated_length": 416.203125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.368831992149353, "epoch": 2.6004901960784315, "frac_reward_zero_std": 1.0, "grad_norm": 0.008978316503759179, "kl": 0.013390161097049713, "learning_rate": 5.425443222735526e-08, "loss": 0.0001, "num_tokens": 90932768.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5759456157684326, "sampling/importance_sampling_ratio/mean": 1.0003321170806885, "sampling/importance_sampling_ratio/min": 0.7561548352241516, "sampling/sampling_logp_difference/max": 0.45485544204711914, "sampling/sampling_logp_difference/mean": 0.011755745857954025, "step": 2122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1070.0, "completions/max_terminated_length": 1070.0, "completions/mean_length": 456.609375, "completions/mean_terminated_length": 456.609375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.5589444041252136, "epoch": 2.6017156862745097, "frac_reward_zero_std": 0.75, "grad_norm": 0.5513042451437647, "kl": 0.01558010559529066, "learning_rate": 5.393214976805832e-08, "loss": -0.0135, "num_tokens": 90984327.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.535770297050476, "sampling/importance_sampling_ratio/mean": 0.9997930526733398, "sampling/importance_sampling_ratio/min": 0.622846245765686, "sampling/sampling_logp_difference/max": 0.473455548286438, "sampling/sampling_logp_difference/mean": 0.015644975006580353, "step": 2123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 365.953125, "completions/mean_terminated_length": 365.953125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.5162167549133301, "epoch": 2.6029411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 0.5888184685700069, "kl": 0.022259706631302834, "learning_rate": 5.361077279823817e-08, "loss": -0.0346, "num_tokens": 91024324.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.8881243467330933, "sampling/importance_sampling_ratio/mean": 0.9999355673789978, "sampling/importance_sampling_ratio/min": 0.6106176972389221, "sampling/sampling_logp_difference/max": 0.6355838775634766, "sampling/sampling_logp_difference/mean": 0.015172508545219898, "step": 2124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 898.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 425.171875, "completions/mean_terminated_length": 425.171875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.45306622982025146, "epoch": 2.6041666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 0.5517874010458704, "kl": 0.022090256214141846, "learning_rate": 5.3290301970269514e-08, "loss": -0.0044, "num_tokens": 91068463.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.4682434797286987, "sampling/importance_sampling_ratio/mean": 0.999931812286377, "sampling/importance_sampling_ratio/min": 0.6954025030136108, "sampling/sampling_logp_difference/max": 0.3840668201446533, "sampling/sampling_logp_difference/mean": 0.013254771009087563, "step": 2125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 308.65625, "completions/mean_terminated_length": 308.65625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.47255951166152954, "epoch": 2.605392156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.010900180187769453, "kl": 0.01732967235147953, "learning_rate": 5.29707379346882e-08, "loss": 0.0002, "num_tokens": 91103689.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6931442022323608, "sampling/importance_sampling_ratio/mean": 1.0003749132156372, "sampling/importance_sampling_ratio/min": 0.4868062138557434, "sampling/sampling_logp_difference/max": 0.7198891639709473, "sampling/sampling_logp_difference/mean": 0.015873923897743225, "step": 2126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 414.546875, "completions/mean_terminated_length": 414.546875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.383606880903244, "epoch": 2.6066176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 0.5780045615381371, "kl": 0.016940457746386528, "learning_rate": 5.2652081340188506e-08, "loss": 0.0221, "num_tokens": 91149196.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.596928358078003, "sampling/importance_sampling_ratio/mean": 0.9997327923774719, "sampling/importance_sampling_ratio/min": 0.6139987111091614, "sampling/sampling_logp_difference/max": 0.487762451171875, "sampling/sampling_logp_difference/mean": 0.011484788730740547, "step": 2127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 370.828125, "completions/mean_terminated_length": 370.828125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.4224597215652466, "epoch": 2.607843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.6307212045740865, "kl": 0.01835232973098755, "learning_rate": 5.2334332833623487e-08, "loss": -0.0505, "num_tokens": 91193313.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5717437267303467, "sampling/importance_sampling_ratio/mean": 1.0001039505004883, "sampling/importance_sampling_ratio/min": 0.6622980237007141, "sampling/sampling_logp_difference/max": 0.45218563079833984, "sampling/sampling_logp_difference/mean": 0.013447094708681107, "step": 2128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 334.03125, "completions/mean_terminated_length": 334.03125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.5671287775039673, "epoch": 2.6090686274509802, "frac_reward_zero_std": 0.75, "grad_norm": 0.6734427166082461, "kl": 0.025317411869764328, "learning_rate": 5.2017493060002196e-08, "loss": 0.0026, "num_tokens": 91232387.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6483038663864136, "sampling/importance_sampling_ratio/mean": 1.0005111694335938, "sampling/importance_sampling_ratio/min": 0.7164372801780701, "sampling/sampling_logp_difference/max": 0.49974679946899414, "sampling/sampling_logp_difference/mean": 0.017738021910190582, "step": 2129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1153.0, "completions/max_terminated_length": 1153.0, "completions/mean_length": 427.359375, "completions/mean_terminated_length": 427.359375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.44521453976631165, "epoch": 2.610294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.024084766678304913, "kl": 0.014872138388454914, "learning_rate": 5.1701562662489596e-08, "loss": 0.0002, "num_tokens": 91281434.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4517592191696167, "sampling/importance_sampling_ratio/mean": 0.9998337626457214, "sampling/importance_sampling_ratio/min": 0.6560724973678589, "sampling/sampling_logp_difference/max": 0.42148399353027344, "sampling/sampling_logp_difference/mean": 0.013971804641187191, "step": 2130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 447.578125, "completions/mean_terminated_length": 447.578125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.5499076843261719, "epoch": 2.611519607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.5495339324085434, "kl": 0.049516625702381134, "learning_rate": 5.138654228240424e-08, "loss": -0.0455, "num_tokens": 91328543.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5547884702682495, "sampling/importance_sampling_ratio/mean": 0.9999079704284668, "sampling/importance_sampling_ratio/min": 0.7267242670059204, "sampling/sampling_logp_difference/max": 0.44133949279785156, "sampling/sampling_logp_difference/mean": 0.015520891174674034, "step": 2131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 320.1875, "completions/mean_terminated_length": 320.1875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.4025037884712219, "epoch": 2.6127450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.009826413417551621, "kl": 0.015471817925572395, "learning_rate": 5.1072432559217446e-08, "loss": 0.0001, "num_tokens": 91367803.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4528872966766357, "sampling/importance_sampling_ratio/mean": 0.999944269657135, "sampling/importance_sampling_ratio/min": 0.6795623898506165, "sampling/sampling_logp_difference/max": 0.3863062858581543, "sampling/sampling_logp_difference/mean": 0.012744343839585781, "step": 2132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1047.0, "completions/max_terminated_length": 1047.0, "completions/mean_length": 357.53125, "completions/mean_terminated_length": 357.53125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.5480242967605591, "epoch": 2.6139705882352944, "frac_reward_zero_std": 0.5, "grad_norm": 0.7931702195407101, "kl": 0.022708654403686523, "learning_rate": 5.075923413055222e-08, "loss": 0.0145, "num_tokens": 91407005.0, "reward": 0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5260651111602783, "sampling/importance_sampling_ratio/mean": 0.9999131560325623, "sampling/importance_sampling_ratio/min": 0.6828500628471375, "sampling/sampling_logp_difference/max": 0.42269253730773926, "sampling/sampling_logp_difference/mean": 0.015997182577848434, "step": 2133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 397.59375, "completions/mean_terminated_length": 397.59375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.43914833664894104, "epoch": 2.6151960784313726, "frac_reward_zero_std": 0.5, "grad_norm": 0.6548805907997413, "kl": 0.025049079209566116, "learning_rate": 5.044694763218149e-08, "loss": -0.0104, "num_tokens": 91448611.0, "reward": 0.4375, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.3830078840255737, "sampling/importance_sampling_ratio/mean": 0.9998511075973511, "sampling/importance_sampling_ratio/min": 0.6663520336151123, "sampling/sampling_logp_difference/max": 0.40593719482421875, "sampling/sampling_logp_difference/mean": 0.012268567457795143, "step": 2134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1310.0, "completions/max_terminated_length": 1310.0, "completions/mean_length": 377.265625, "completions/mean_terminated_length": 377.265625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.34789374470710754, "epoch": 2.616421568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.00947941592764054, "kl": 0.013895096257328987, "learning_rate": 5.013557369802701e-08, "loss": 0.0002, "num_tokens": 91490276.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4627541303634644, "sampling/importance_sampling_ratio/mean": 0.9999017715454102, "sampling/importance_sampling_ratio/min": 0.5820474028587341, "sampling/sampling_logp_difference/max": 0.5412033796310425, "sampling/sampling_logp_difference/mean": 0.011275464668869972, "step": 2135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 325.28125, "completions/mean_terminated_length": 325.28125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.5165039300918579, "epoch": 2.6176470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.6305367627031113, "kl": 0.03071487322449684, "learning_rate": 4.982511296015807e-08, "loss": 0.0395, "num_tokens": 91526566.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.3519428968429565, "sampling/importance_sampling_ratio/mean": 0.9998173713684082, "sampling/importance_sampling_ratio/min": 0.5136284828186035, "sampling/sampling_logp_difference/max": 0.666254997253418, "sampling/sampling_logp_difference/mean": 0.016286982223391533, "step": 2136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 846.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 395.8125, "completions/mean_terminated_length": 395.8125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.5512070655822754, "epoch": 2.618872549019608, "frac_reward_zero_std": 0.75, "grad_norm": 0.5385108439163243, "kl": 0.03133903816342354, "learning_rate": 4.951556604879048e-08, "loss": -0.0344, "num_tokens": 91568442.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999226331710815, "sampling/importance_sampling_ratio/min": 0.6680668592453003, "sampling/sampling_logp_difference/max": 0.9376134872436523, "sampling/sampling_logp_difference/mean": 0.01590603031218052, "step": 2137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1461.0, "completions/max_terminated_length": 1461.0, "completions/mean_length": 423.71875, "completions/mean_terminated_length": 423.71875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.5197600722312927, "epoch": 2.6200980392156863, "frac_reward_zero_std": 0.5, "grad_norm": 0.9201188803122288, "kl": 0.03142571449279785, "learning_rate": 4.9206933592284725e-08, "loss": -0.0237, "num_tokens": 91620552.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.4541431665420532, "sampling/importance_sampling_ratio/mean": 0.9997895359992981, "sampling/importance_sampling_ratio/min": 0.4093865752220154, "sampling/sampling_logp_difference/max": 0.8930954337120056, "sampling/sampling_logp_difference/mean": 0.015498241409659386, "step": 2138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1497.0, "completions/max_terminated_length": 1497.0, "completions/mean_length": 501.453125, "completions/mean_terminated_length": 501.453125, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.4112657308578491, "epoch": 2.6213235294117645, "frac_reward_zero_std": 0.75, "grad_norm": 0.4004567437035502, "kl": 0.013939827680587769, "learning_rate": 4.889921621714516e-08, "loss": 0.014, "num_tokens": 91677397.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.9378459453582764, "sampling/importance_sampling_ratio/mean": 0.9999183416366577, "sampling/importance_sampling_ratio/min": 0.596213161945343, "sampling/sampling_logp_difference/max": 0.661577045917511, "sampling/sampling_logp_difference/mean": 0.013169560581445694, "step": 2139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1480.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 449.140625, "completions/mean_terminated_length": 449.140625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.626112699508667, "epoch": 2.622549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 0.7947074272546802, "kl": 0.029405536130070686, "learning_rate": 4.859241454801866e-08, "loss": 0.0671, "num_tokens": 91727022.0, "reward": 0.625, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.615956425666809, "sampling/importance_sampling_ratio/mean": 0.9995068907737732, "sampling/importance_sampling_ratio/min": 0.6636209487915039, "sampling/sampling_logp_difference/max": 0.47992706298828125, "sampling/sampling_logp_difference/mean": 0.017639920115470886, "step": 2140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 313.75, "completions/mean_terminated_length": 313.75, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.397737979888916, "epoch": 2.623774509803922, "frac_reward_zero_std": 0.75, "grad_norm": 0.5720210479786003, "kl": 0.01839638501405716, "learning_rate": 4.828652920769311e-08, "loss": 0.0156, "num_tokens": 91765390.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6362266540527344, "sampling/importance_sampling_ratio/mean": 0.9997011423110962, "sampling/importance_sampling_ratio/min": 0.6791910529136658, "sampling/sampling_logp_difference/max": 0.49239277839660645, "sampling/sampling_logp_difference/mean": 0.013769764453172684, "step": 2141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 267.28125, "completions/mean_terminated_length": 267.28125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.4066786766052246, "epoch": 2.625, "frac_reward_zero_std": 1.0, "grad_norm": 0.014092978328109526, "kl": 0.019647009670734406, "learning_rate": 4.7981560817096366e-08, "loss": 0.0002, "num_tokens": 91799408.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.609436273574829, "sampling/importance_sampling_ratio/mean": 1.0005987882614136, "sampling/importance_sampling_ratio/min": 0.6331400275230408, "sampling/sampling_logp_difference/max": 0.47588396072387695, "sampling/sampling_logp_difference/mean": 0.014774092473089695, "step": 2142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 295.109375, "completions/mean_terminated_length": 295.109375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.4312411844730377, "epoch": 2.626225490196078, "frac_reward_zero_std": 1.0, "grad_norm": 0.014151698504942859, "kl": 0.019674602895975113, "learning_rate": 4.767750999529485e-08, "loss": 0.0002, "num_tokens": 91832551.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5470919609069824, "sampling/importance_sampling_ratio/mean": 1.0000505447387695, "sampling/importance_sampling_ratio/min": 0.4678095281124115, "sampling/sampling_logp_difference/max": 0.7596940994262695, "sampling/sampling_logp_difference/mean": 0.015301831997931004, "step": 2143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 317.859375, "completions/mean_terminated_length": 317.859375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.3476734757423401, "epoch": 2.627450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.011052749613615092, "kl": 0.013192037120461464, "learning_rate": 4.7374377359492624e-08, "loss": 0.0001, "num_tokens": 91871118.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4689490795135498, "sampling/importance_sampling_ratio/mean": 1.0000194311141968, "sampling/importance_sampling_ratio/min": 0.6368674039840698, "sampling/sampling_logp_difference/max": 0.45119380950927734, "sampling/sampling_logp_difference/mean": 0.012811603955924511, "step": 2144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 298.171875, "completions/mean_terminated_length": 298.171875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.5127139091491699, "epoch": 2.6286764705882355, "frac_reward_zero_std": 0.75, "grad_norm": 0.6409012219677769, "kl": 0.02660539746284485, "learning_rate": 4.707216352502974e-08, "loss": 0.0068, "num_tokens": 91906937.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.5232656002044678, "sampling/importance_sampling_ratio/mean": 0.9998744130134583, "sampling/importance_sampling_ratio/min": 0.7558307647705078, "sampling/sampling_logp_difference/max": 0.4208564758300781, "sampling/sampling_logp_difference/mean": 0.016481786966323853, "step": 2145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1180.0, "completions/max_terminated_length": 1180.0, "completions/mean_length": 526.6875, "completions/mean_terminated_length": 526.6875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.5941407680511475, "epoch": 2.6299019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 0.686756916969148, "kl": 0.021774420514702797, "learning_rate": 4.6770869105380914e-08, "loss": -0.0428, "num_tokens": 91962917.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5795683860778809, "sampling/importance_sampling_ratio/mean": 1.0003068447113037, "sampling/importance_sampling_ratio/min": 0.6555105447769165, "sampling/sampling_logp_difference/max": 0.4571516513824463, "sampling/sampling_logp_difference/mean": 0.0160979051142931, "step": 2146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 365.828125, "completions/mean_terminated_length": 365.828125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.34388333559036255, "epoch": 2.631127450980392, "frac_reward_zero_std": 1.0, "grad_norm": 0.011847437454368066, "kl": 0.01689210534095764, "learning_rate": 4.647049471215497e-08, "loss": 0.0002, "num_tokens": 92004458.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6110248565673828, "sampling/importance_sampling_ratio/mean": 0.9998412132263184, "sampling/importance_sampling_ratio/min": 0.45643991231918335, "sampling/sampling_logp_difference/max": 0.784298300743103, "sampling/sampling_logp_difference/mean": 0.01132202334702015, "step": 2147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 302.0625, "completions/mean_terminated_length": 302.0625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.44855570793151855, "epoch": 2.6323529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.6583745493480744, "kl": 0.03697017952799797, "learning_rate": 4.6171040955092835e-08, "loss": 0.0116, "num_tokens": 92039870.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5402182340621948, "sampling/importance_sampling_ratio/mean": 0.9996145963668823, "sampling/importance_sampling_ratio/min": 0.577110767364502, "sampling/sampling_logp_difference/max": 0.5497210025787354, "sampling/sampling_logp_difference/mean": 0.015568509697914124, "step": 2148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 329.90625, "completions/mean_terminated_length": 329.90625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.4730050563812256, "epoch": 2.633578431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.017231967728609254, "kl": 0.02485346980392933, "learning_rate": 4.587250844206664e-08, "loss": 0.0002, "num_tokens": 92078488.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4193305969238281, "sampling/importance_sampling_ratio/mean": 1.00032377243042, "sampling/importance_sampling_ratio/min": 0.5271129608154297, "sampling/sampling_logp_difference/max": 0.6403404474258423, "sampling/sampling_logp_difference/mean": 0.01613643392920494, "step": 2149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 302.84375, "completions/mean_terminated_length": 302.84375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.5073530673980713, "epoch": 2.6348039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.017804264603758762, "kl": 0.026108356192708015, "learning_rate": 4.557489777907836e-08, "loss": 0.0003, "num_tokens": 92113806.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002410411834717, "sampling/importance_sampling_ratio/min": 0.6979686617851257, "sampling/sampling_logp_difference/max": 1.6199116706848145, "sampling/sampling_logp_difference/mean": 0.0151288453489542, "step": 2150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1150.0, "completions/max_terminated_length": 1150.0, "completions/mean_length": 566.46875, "completions/mean_terminated_length": 566.46875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.3078612685203552, "epoch": 2.6360294117647056, "frac_reward_zero_std": 0.75, "grad_norm": 0.28590791202605026, "kl": 0.014630671590566635, "learning_rate": 4.527820957025891e-08, "loss": 0.0042, "num_tokens": 92169292.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5744246244430542, "sampling/importance_sampling_ratio/mean": 1.000139594078064, "sampling/importance_sampling_ratio/min": 0.3612194359302521, "sampling/sampling_logp_difference/max": 1.018269658088684, "sampling/sampling_logp_difference/mean": 0.010436870157718658, "step": 2151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 290.6875, "completions/mean_terminated_length": 290.6875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.45187270641326904, "epoch": 2.6372549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 0.5986366236489944, "kl": 0.02953849919140339, "learning_rate": 4.498244441786675e-08, "loss": 0.0036, "num_tokens": 92204008.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4351998567581177, "sampling/importance_sampling_ratio/mean": 0.9997985363006592, "sampling/importance_sampling_ratio/min": 0.6255089640617371, "sampling/sampling_logp_difference/max": 0.4691896438598633, "sampling/sampling_logp_difference/mean": 0.014835581183433533, "step": 2152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 389.765625, "completions/mean_terminated_length": 389.765625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.41199058294296265, "epoch": 2.638480392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.48725816712817815, "kl": 0.01954377256333828, "learning_rate": 4.4687602922286016e-08, "loss": -0.0041, "num_tokens": 92246329.0, "reward": -0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.850350022315979, "sampling/importance_sampling_ratio/mean": 1.0003166198730469, "sampling/importance_sampling_ratio/min": 0.5928933024406433, "sampling/sampling_logp_difference/max": 0.6153748035430908, "sampling/sampling_logp_difference/mean": 0.013367127627134323, "step": 2153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1266.0, "completions/max_terminated_length": 1266.0, "completions/mean_length": 419.09375, "completions/mean_terminated_length": 419.09375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.4960559606552124, "epoch": 2.639705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.564847081925991, "kl": 0.025602562353014946, "learning_rate": 4.4393685682026505e-08, "loss": -0.002, "num_tokens": 92295775.0, "reward": 0.0, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6211717128753662, "sampling/importance_sampling_ratio/mean": 1.0000526905059814, "sampling/importance_sampling_ratio/min": 0.32326748967170715, "sampling/sampling_logp_difference/max": 1.1292752027511597, "sampling/sampling_logp_difference/mean": 0.01461585983633995, "step": 2154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 800.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 354.015625, "completions/mean_terminated_length": 354.015625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.5948508977890015, "epoch": 2.6409313725490198, "frac_reward_zero_std": 0.25, "grad_norm": 1.063074226447132, "kl": 0.03347254917025566, "learning_rate": 4.4100693293721516e-08, "loss": 0.0209, "num_tokens": 92334320.0, "reward": 0.5, "reward_std": 0.4973389506340027, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5780409574508667, "sampling/importance_sampling_ratio/mean": 0.9998078346252441, "sampling/importance_sampling_ratio/min": 0.6554862856864929, "sampling/sampling_logp_difference/max": 0.45618414878845215, "sampling/sampling_logp_difference/mean": 0.018191680312156677, "step": 2155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 908.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 312.859375, "completions/mean_terminated_length": 312.859375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.5045948028564453, "epoch": 2.642156862745098, "frac_reward_zero_std": 0.5, "grad_norm": 0.8770669685371277, "kl": 0.03713681548833847, "learning_rate": 4.3808626352127066e-08, "loss": -0.0229, "num_tokens": 92373351.0, "reward": 0.5625, "reward_std": 0.5123475193977356, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.47539484500885, "sampling/importance_sampling_ratio/mean": 1.00016188621521, "sampling/importance_sampling_ratio/min": 0.649113118648529, "sampling/sampling_logp_difference/max": 0.4321483373641968, "sampling/sampling_logp_difference/mean": 0.016803346574306488, "step": 2156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 342.4375, "completions/mean_terminated_length": 342.4375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.4655565917491913, "epoch": 2.6433823529411766, "frac_reward_zero_std": 0.75, "grad_norm": 0.633960951451073, "kl": 0.03835434094071388, "learning_rate": 4.351748545012057e-08, "loss": 0.0244, "num_tokens": 92412275.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.374263882637024, "sampling/importance_sampling_ratio/mean": 1.0003995895385742, "sampling/importance_sampling_ratio/min": 0.6839761137962341, "sampling/sampling_logp_difference/max": 0.37983226776123047, "sampling/sampling_logp_difference/mean": 0.014549206010997295, "step": 2157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 411.40625, "completions/mean_terminated_length": 411.40625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.40187567472457886, "epoch": 2.644607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 0.5872958577243232, "kl": 0.02072880044579506, "learning_rate": 4.322727117869951e-08, "loss": 0.0102, "num_tokens": 92458525.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.404646396636963, "sampling/importance_sampling_ratio/mean": 0.9999189376831055, "sampling/importance_sampling_ratio/min": 0.6622371673583984, "sampling/sampling_logp_difference/max": 0.41213154792785645, "sampling/sampling_logp_difference/mean": 0.012898609042167664, "step": 2158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 294.71875, "completions/mean_terminated_length": 294.71875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.453173965215683, "epoch": 2.6458333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 0.5858210282896514, "kl": 0.031289417296648026, "learning_rate": 4.2937984126980686e-08, "loss": 0.0206, "num_tokens": 92492555.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.3694278001785278, "sampling/importance_sampling_ratio/mean": 0.9998902082443237, "sampling/importance_sampling_ratio/min": 0.7279112339019775, "sampling/sampling_logp_difference/max": 0.31757616996765137, "sampling/sampling_logp_difference/mean": 0.014219855889678001, "step": 2159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 837.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 383.625, "completions/mean_terminated_length": 383.625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.609261691570282, "epoch": 2.6470588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 0.7787017444755466, "kl": 0.03436736762523651, "learning_rate": 4.2649624882198196e-08, "loss": 0.0142, "num_tokens": 92537395.0, "reward": 0.125, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.4753813743591309, "sampling/importance_sampling_ratio/mean": 1.0000814199447632, "sampling/importance_sampling_ratio/min": 0.39502081274986267, "sampling/sampling_logp_difference/max": 0.9288167953491211, "sampling/sampling_logp_difference/mean": 0.017357073724269867, "step": 2160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 289.46875, "completions/mean_terminated_length": 289.46875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.4560351073741913, "epoch": 2.6482843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 0.791271504796823, "kl": 0.022995686158537865, "learning_rate": 4.2362194029703256e-08, "loss": 0.0169, "num_tokens": 92570785.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.445144534111023, "sampling/importance_sampling_ratio/mean": 0.9999598860740662, "sampling/importance_sampling_ratio/min": 0.6771822571754456, "sampling/sampling_logp_difference/max": 0.3898148536682129, "sampling/sampling_logp_difference/mean": 0.015091652981936932, "step": 2161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1128.0, "completions/max_terminated_length": 1128.0, "completions/mean_length": 421.78125, "completions/mean_terminated_length": 421.78125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.4318383038043976, "epoch": 2.6495098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 0.415115381735649, "kl": 0.02210388518869877, "learning_rate": 4.207569215296214e-08, "loss": 0.0027, "num_tokens": 92615555.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4918524026870728, "sampling/importance_sampling_ratio/mean": 0.9998021125793457, "sampling/importance_sampling_ratio/min": 0.693626344203949, "sampling/sampling_logp_difference/max": 0.400018572807312, "sampling/sampling_logp_difference/mean": 0.01365116611123085, "step": 2162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 341.90625, "completions/mean_terminated_length": 341.90625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.3437095880508423, "epoch": 2.650735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.009143073188704678, "kl": 0.010384215041995049, "learning_rate": 4.179011983355568e-08, "loss": 0.0001, "num_tokens": 92662253.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001939535140991, "sampling/importance_sampling_ratio/min": 0.6356238126754761, "sampling/sampling_logp_difference/max": 0.8979084491729736, "sampling/sampling_logp_difference/mean": 0.012094703502953053, "step": 2163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 350.65625, "completions/mean_terminated_length": 350.65625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.37913262844085693, "epoch": 2.6519607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.008941348660174648, "kl": 0.012654365040361881, "learning_rate": 4.150547765117746e-08, "loss": 0.0001, "num_tokens": 92700311.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4036005735397339, "sampling/importance_sampling_ratio/mean": 1.0000085830688477, "sampling/importance_sampling_ratio/min": 0.6639572381973267, "sampling/sampling_logp_difference/max": 0.40953755378723145, "sampling/sampling_logp_difference/mean": 0.011886341497302055, "step": 2164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 363.734375, "completions/mean_terminated_length": 363.734375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.41701072454452515, "epoch": 2.653186274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.7309252692373709, "kl": 0.016792792826890945, "learning_rate": 4.1221766183633045e-08, "loss": 0.0202, "num_tokens": 92747126.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.570647954940796, "sampling/importance_sampling_ratio/mean": 0.9999947547912598, "sampling/importance_sampling_ratio/min": 0.6622481346130371, "sampling/sampling_logp_difference/max": 0.4514882564544678, "sampling/sampling_logp_difference/mean": 0.013499492779374123, "step": 2165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 214.5625, "completions/mean_terminated_length": 214.5625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.334402859210968, "epoch": 2.6544117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.012990471161878884, "kl": 0.01642770692706108, "learning_rate": 4.0938986006838926e-08, "loss": 0.0002, "num_tokens": 92776298.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.609859585762024, "sampling/importance_sampling_ratio/mean": 0.999617338180542, "sampling/importance_sampling_ratio/min": 0.6785916090011597, "sampling/sampling_logp_difference/max": 0.476146936416626, "sampling/sampling_logp_difference/mean": 0.013193923979997635, "step": 2166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 308.78125, "completions/mean_terminated_length": 308.78125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.6274556517601013, "epoch": 2.655637254901961, "frac_reward_zero_std": 0.5, "grad_norm": 0.908891136292583, "kl": 0.04834940284490585, "learning_rate": 4.065713769482082e-08, "loss": 0.0026, "num_tokens": 92814748.0, "reward": 0.625, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.5989848375320435, "sampling/importance_sampling_ratio/mean": 0.9997384548187256, "sampling/importance_sampling_ratio/min": 0.526940643787384, "sampling/sampling_logp_difference/max": 0.6406674385070801, "sampling/sampling_logp_difference/mean": 0.019060824066400528, "step": 2167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 313.484375, "completions/mean_terminated_length": 313.484375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.4534775912761688, "epoch": 2.656862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.6187017905015856, "kl": 0.03020528331398964, "learning_rate": 4.037622181971295e-08, "loss": 0.0095, "num_tokens": 92851899.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6262576580047607, "sampling/importance_sampling_ratio/mean": 0.9998173117637634, "sampling/importance_sampling_ratio/min": 0.6132686734199524, "sampling/sampling_logp_difference/max": 0.4889521598815918, "sampling/sampling_logp_difference/mean": 0.015431024134159088, "step": 2168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 300.453125, "completions/mean_terminated_length": 300.453125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.6351818442344666, "epoch": 2.6580882352941178, "frac_reward_zero_std": 0.5, "grad_norm": 1.0700393294051544, "kl": 0.04221993684768677, "learning_rate": 4.009623895175662e-08, "loss": 0.0356, "num_tokens": 92888264.0, "reward": 0.625, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.5622336864471436, "sampling/importance_sampling_ratio/mean": 1.000267744064331, "sampling/importance_sampling_ratio/min": 0.6639724373817444, "sampling/sampling_logp_difference/max": 0.44611668586730957, "sampling/sampling_logp_difference/mean": 0.018640775233507156, "step": 2169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 324.953125, "completions/mean_terminated_length": 324.953125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.42195308208465576, "epoch": 2.659313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.01388443418003193, "kl": 0.020983904600143433, "learning_rate": 3.981718965929959e-08, "loss": 0.0002, "num_tokens": 92931557.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2971214056015015, "sampling/importance_sampling_ratio/mean": 1.0003007650375366, "sampling/importance_sampling_ratio/min": 0.6977412700653076, "sampling/sampling_logp_difference/max": 0.3599069118499756, "sampling/sampling_logp_difference/mean": 0.012996615841984749, "step": 2170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 310.546875, "completions/mean_terminated_length": 310.546875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.4112635850906372, "epoch": 2.6605392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.011809253439083286, "kl": 0.01542516890913248, "learning_rate": 3.953907450879407e-08, "loss": 0.0001, "num_tokens": 92966408.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5512927770614624, "sampling/importance_sampling_ratio/mean": 1.0009139776229858, "sampling/importance_sampling_ratio/min": 0.5652694702148438, "sampling/sampling_logp_difference/max": 0.5704526901245117, "sampling/sampling_logp_difference/mean": 0.01473847683519125, "step": 2171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1028.0, "completions/max_terminated_length": 1028.0, "completions/mean_length": 338.96875, "completions/mean_terminated_length": 338.96875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.44123736023902893, "epoch": 2.661764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.013230320617174846, "kl": 0.01892825961112976, "learning_rate": 3.926189406479613e-08, "loss": 0.0002, "num_tokens": 93011254.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4919720888137817, "sampling/importance_sampling_ratio/mean": 1.0001320838928223, "sampling/importance_sampling_ratio/min": 0.7055361270904541, "sampling/sampling_logp_difference/max": 0.4000988006591797, "sampling/sampling_logp_difference/mean": 0.013944682665169239, "step": 2172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1080.0, "completions/max_terminated_length": 1080.0, "completions/mean_length": 405.453125, "completions/mean_terminated_length": 405.453125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.6114040613174438, "epoch": 2.6629901960784315, "frac_reward_zero_std": 0.25, "grad_norm": 0.924210138983583, "kl": 0.04314914345741272, "learning_rate": 3.898564888996475e-08, "loss": -0.0275, "num_tokens": 93054371.0, "reward": 0.125, "reward_std": 0.6789814233779907, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.634942650794983, "sampling/importance_sampling_ratio/mean": 1.000441551208496, "sampling/importance_sampling_ratio/min": 0.4854905903339386, "sampling/sampling_logp_difference/max": 0.7225953340530396, "sampling/sampling_logp_difference/mean": 0.017150260508060455, "step": 2173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1027.0, "completions/max_terminated_length": 1027.0, "completions/mean_length": 409.734375, "completions/mean_terminated_length": 409.734375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.49351853132247925, "epoch": 2.6642156862745097, "frac_reward_zero_std": 0.5, "grad_norm": 0.7500545949073216, "kl": 0.049409765750169754, "learning_rate": 3.871033954505998e-08, "loss": -0.0333, "num_tokens": 93094498.0, "reward": -0.125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.4143016338348389, "sampling/importance_sampling_ratio/mean": 1.0003690719604492, "sampling/importance_sampling_ratio/min": 0.6387597322463989, "sampling/sampling_logp_difference/max": 0.4482269287109375, "sampling/sampling_logp_difference/mean": 0.015478448010981083, "step": 2174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 812.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 297.734375, "completions/mean_terminated_length": 297.734375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.4581536054611206, "epoch": 2.6654411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 0.749022746400144, "kl": 0.02056906372308731, "learning_rate": 3.843596658894232e-08, "loss": 0.0374, "num_tokens": 93131761.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.458642601966858, "sampling/importance_sampling_ratio/mean": 0.9997742772102356, "sampling/importance_sampling_ratio/min": 0.6160301566123962, "sampling/sampling_logp_difference/max": 0.48445940017700195, "sampling/sampling_logp_difference/mean": 0.014289296232163906, "step": 2175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 265.265625, "completions/mean_terminated_length": 265.265625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.4546167850494385, "epoch": 2.6666666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.019155484231062718, "kl": 0.04094024375081062, "learning_rate": 3.816253057857144e-08, "loss": 0.0003, "num_tokens": 93164914.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.52510666847229, "sampling/importance_sampling_ratio/mean": 0.9999338388442993, "sampling/importance_sampling_ratio/min": 0.6215102672576904, "sampling/sampling_logp_difference/max": 0.4756028652191162, "sampling/sampling_logp_difference/mean": 0.015673184767365456, "step": 2176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 342.40625, "completions/mean_terminated_length": 342.40625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.4054229259490967, "epoch": 2.667892156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.013348902792119895, "kl": 0.015296544879674911, "learning_rate": 3.789003206900537e-08, "loss": 0.0002, "num_tokens": 93208476.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4651987552642822, "sampling/importance_sampling_ratio/mean": 0.9996941089630127, "sampling/importance_sampling_ratio/min": 0.6216425895690918, "sampling/sampling_logp_difference/max": 0.4753899574279785, "sampling/sampling_logp_difference/mean": 0.013940153643488884, "step": 2177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 439.640625, "completions/mean_terminated_length": 439.640625, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.4622269868850708, "epoch": 2.6691176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 0.7062581990322453, "kl": 0.025069234892725945, "learning_rate": 3.7618471613398597e-08, "loss": 0.0184, "num_tokens": 93261333.0, "reward": 0.71875, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.5545698404312134, "sampling/importance_sampling_ratio/mean": 0.9996371865272522, "sampling/importance_sampling_ratio/min": 0.4469165503978729, "sampling/sampling_logp_difference/max": 0.8053834438323975, "sampling/sampling_logp_difference/mean": 0.014205265790224075, "step": 2178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1614.0, "completions/max_terminated_length": 1614.0, "completions/mean_length": 536.40625, "completions/mean_terminated_length": 536.40625, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.5057632923126221, "epoch": 2.670343137254902, "frac_reward_zero_std": 0.5, "grad_norm": 0.6164559635240208, "kl": 0.020866716280579567, "learning_rate": 3.734784976300165e-08, "loss": 0.0307, "num_tokens": 93317823.0, "reward": 0.59375, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.8393701314926147, "sampling/importance_sampling_ratio/mean": 1.0002498626708984, "sampling/importance_sampling_ratio/min": 0.5181258916854858, "sampling/sampling_logp_difference/max": 0.6575369834899902, "sampling/sampling_logp_difference/mean": 0.014194350689649582, "step": 2179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 274.515625, "completions/mean_terminated_length": 274.515625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.5111697912216187, "epoch": 2.6715686274509802, "frac_reward_zero_std": 0.75, "grad_norm": 0.6434468165846118, "kl": 0.0249460618942976, "learning_rate": 3.7078167067159826e-08, "loss": 0.0052, "num_tokens": 93350384.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.475433349609375, "sampling/importance_sampling_ratio/mean": 1.0005995035171509, "sampling/importance_sampling_ratio/min": 0.6707435250282288, "sampling/sampling_logp_difference/max": 0.3993685245513916, "sampling/sampling_logp_difference/mean": 0.017399121075868607, "step": 2180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1218.0, "completions/max_terminated_length": 1218.0, "completions/mean_length": 439.859375, "completions/mean_terminated_length": 439.859375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.5827081799507141, "epoch": 2.672794117647059, "frac_reward_zero_std": 0.75, "grad_norm": 0.6458811607145419, "kl": 0.018581073731184006, "learning_rate": 3.6809424073311944e-08, "loss": -0.0577, "num_tokens": 93397975.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.5235601663589478, "sampling/importance_sampling_ratio/mean": 0.9999580383300781, "sampling/importance_sampling_ratio/min": 0.32684406638145447, "sampling/sampling_logp_difference/max": 1.118272066116333, "sampling/sampling_logp_difference/mean": 0.017109178006649017, "step": 2181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 286.890625, "completions/mean_terminated_length": 286.890625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.4381641149520874, "epoch": 2.674019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.5893060424956509, "kl": 0.027370978146791458, "learning_rate": 3.654162132698918e-08, "loss": -0.0118, "num_tokens": 93431504.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6235814094543457, "sampling/importance_sampling_ratio/mean": 0.9996362924575806, "sampling/importance_sampling_ratio/min": 0.6479434370994568, "sampling/sampling_logp_difference/max": 0.4846343994140625, "sampling/sampling_logp_difference/mean": 0.013666161336004734, "step": 2182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 312.265625, "completions/mean_terminated_length": 312.265625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.4803517460823059, "epoch": 2.6752450980392157, "frac_reward_zero_std": 0.5, "grad_norm": 0.909754875788143, "kl": 0.030572451651096344, "learning_rate": 3.627475937181407e-08, "loss": 0.0105, "num_tokens": 93472817.0, "reward": 0.5625, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.3463016748428345, "sampling/importance_sampling_ratio/mean": 0.9999368190765381, "sampling/importance_sampling_ratio/min": 0.7161918878555298, "sampling/sampling_logp_difference/max": 0.33380722999572754, "sampling/sampling_logp_difference/mean": 0.013788733631372452, "step": 2183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1238.0, "completions/max_terminated_length": 1238.0, "completions/mean_length": 370.796875, "completions/mean_terminated_length": 370.796875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.42588216066360474, "epoch": 2.6764705882352944, "frac_reward_zero_std": 1.0, "grad_norm": 0.010188241197440195, "kl": 0.017837708815932274, "learning_rate": 3.600883874949967e-08, "loss": 0.0002, "num_tokens": 93512340.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006251335144043, "sampling/importance_sampling_ratio/min": 0.6902033686637878, "sampling/sampling_logp_difference/max": 1.0870728492736816, "sampling/sampling_logp_difference/mean": 0.014463575556874275, "step": 2184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1035.0, "completions/max_terminated_length": 1035.0, "completions/mean_length": 436.703125, "completions/mean_terminated_length": 436.703125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.45462387800216675, "epoch": 2.6776960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 0.5890728215688394, "kl": 0.015423311851918697, "learning_rate": 3.574385999984786e-08, "loss": -0.014, "num_tokens": 93558417.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6331628561019897, "sampling/importance_sampling_ratio/mean": 1.0004830360412598, "sampling/importance_sampling_ratio/min": 0.617653489112854, "sampling/sampling_logp_difference/max": 0.49051856994628906, "sampling/sampling_logp_difference/mean": 0.014521213248372078, "step": 2185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 344.3125, "completions/mean_terminated_length": 344.3125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.3956924378871918, "epoch": 2.678921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.5906638114403924, "kl": 0.019515957683324814, "learning_rate": 3.54798236607487e-08, "loss": 0.0244, "num_tokens": 93595157.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4284634590148926, "sampling/importance_sampling_ratio/mean": 0.999590277671814, "sampling/importance_sampling_ratio/min": 0.5922631621360779, "sampling/sampling_logp_difference/max": 0.5238041877746582, "sampling/sampling_logp_difference/mean": 0.013699344359338284, "step": 2186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 325.734375, "completions/mean_terminated_length": 325.734375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.4836278259754181, "epoch": 2.6801470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.6135034521823749, "kl": 0.038698941469192505, "learning_rate": 3.5216730268179337e-08, "loss": 0.0078, "num_tokens": 93638404.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.5689958333969116, "sampling/importance_sampling_ratio/mean": 1.0005279779434204, "sampling/importance_sampling_ratio/min": 0.39079710841178894, "sampling/sampling_logp_difference/max": 0.9395667910575867, "sampling/sampling_logp_difference/mean": 0.014523688703775406, "step": 2187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 958.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 342.796875, "completions/mean_terminated_length": 342.796875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.4544488787651062, "epoch": 2.681372549019608, "frac_reward_zero_std": 0.75, "grad_norm": 0.6250024362084945, "kl": 0.01590898633003235, "learning_rate": 3.495458035620252e-08, "loss": -0.031, "num_tokens": 93677303.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4687858819961548, "sampling/importance_sampling_ratio/mean": 1.0002470016479492, "sampling/importance_sampling_ratio/min": 0.6781128644943237, "sampling/sampling_logp_difference/max": 0.3884415626525879, "sampling/sampling_logp_difference/mean": 0.014223344624042511, "step": 2188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1075.0, "completions/max_terminated_length": 1075.0, "completions/mean_length": 408.0625, "completions/mean_terminated_length": 408.0625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.5359256863594055, "epoch": 2.6825980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.5189759082628763, "kl": 0.02488449588418007, "learning_rate": 3.469337445696629e-08, "loss": 0.0166, "num_tokens": 93720875.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4654594659805298, "sampling/importance_sampling_ratio/mean": 1.0001146793365479, "sampling/importance_sampling_ratio/min": 0.6327075958251953, "sampling/sampling_logp_difference/max": 0.4577469825744629, "sampling/sampling_logp_difference/mean": 0.016843516379594803, "step": 2189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1837.0, "completions/max_terminated_length": 1837.0, "completions/mean_length": 496.71875, "completions/mean_terminated_length": 496.71875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.5413087010383606, "epoch": 2.6838235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 0.6882241624654968, "kl": 0.020549602806568146, "learning_rate": 3.4433113100701683e-08, "loss": -0.0869, "num_tokens": 93769705.0, "reward": 0.65625, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.441927433013916, "sampling/importance_sampling_ratio/mean": 0.9998815059661865, "sampling/importance_sampling_ratio/min": 0.6446142792701721, "sampling/sampling_logp_difference/max": 0.4391031265258789, "sampling/sampling_logp_difference/mean": 0.016351643949747086, "step": 2190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1517.0, "completions/max_terminated_length": 1517.0, "completions/mean_length": 524.15625, "completions/mean_terminated_length": 524.15625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.37261050939559937, "epoch": 2.685049019607843, "frac_reward_zero_std": 0.75, "grad_norm": 0.4689892810694198, "kl": 0.015979494899511337, "learning_rate": 3.417379681572296e-08, "loss": 0.0176, "num_tokens": 93820899.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.6834337711334229, "sampling/importance_sampling_ratio/mean": 1.000479817390442, "sampling/importance_sampling_ratio/min": 0.6624715328216553, "sampling/sampling_logp_difference/max": 0.5208356380462646, "sampling/sampling_logp_difference/mean": 0.012050756253302097, "step": 2191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1190.0, "completions/max_terminated_length": 1190.0, "completions/mean_length": 341.8125, "completions/mean_terminated_length": 341.8125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.5203717947006226, "epoch": 2.686274509803922, "frac_reward_zero_std": 0.75, "grad_norm": 0.6283032926461651, "kl": 0.021181093528866768, "learning_rate": 3.391542612842574e-08, "loss": -0.0512, "num_tokens": 93862151.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.3633372783660889, "sampling/importance_sampling_ratio/mean": 0.999996542930603, "sampling/importance_sampling_ratio/min": 0.6418452858924866, "sampling/sampling_logp_difference/max": 0.4434080123901367, "sampling/sampling_logp_difference/mean": 0.01622844487428665, "step": 2192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 321.265625, "completions/mean_terminated_length": 321.265625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.43313953280448914, "epoch": 2.6875, "frac_reward_zero_std": 0.75, "grad_norm": 0.6841049502611429, "kl": 0.03364472836256027, "learning_rate": 3.365800156328619e-08, "loss": 0.07, "num_tokens": 93902152.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.5505036115646362, "sampling/importance_sampling_ratio/mean": 0.9997625946998596, "sampling/importance_sampling_ratio/min": 0.6147714853286743, "sampling/sampling_logp_difference/max": 0.4865047335624695, "sampling/sampling_logp_difference/mean": 0.01432824693620205, "step": 2193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 898.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 375.4375, "completions/mean_terminated_length": 375.4375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.49880069494247437, "epoch": 2.688725490196078, "frac_reward_zero_std": 0.75, "grad_norm": 0.5219789620822387, "kl": 0.020923597738146782, "learning_rate": 3.3401523642859805e-08, "loss": -0.0092, "num_tokens": 93947684.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.3700482845306396, "sampling/importance_sampling_ratio/mean": 0.9996163845062256, "sampling/importance_sampling_ratio/min": 0.2831355631351471, "sampling/sampling_logp_difference/max": 1.2618294954299927, "sampling/sampling_logp_difference/mean": 0.015523786656558514, "step": 2194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 315.8125, "completions/mean_terminated_length": 315.8125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.5108804702758789, "epoch": 2.689950980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.7089663043137233, "kl": 0.02362756058573723, "learning_rate": 3.3145992887780475e-08, "loss": 0.026, "num_tokens": 93984616.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.5971654653549194, "sampling/importance_sampling_ratio/mean": 1.0003596544265747, "sampling/importance_sampling_ratio/min": 0.6394304037094116, "sampling/sampling_logp_difference/max": 0.4682304859161377, "sampling/sampling_logp_difference/mean": 0.014306722208857536, "step": 2195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 382.953125, "completions/mean_terminated_length": 382.953125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.4124827980995178, "epoch": 2.6911764705882355, "frac_reward_zero_std": 0.75, "grad_norm": 0.5634302306807625, "kl": 0.01561555452644825, "learning_rate": 3.289140981675964e-08, "loss": -0.0112, "num_tokens": 94026869.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5394909381866455, "sampling/importance_sampling_ratio/mean": 0.9997172951698303, "sampling/importance_sampling_ratio/min": 0.6157910823822021, "sampling/sampling_logp_difference/max": 0.4848475456237793, "sampling/sampling_logp_difference/mean": 0.013275430537760258, "step": 2196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1090.0, "completions/max_terminated_length": 1090.0, "completions/mean_length": 498.453125, "completions/mean_terminated_length": 498.453125, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "entropy": 0.5563613176345825, "epoch": 2.6924019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 0.6708726552416912, "kl": 0.02358122542500496, "learning_rate": 3.263777494658448e-08, "loss": 0.0416, "num_tokens": 94080322.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.5776947736740112, "sampling/importance_sampling_ratio/mean": 1.0000419616699219, "sampling/importance_sampling_ratio/min": 0.7348998188972473, "sampling/sampling_logp_difference/max": 0.4559648036956787, "sampling/sampling_logp_difference/mean": 0.015086319297552109, "step": 2197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 330.1875, "completions/mean_terminated_length": 330.1875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.46512919664382935, "epoch": 2.693627450980392, "frac_reward_zero_std": 0.75, "grad_norm": 0.492975622583298, "kl": 0.030317893251776695, "learning_rate": 3.2385088792118044e-08, "loss": 0.0018, "num_tokens": 94117502.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.862428903579712, "sampling/importance_sampling_ratio/mean": 1.0001270771026611, "sampling/importance_sampling_ratio/min": 0.6902703642845154, "sampling/sampling_logp_difference/max": 0.6218814849853516, "sampling/sampling_logp_difference/mean": 0.015323083847761154, "step": 2198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 403.59375, "completions/mean_terminated_length": 403.59375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.52828049659729, "epoch": 2.6948529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.7578963228196325, "kl": 0.03253309428691864, "learning_rate": 3.2133351866296955e-08, "loss": -0.039, "num_tokens": 94163188.0, "reward": 0.25, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.6024463176727295, "sampling/importance_sampling_ratio/mean": 0.999915599822998, "sampling/importance_sampling_ratio/min": 0.6146404147148132, "sampling/sampling_logp_difference/max": 0.4867178797721863, "sampling/sampling_logp_difference/mean": 0.01487363874912262, "step": 2199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 335.203125, "completions/mean_terminated_length": 335.203125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.3984048366546631, "epoch": 2.696078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.010135917239215371, "kl": 0.01205175556242466, "learning_rate": 3.188256468013139e-08, "loss": 0.0001, "num_tokens": 94203025.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3651678562164307, "sampling/importance_sampling_ratio/mean": 0.999480664730072, "sampling/importance_sampling_ratio/min": 0.6784735918045044, "sampling/sampling_logp_difference/max": 0.3879096508026123, "sampling/sampling_logp_difference/mean": 0.013071643188595772, "step": 2200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.6825083494186401, "epoch": 2.6973039215686274, "frac_reward_zero_std": 0.25, "grad_norm": 1.3346507873723046, "kl": 0.0462813675403595, "learning_rate": 3.163272774270348e-08, "loss": -0.065, "num_tokens": 94234873.0, "reward": 0.28125, "reward_std": 0.5977387428283691, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.4508885145187378, "sampling/importance_sampling_ratio/mean": 0.9997091293334961, "sampling/importance_sampling_ratio/min": 0.7061358690261841, "sampling/sampling_logp_difference/max": 0.3721761703491211, "sampling/sampling_logp_difference/mean": 0.018668007105588913, "step": 2201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 291.65625, "completions/mean_terminated_length": 291.65625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.466967910528183, "epoch": 2.6985294117647056, "frac_reward_zero_std": 0.75, "grad_norm": 0.733502894326454, "kl": 0.028825923800468445, "learning_rate": 3.1383841561166134e-08, "loss": -0.0235, "num_tokens": 94266083.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6068168878555298, "sampling/importance_sampling_ratio/mean": 1.0005357265472412, "sampling/importance_sampling_ratio/min": 0.6520295739173889, "sampling/sampling_logp_difference/max": 0.4742550849914551, "sampling/sampling_logp_difference/mean": 0.01565352827310562, "step": 2202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1089.0, "completions/max_terminated_length": 1089.0, "completions/mean_length": 341.296875, "completions/mean_terminated_length": 341.296875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.6221673488616943, "epoch": 2.6997549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 0.9355966317527868, "kl": 0.028829948976635933, "learning_rate": 3.1135906640742836e-08, "loss": 0.0281, "num_tokens": 94306294.0, "reward": 0.4375, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.4141842126846313, "sampling/importance_sampling_ratio/mean": 0.9998221397399902, "sampling/importance_sampling_ratio/min": 0.6815834641456604, "sampling/sampling_logp_difference/max": 0.38333654403686523, "sampling/sampling_logp_difference/mean": 0.01818588376045227, "step": 2203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1222.0, "completions/max_terminated_length": 1222.0, "completions/mean_length": 384.90625, "completions/mean_terminated_length": 384.90625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.369319349527359, "epoch": 2.700980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.013416798195766954, "kl": 0.014719956554472446, "learning_rate": 3.088892348472561e-08, "loss": 0.0001, "num_tokens": 94349408.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6620323657989502, "sampling/importance_sampling_ratio/mean": 0.9998986721038818, "sampling/importance_sampling_ratio/min": 0.6049888134002686, "sampling/sampling_logp_difference/max": 0.5080411434173584, "sampling/sampling_logp_difference/mean": 0.012760384939610958, "step": 2204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 299.5625, "completions/mean_terminated_length": 299.5625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.5343676209449768, "epoch": 2.702205882352941, "frac_reward_zero_std": 0.75, "grad_norm": 0.6808641846343869, "kl": 0.0304710790514946, "learning_rate": 3.064289259447455e-08, "loss": 0.0178, "num_tokens": 94382452.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.348798155784607, "sampling/importance_sampling_ratio/mean": 0.9999092817306519, "sampling/importance_sampling_ratio/min": 0.6141843795776367, "sampling/sampling_logp_difference/max": 0.4874601364135742, "sampling/sampling_logp_difference/mean": 0.017089789733290672, "step": 2205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 446.140625, "completions/mean_terminated_length": 446.140625, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "entropy": 0.6610206365585327, "epoch": 2.7034313725490198, "frac_reward_zero_std": 0.25, "grad_norm": 0.8057045527855579, "kl": 0.03859423100948334, "learning_rate": 3.039781446941697e-08, "loss": -0.0372, "num_tokens": 94430285.0, "reward": 0.59375, "reward_std": 0.6205305457115173, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.663864016532898, "sampling/importance_sampling_ratio/mean": 0.9997532367706299, "sampling/importance_sampling_ratio/min": 0.3392254710197449, "sampling/sampling_logp_difference/max": 1.0810902118682861, "sampling/sampling_logp_difference/mean": 0.017790645360946655, "step": 2206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 284.75, "completions/mean_terminated_length": 284.75, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.557000994682312, "epoch": 2.704656862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.013481917928261259, "kl": 0.018794454634189606, "learning_rate": 3.015368960704584e-08, "loss": 0.0002, "num_tokens": 94466301.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5323885679244995, "sampling/importance_sampling_ratio/mean": 1.0002732276916504, "sampling/importance_sampling_ratio/min": 0.625548779964447, "sampling/sampling_logp_difference/max": 0.46912598609924316, "sampling/sampling_logp_difference/mean": 0.01812085695564747, "step": 2207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 355.46875, "completions/mean_terminated_length": 355.46875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.5426332950592041, "epoch": 2.7058823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 0.8347137230328404, "kl": 0.043868035078048706, "learning_rate": 2.991051850291915e-08, "loss": 0.0242, "num_tokens": 94503579.0, "reward": 0.09375, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.6088857650756836, "sampling/importance_sampling_ratio/mean": 1.000251293182373, "sampling/importance_sampling_ratio/min": 0.6698979139328003, "sampling/sampling_logp_difference/max": 0.4755418300628662, "sampling/sampling_logp_difference/mean": 0.016561981290578842, "step": 2208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1105.0, "completions/max_terminated_length": 1105.0, "completions/mean_length": 374.53125, "completions/mean_terminated_length": 374.53125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.39853766560554504, "epoch": 2.707107843137255, "frac_reward_zero_std": 0.75, "grad_norm": 0.5278488920816967, "kl": 0.019798535853624344, "learning_rate": 2.9668301650658756e-08, "loss": -0.0352, "num_tokens": 94548349.0, "reward": -0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": -0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.3718715906143188, "sampling/importance_sampling_ratio/mean": 1.00008225440979, "sampling/importance_sampling_ratio/min": 0.7343106865882874, "sampling/sampling_logp_difference/max": 0.3161759376525879, "sampling/sampling_logp_difference/mean": 0.012415362522006035, "step": 2209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1202.0, "completions/max_terminated_length": 1202.0, "completions/mean_length": 379.515625, "completions/mean_terminated_length": 379.515625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.43077051639556885, "epoch": 2.7083333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 0.5590952767570064, "kl": 0.01541432086378336, "learning_rate": 2.9427039541949638e-08, "loss": 0.0192, "num_tokens": 94588990.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4357898235321045, "sampling/importance_sampling_ratio/mean": 0.9997974038124084, "sampling/importance_sampling_ratio/min": 0.6775035262107849, "sampling/sampling_logp_difference/max": 0.38934051990509033, "sampling/sampling_logp_difference/mean": 0.012576785869896412, "step": 2210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1172.0, "completions/max_terminated_length": 1172.0, "completions/mean_length": 357.21875, "completions/mean_terminated_length": 357.21875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.4786778390407562, "epoch": 2.7095588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 0.7590014704299647, "kl": 0.01831422559916973, "learning_rate": 2.918673266653865e-08, "loss": -0.0044, "num_tokens": 94629612.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.6139318943023682, "sampling/importance_sampling_ratio/mean": 1.000009298324585, "sampling/importance_sampling_ratio/min": 0.6187257170677185, "sampling/sampling_logp_difference/max": 0.48009324073791504, "sampling/sampling_logp_difference/mean": 0.014829698018729687, "step": 2211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 308.203125, "completions/mean_terminated_length": 308.203125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.4786732792854309, "epoch": 2.7107843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 0.7066002907306196, "kl": 0.02756316028535366, "learning_rate": 2.8947381512233305e-08, "loss": -0.0218, "num_tokens": 94666857.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.441131830215454, "sampling/importance_sampling_ratio/mean": 0.9993857145309448, "sampling/importance_sampling_ratio/min": 0.6429356336593628, "sampling/sampling_logp_difference/max": 0.4417107105255127, "sampling/sampling_logp_difference/mean": 0.01642749458551407, "step": 2212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 282.953125, "completions/mean_terminated_length": 282.953125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.46477949619293213, "epoch": 2.7120098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.019981065429094932, "kl": 0.028127755969762802, "learning_rate": 2.8708986564901504e-08, "loss": 0.0003, "num_tokens": 94703222.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5767884254455566, "sampling/importance_sampling_ratio/mean": 0.9998513460159302, "sampling/importance_sampling_ratio/min": 0.532294511795044, "sampling/sampling_logp_difference/max": 0.6305583715438843, "sampling/sampling_logp_difference/mean": 0.014360880479216576, "step": 2213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 272.484375, "completions/mean_terminated_length": 272.484375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.37175825238227844, "epoch": 2.713235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.012293100745966434, "kl": 0.015151958912611008, "learning_rate": 2.8471548308469706e-08, "loss": 0.0001, "num_tokens": 94732773.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.694084882736206, "sampling/importance_sampling_ratio/mean": 1.0002171993255615, "sampling/importance_sampling_ratio/min": 0.625119686126709, "sampling/sampling_logp_difference/max": 0.5271427631378174, "sampling/sampling_logp_difference/mean": 0.014161134138703346, "step": 2214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 295.390625, "completions/mean_terminated_length": 295.390625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.4166908860206604, "epoch": 2.7144607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 0.6544560721443025, "kl": 0.023886986076831818, "learning_rate": 2.8235067224922802e-08, "loss": -0.0233, "num_tokens": 94766414.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6251431703567505, "sampling/importance_sampling_ratio/mean": 1.0000993013381958, "sampling/importance_sampling_ratio/min": 0.6230321526527405, "sampling/sampling_logp_difference/max": 0.4855959415435791, "sampling/sampling_logp_difference/mean": 0.014379645697772503, "step": 2215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 470.21875, "completions/mean_terminated_length": 470.21875, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "entropy": 0.3575405478477478, "epoch": 2.715686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.546296885156146, "kl": 0.01485485676676035, "learning_rate": 2.799954379430208e-08, "loss": 0.0036, "num_tokens": 94818204.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6088894605636597, "sampling/importance_sampling_ratio/mean": 1.000126600265503, "sampling/importance_sampling_ratio/min": 0.5518200993537903, "sampling/sampling_logp_difference/max": 0.5945332050323486, "sampling/sampling_logp_difference/mean": 0.010559973306953907, "step": 2216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 297.046875, "completions/mean_terminated_length": 297.046875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.5376313924789429, "epoch": 2.7169117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 0.9444686998144807, "kl": 0.02422238513827324, "learning_rate": 2.7764978494705437e-08, "loss": 0.0676, "num_tokens": 94857599.0, "reward": 0.0, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4261256456375122, "sampling/importance_sampling_ratio/mean": 0.9998828768730164, "sampling/importance_sampling_ratio/min": 0.699869692325592, "sampling/sampling_logp_difference/max": 0.3568611145019531, "sampling/sampling_logp_difference/mean": 0.01665247417986393, "step": 2217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 716.0, "completions/max_terminated_length": 716.0, "completions/mean_length": 309.4375, "completions/mean_terminated_length": 309.4375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.3348027467727661, "epoch": 2.718137254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.6268552749165421, "kl": 0.023101806640625, "learning_rate": 2.753137180228543e-08, "loss": -0.0085, "num_tokens": 94890123.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6191203594207764, "sampling/importance_sampling_ratio/mean": 0.9996556043624878, "sampling/importance_sampling_ratio/min": 0.6080378890037537, "sampling/sampling_logp_difference/max": 0.49751806259155273, "sampling/sampling_logp_difference/mean": 0.012272598221898079, "step": 2218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1381.0, "completions/max_terminated_length": 1381.0, "completions/mean_length": 500.078125, "completions/mean_terminated_length": 500.078125, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "entropy": 0.5232434272766113, "epoch": 2.719362745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.4586098479445358, "kl": 0.024434436112642288, "learning_rate": 2.729872419124879e-08, "loss": -0.0068, "num_tokens": 94939584.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6975164413452148, "sampling/importance_sampling_ratio/mean": 1.0002378225326538, "sampling/importance_sampling_ratio/min": 0.6999858021736145, "sampling/sampling_logp_difference/max": 0.5291662216186523, "sampling/sampling_logp_difference/mean": 0.013527914881706238, "step": 2219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 264.28125, "completions/mean_terminated_length": 264.28125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.43096983432769775, "epoch": 2.7205882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.019692418169211308, "kl": 0.022423814982175827, "learning_rate": 2.7067036133855636e-08, "loss": 0.0002, "num_tokens": 94975634.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6208345890045166, "sampling/importance_sampling_ratio/mean": 0.9999746084213257, "sampling/importance_sampling_ratio/min": 0.6189461946487427, "sampling/sampling_logp_difference/max": 0.4829411506652832, "sampling/sampling_logp_difference/mean": 0.01618112623691559, "step": 2220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 278.640625, "completions/mean_terminated_length": 278.640625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.4326554238796234, "epoch": 2.721813725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.03122931420037609, "kl": 0.02165728062391281, "learning_rate": 2.6836308100417872e-08, "loss": 0.0002, "num_tokens": 95011627.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2972798347473145, "sampling/importance_sampling_ratio/mean": 0.9998713731765747, "sampling/importance_sampling_ratio/min": 0.5806476473808289, "sampling/sampling_logp_difference/max": 0.5436111688613892, "sampling/sampling_logp_difference/mean": 0.014432272873818874, "step": 2221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 320.28125, "completions/mean_terminated_length": 320.28125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.42901313304901123, "epoch": 2.7230392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.01660883415531075, "kl": 0.02547239139676094, "learning_rate": 2.6606540559298952e-08, "loss": 0.0002, "num_tokens": 95050685.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.809097409248352, "sampling/importance_sampling_ratio/mean": 1.0000054836273193, "sampling/importance_sampling_ratio/min": 0.5748293399810791, "sampling/sampling_logp_difference/max": 0.5928280353546143, "sampling/sampling_logp_difference/mean": 0.012815484777092934, "step": 2222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 278.453125, "completions/mean_terminated_length": 278.453125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.38187384605407715, "epoch": 2.724264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.012110882899997368, "kl": 0.01626858487725258, "learning_rate": 2.6377733976912232e-08, "loss": 0.0001, "num_tokens": 95083050.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.504974126815796, "sampling/importance_sampling_ratio/mean": 1.0004074573516846, "sampling/importance_sampling_ratio/min": 0.7239823341369629, "sampling/sampling_logp_difference/max": 0.4087756872177124, "sampling/sampling_logp_difference/mean": 0.013433999381959438, "step": 2223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 374.515625, "completions/mean_terminated_length": 374.515625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.4767635464668274, "epoch": 2.7254901960784315, "frac_reward_zero_std": 0.5, "grad_norm": 0.9806738242313254, "kl": 0.020406976342201233, "learning_rate": 2.6149888817720733e-08, "loss": 0.0639, "num_tokens": 95126331.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.56149160861969, "sampling/importance_sampling_ratio/mean": 0.9999184608459473, "sampling/importance_sampling_ratio/min": 0.6317293047904968, "sampling/sampling_logp_difference/max": 0.45929431915283203, "sampling/sampling_logp_difference/mean": 0.015051727183163166, "step": 2224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 347.328125, "completions/mean_terminated_length": 347.328125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.4504515528678894, "epoch": 2.7267156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.011053442440415612, "kl": 0.01574881747364998, "learning_rate": 2.5923005544235545e-08, "loss": 0.0002, "num_tokens": 95172000.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4481887817382812, "sampling/importance_sampling_ratio/mean": 1.0001811981201172, "sampling/importance_sampling_ratio/min": 0.5180891752243042, "sampling/sampling_logp_difference/max": 0.6576077938079834, "sampling/sampling_logp_difference/mean": 0.015256019309163094, "step": 2225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1517.0, "completions/max_terminated_length": 1517.0, "completions/mean_length": 543.890625, "completions/mean_terminated_length": 543.890625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.5578203797340393, "epoch": 2.7279411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 0.49939191051692555, "kl": 0.02215338870882988, "learning_rate": 2.5697084617015475e-08, "loss": -0.0295, "num_tokens": 95230217.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6574866771697998, "sampling/importance_sampling_ratio/mean": 0.9999018907546997, "sampling/importance_sampling_ratio/min": 0.6111041307449341, "sampling/sampling_logp_difference/max": 0.5053024291992188, "sampling/sampling_logp_difference/mean": 0.015879396349191666, "step": 2226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 885.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 411.015625, "completions/mean_terminated_length": 411.015625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.38775569200515747, "epoch": 2.7291666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.019384997167248934, "kl": 0.025220651179552078, "learning_rate": 2.547212649466568e-08, "loss": 0.0003, "num_tokens": 95278602.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4279274940490723, "sampling/importance_sampling_ratio/mean": 0.9998647570610046, "sampling/importance_sampling_ratio/min": 0.6447826027870178, "sampling/sampling_logp_difference/max": 0.4388420581817627, "sampling/sampling_logp_difference/mean": 0.01186995767056942, "step": 2227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 247.53125, "completions/mean_terminated_length": 247.53125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.5242138504981995, "epoch": 2.730392156862745, "frac_reward_zero_std": 0.25, "grad_norm": 1.3807046240951724, "kl": 0.0328475758433342, "learning_rate": 2.5248131633836823e-08, "loss": 0.0849, "num_tokens": 95318012.0, "reward": 0.09375, "reward_std": 0.5827301740646362, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.6588823795318604, "sampling/importance_sampling_ratio/mean": 0.9998443722724915, "sampling/importance_sampling_ratio/min": 0.06947699934244156, "sampling/sampling_logp_difference/max": 2.666759490966797, "sampling/sampling_logp_difference/mean": 0.01750490814447403, "step": 2228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1608.0, "completions/max_terminated_length": 1608.0, "completions/mean_length": 398.125, "completions/mean_terminated_length": 398.125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.593556821346283, "epoch": 2.7316176470588234, "frac_reward_zero_std": 0.25, "grad_norm": 0.9672331592776227, "kl": 0.03097037598490715, "learning_rate": 2.5025100489224406e-08, "loss": 0.0235, "num_tokens": 95362340.0, "reward": -0.125, "reward_std": 0.6645200252532959, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.319156289100647, "sampling/importance_sampling_ratio/mean": 0.999582052230835, "sampling/importance_sampling_ratio/min": 0.648799479007721, "sampling/sampling_logp_difference/max": 0.4326314926147461, "sampling/sampling_logp_difference/mean": 0.01760314218699932, "step": 2229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 878.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 401.65625, "completions/mean_terminated_length": 401.65625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.5647904872894287, "epoch": 2.732843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.4299054823472512, "kl": 0.02483641542494297, "learning_rate": 2.480303351356733e-08, "loss": 0.0023, "num_tokens": 95409310.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.7322709560394287, "sampling/importance_sampling_ratio/mean": 1.0000509023666382, "sampling/importance_sampling_ratio/min": 0.6878761649131775, "sampling/sampling_logp_difference/max": 0.5494332313537598, "sampling/sampling_logp_difference/mean": 0.01611713133752346, "step": 2230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 284.5625, "completions/mean_terminated_length": 284.5625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.4650837779045105, "epoch": 2.7340686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.019277327462560803, "kl": 0.02596260979771614, "learning_rate": 2.4581931157647674e-08, "loss": 0.0002, "num_tokens": 95444930.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4397000074386597, "sampling/importance_sampling_ratio/mean": 0.9999262094497681, "sampling/importance_sampling_ratio/min": 0.706262469291687, "sampling/sampling_logp_difference/max": 0.36443471908569336, "sampling/sampling_logp_difference/mean": 0.014896858483552933, "step": 2231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 282.84375, "completions/mean_terminated_length": 282.84375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.4215168356895447, "epoch": 2.735294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.01211875723996692, "kl": 0.019590934738516808, "learning_rate": 2.4361793870289028e-08, "loss": 0.0002, "num_tokens": 95481176.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5971711874008179, "sampling/importance_sampling_ratio/mean": 0.9999405145645142, "sampling/importance_sampling_ratio/min": 0.6877859234809875, "sampling/sampling_logp_difference/max": 0.4682340621948242, "sampling/sampling_logp_difference/mean": 0.015071826055645943, "step": 2232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1759.0, "completions/max_terminated_length": 1759.0, "completions/mean_length": 485.46875, "completions/mean_terminated_length": 485.46875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.4627479314804077, "epoch": 2.736519607843137, "frac_reward_zero_std": 0.5, "grad_norm": 0.8149393711091384, "kl": 0.020865924656391144, "learning_rate": 2.4142622098356326e-08, "loss": 0.11, "num_tokens": 95530534.0, "reward": 0.25, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.5461392402648926, "sampling/importance_sampling_ratio/mean": 1.0002117156982422, "sampling/importance_sampling_ratio/min": 0.5371774435043335, "sampling/sampling_logp_difference/max": 0.6214268207550049, "sampling/sampling_logp_difference/mean": 0.013875655829906464, "step": 2233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 360.890625, "completions/mean_terminated_length": 360.890625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.4299345314502716, "epoch": 2.7377450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.010463245315238746, "kl": 0.012503364123404026, "learning_rate": 2.3924416286754345e-08, "loss": 0.0001, "num_tokens": 95568911.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.630758285522461, "sampling/importance_sampling_ratio/mean": 1.0000767707824707, "sampling/importance_sampling_ratio/min": 0.41753995418548584, "sampling/sampling_logp_difference/max": 0.8733750581741333, "sampling/sampling_logp_difference/mean": 0.013493189588189125, "step": 2234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1035.0, "completions/max_terminated_length": 1035.0, "completions/mean_length": 429.640625, "completions/mean_terminated_length": 429.640625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "entropy": 0.3341035842895508, "epoch": 2.7389705882352944, "frac_reward_zero_std": 1.0, "grad_norm": 0.008903867105957562, "kl": 0.010664981789886951, "learning_rate": 2.3707176878426882e-08, "loss": 0.0001, "num_tokens": 95614408.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2847856283187866, "sampling/importance_sampling_ratio/mean": 1.0000593662261963, "sampling/importance_sampling_ratio/min": 0.4729178249835968, "sampling/sampling_logp_difference/max": 0.7488336563110352, "sampling/sampling_logp_difference/mean": 0.010459180921316147, "step": 2235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 275.703125, "completions/mean_terminated_length": 275.703125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.35084372758865356, "epoch": 2.7401960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.012890819617328169, "kl": 0.016788916662335396, "learning_rate": 2.3490904314356407e-08, "loss": 0.0002, "num_tokens": 95649781.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3267532587051392, "sampling/importance_sampling_ratio/mean": 0.9998570680618286, "sampling/importance_sampling_ratio/min": 0.5984209179878235, "sampling/sampling_logp_difference/max": 0.5134608745574951, "sampling/sampling_logp_difference/mean": 0.012421779334545135, "step": 2236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 362.265625, "completions/mean_terminated_length": 362.265625, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "entropy": 0.3742223381996155, "epoch": 2.741421568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.0130219958286325, "kl": 0.013823084533214569, "learning_rate": 2.327559903356241e-08, "loss": 0.0001, "num_tokens": 95698326.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6132533550262451, "sampling/importance_sampling_ratio/mean": 1.000236988067627, "sampling/importance_sampling_ratio/min": 0.7192864418029785, "sampling/sampling_logp_difference/max": 0.4782528877258301, "sampling/sampling_logp_difference/mean": 0.012275997549295425, "step": 2237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 291.40625, "completions/mean_terminated_length": 291.40625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.4050942659378052, "epoch": 2.7426470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.011257985931340916, "kl": 0.01540314219892025, "learning_rate": 2.3061261473101002e-08, "loss": 0.0002, "num_tokens": 95738608.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4502716064453125, "sampling/importance_sampling_ratio/mean": 0.9995676875114441, "sampling/importance_sampling_ratio/min": 0.7111552357673645, "sampling/sampling_logp_difference/max": 0.3717508316040039, "sampling/sampling_logp_difference/mean": 0.013299926184117794, "step": 2238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 303.53125, "completions/mean_terminated_length": 303.53125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.5675269365310669, "epoch": 2.743872549019608, "frac_reward_zero_std": 0.5, "grad_norm": 0.8936258381225426, "kl": 0.03353060409426689, "learning_rate": 2.2847892068063755e-08, "loss": 0.0103, "num_tokens": 95779762.0, "reward": 0.71875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.6127461194992065, "sampling/importance_sampling_ratio/mean": 1.0001956224441528, "sampling/importance_sampling_ratio/min": 0.5805733799934387, "sampling/sampling_logp_difference/max": 0.5437390804290771, "sampling/sampling_logp_difference/mean": 0.017995594069361687, "step": 2239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 317.546875, "completions/mean_terminated_length": 317.546875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.549437403678894, "epoch": 2.7450980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.9838192197571862, "kl": 0.022823963314294815, "learning_rate": 2.263549125157721e-08, "loss": -0.0137, "num_tokens": 95820821.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.6315010786056519, "sampling/importance_sampling_ratio/mean": 0.9998685121536255, "sampling/importance_sampling_ratio/min": 0.573428750038147, "sampling/sampling_logp_difference/max": 0.5561215877532959, "sampling/sampling_logp_difference/mean": 0.016850002110004425, "step": 2240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 253.09375, "completions/mean_terminated_length": 253.09375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.4826771318912506, "epoch": 2.7463235294117645, "frac_reward_zero_std": 0.75, "grad_norm": 0.7495576977668011, "kl": 0.028796866536140442, "learning_rate": 2.242405945480147e-08, "loss": -0.0126, "num_tokens": 95853211.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003528594970703, "sampling/importance_sampling_ratio/min": 0.5705616474151611, "sampling/sampling_logp_difference/max": 1.1345250606536865, "sampling/sampling_logp_difference/mean": 0.016209090128540993, "step": 2241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1556.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 377.15625, "completions/mean_terminated_length": 377.15625, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.4599630534648895, "epoch": 2.747549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 1.000331229721033, "kl": 0.020885251462459564, "learning_rate": 2.2213597106929605e-08, "loss": 0.0291, "num_tokens": 95899685.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4548144340515137, "sampling/importance_sampling_ratio/mean": 1.0003427267074585, "sampling/importance_sampling_ratio/min": 0.4459027647972107, "sampling/sampling_logp_difference/max": 0.8076543807983398, "sampling/sampling_logp_difference/mean": 0.015171395614743233, "step": 2242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2157.0, "completions/max_terminated_length": 2157.0, "completions/mean_length": 436.65625, "completions/mean_terminated_length": 436.65625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.44950172305107117, "epoch": 2.748774509803922, "frac_reward_zero_std": 0.75, "grad_norm": 0.47717758460841636, "kl": 0.020626217126846313, "learning_rate": 2.200410463518704e-08, "loss": -0.0051, "num_tokens": 95942719.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6593478918075562, "sampling/importance_sampling_ratio/mean": 0.9994701743125916, "sampling/importance_sampling_ratio/min": 0.5397487282752991, "sampling/sampling_logp_difference/max": 0.6166515350341797, "sampling/sampling_logp_difference/mean": 0.014128158800303936, "step": 2243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 403.578125, "completions/mean_terminated_length": 403.578125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.3498053252696991, "epoch": 2.75, "frac_reward_zero_std": 1.0, "grad_norm": 0.009624648111078096, "kl": 0.013651442714035511, "learning_rate": 2.1795582464830153e-08, "loss": 0.0001, "num_tokens": 95983988.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7820401191711426, "sampling/importance_sampling_ratio/mean": 1.000190258026123, "sampling/importance_sampling_ratio/min": 0.2884364724159241, "sampling/sampling_logp_difference/max": 1.2432804107666016, "sampling/sampling_logp_difference/mean": 0.011510975658893585, "step": 2244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 240.859375, "completions/mean_terminated_length": 240.859375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.286258727312088, "epoch": 2.751225490196078, "frac_reward_zero_std": 1.0, "grad_norm": 0.012026206837582265, "kl": 0.014967124909162521, "learning_rate": 2.1588031019145636e-08, "loss": 0.0002, "num_tokens": 96015339.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.805784821510315, "sampling/importance_sampling_ratio/mean": 0.9999625086784363, "sampling/importance_sampling_ratio/min": 0.6970002055168152, "sampling/sampling_logp_difference/max": 0.5909953117370605, "sampling/sampling_logp_difference/mean": 0.010979239828884602, "step": 2245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 365.78125, "completions/mean_terminated_length": 365.78125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.4364008903503418, "epoch": 2.752450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.01394930528135087, "kl": 0.01487510185688734, "learning_rate": 2.13814507194498e-08, "loss": 0.0002, "num_tokens": 96055709.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3042970895767212, "sampling/importance_sampling_ratio/mean": 0.9996305108070374, "sampling/importance_sampling_ratio/min": 0.6200764775276184, "sampling/sampling_logp_difference/max": 0.47791242599487305, "sampling/sampling_logp_difference/mean": 0.013663838617503643, "step": 2246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1750.0, "completions/max_terminated_length": 1750.0, "completions/mean_length": 516.703125, "completions/mean_terminated_length": 516.703125, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.6273214221000671, "epoch": 2.7536764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 0.6838144036558101, "kl": 0.03057291731238365, "learning_rate": 2.1175841985087707e-08, "loss": -0.0395, "num_tokens": 96109562.0, "reward": 0.21875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.4568791389465332, "sampling/importance_sampling_ratio/mean": 0.9998257160186768, "sampling/importance_sampling_ratio/min": 0.5912301540374756, "sampling/sampling_logp_difference/max": 0.5255498886108398, "sampling/sampling_logp_difference/mean": 0.01750962994992733, "step": 2247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 326.59375, "completions/mean_terminated_length": 326.59375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.43862098455429077, "epoch": 2.7549019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.7368079140505334, "kl": 0.02311726287007332, "learning_rate": 2.097120523343199e-08, "loss": 0.0034, "num_tokens": 96148592.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.4383857250213623, "sampling/importance_sampling_ratio/mean": 1.0001132488250732, "sampling/importance_sampling_ratio/min": 0.6180692911148071, "sampling/sampling_logp_difference/max": 0.4811546802520752, "sampling/sampling_logp_difference/mean": 0.014644955284893513, "step": 2248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 271.96875, "completions/mean_terminated_length": 271.96875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.6222430467605591, "epoch": 2.756127450980392, "frac_reward_zero_std": 0.75, "grad_norm": 0.6344559057241674, "kl": 0.034112103283405304, "learning_rate": 2.076754087988214e-08, "loss": 0.0019, "num_tokens": 96181022.0, "reward": -0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.7119203805923462, "sampling/importance_sampling_ratio/mean": 1.00011146068573, "sampling/importance_sampling_ratio/min": 0.6258807182312012, "sampling/sampling_logp_difference/max": 0.5376157760620117, "sampling/sampling_logp_difference/mean": 0.019091814756393433, "step": 2249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 344.1875, "completions/mean_terminated_length": 344.1875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.36319035291671753, "epoch": 2.7573529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.5614995279592894, "kl": 0.013424079865217209, "learning_rate": 2.0564849337864122e-08, "loss": 0.0176, "num_tokens": 96220794.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4467214345932007, "sampling/importance_sampling_ratio/mean": 1.0001120567321777, "sampling/importance_sampling_ratio/min": 0.6827268004417419, "sampling/sampling_logp_difference/max": 0.38166046142578125, "sampling/sampling_logp_difference/mean": 0.012421916238963604, "step": 2250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 382.234375, "completions/mean_terminated_length": 382.234375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.46152225136756897, "epoch": 2.758578431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.6073678394307916, "kl": 0.02202107198536396, "learning_rate": 2.036313101882875e-08, "loss": 0.0056, "num_tokens": 96270057.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6117079257965088, "sampling/importance_sampling_ratio/mean": 1.0003361701965332, "sampling/importance_sampling_ratio/min": 0.7003106474876404, "sampling/sampling_logp_difference/max": 0.4772944450378418, "sampling/sampling_logp_difference/mean": 0.014372183009982109, "step": 2251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1109.0, "completions/max_terminated_length": 1109.0, "completions/mean_length": 499.1875, "completions/mean_terminated_length": 499.1875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.536835789680481, "epoch": 2.7598039215686274, "frac_reward_zero_std": 0.5, "grad_norm": 0.5943807817047012, "kl": 0.027621783316135406, "learning_rate": 2.0162386332251648e-08, "loss": 0.0032, "num_tokens": 96322389.0, "reward": 0.15625, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.5915271043777466, "sampling/importance_sampling_ratio/mean": 1.0000755786895752, "sampling/importance_sampling_ratio/min": 0.5651647448539734, "sampling/sampling_logp_difference/max": 0.5706379413604736, "sampling/sampling_logp_difference/mean": 0.01525792945176363, "step": 2252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1217.0, "completions/max_terminated_length": 1217.0, "completions/mean_length": 523.90625, "completions/mean_terminated_length": 523.90625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.5064910650253296, "epoch": 2.7610294117647056, "frac_reward_zero_std": 0.75, "grad_norm": 0.41878398595469496, "kl": 0.017930740490555763, "learning_rate": 1.9962615685631568e-08, "loss": -0.0429, "num_tokens": 96373279.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.4353212118148804, "sampling/importance_sampling_ratio/mean": 1.0000600814819336, "sampling/importance_sampling_ratio/min": 0.7078096270561218, "sampling/sampling_logp_difference/max": 0.36138856410980225, "sampling/sampling_logp_difference/mean": 0.01432035956531763, "step": 2253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 292.0, "completions/mean_terminated_length": 292.0, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.49972209334373474, "epoch": 2.7622549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.03368669239386535, "kl": 0.03995859622955322, "learning_rate": 1.976381948449035e-08, "loss": 0.0004, "num_tokens": 96416047.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.582202672958374, "sampling/importance_sampling_ratio/mean": 0.9996892213821411, "sampling/importance_sampling_ratio/min": 0.6176449060440063, "sampling/sampling_logp_difference/max": 0.4818415641784668, "sampling/sampling_logp_difference/mean": 0.015780841931700706, "step": 2254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 354.0625, "completions/mean_terminated_length": 354.0625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.506740152835846, "epoch": 2.763480392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.5615084009944731, "kl": 0.03497650846838951, "learning_rate": 1.9565998132371808e-08, "loss": -0.0175, "num_tokens": 96461075.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.628941297531128, "sampling/importance_sampling_ratio/mean": 1.00096595287323, "sampling/importance_sampling_ratio/min": 0.6118922829627991, "sampling/sampling_logp_difference/max": 0.4911990165710449, "sampling/sampling_logp_difference/mean": 0.016580484807491302, "step": 2255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 885.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 400.046875, "completions/mean_terminated_length": 400.046875, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.4775651693344116, "epoch": 2.764705882352941, "frac_reward_zero_std": 0.75, "grad_norm": 0.6102821948157313, "kl": 0.02109212800860405, "learning_rate": 1.936915203084055e-08, "loss": 0.0023, "num_tokens": 96505942.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.6463165283203125, "sampling/importance_sampling_ratio/mean": 0.9995803833007812, "sampling/importance_sampling_ratio/min": 0.5581098198890686, "sampling/sampling_logp_difference/max": 0.5831995010375977, "sampling/sampling_logp_difference/mean": 0.014721056446433067, "step": 2256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 353.109375, "completions/mean_terminated_length": 353.109375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.47432655096054077, "epoch": 2.7659313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 0.6670399691378168, "kl": 0.0242506954818964, "learning_rate": 1.9173281579481894e-08, "loss": 0.0278, "num_tokens": 96545869.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4530975818634033, "sampling/importance_sampling_ratio/mean": 0.9997560977935791, "sampling/importance_sampling_ratio/min": 0.5448033809661865, "sampling/sampling_logp_difference/max": 0.607330322265625, "sampling/sampling_logp_difference/mean": 0.0145778339356184, "step": 2257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 304.609375, "completions/mean_terminated_length": 304.609375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.5073785185813904, "epoch": 2.767156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.7816418998217768, "kl": 0.025292105972766876, "learning_rate": 1.897838717590028e-08, "loss": 0.0102, "num_tokens": 96588180.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.5847249031066895, "sampling/importance_sampling_ratio/mean": 1.000016689300537, "sampling/importance_sampling_ratio/min": 0.22405587136745453, "sampling/sampling_logp_difference/max": 1.4958598613739014, "sampling/sampling_logp_difference/mean": 0.01622415892779827, "step": 2258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 337.59375, "completions/mean_terminated_length": 337.59375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.41211313009262085, "epoch": 2.7683823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.011653652661013513, "kl": 0.013764030300080776, "learning_rate": 1.8784469215719077e-08, "loss": 0.0001, "num_tokens": 96628890.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8492308855056763, "sampling/importance_sampling_ratio/mean": 1.0001037120819092, "sampling/importance_sampling_ratio/min": 0.7135636210441589, "sampling/sampling_logp_difference/max": 0.6147698163986206, "sampling/sampling_logp_difference/mean": 0.014045262709259987, "step": 2259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 878.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 424.875, "completions/mean_terminated_length": 424.875, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.47183921933174133, "epoch": 2.769607843137255, "frac_reward_zero_std": 0.5, "grad_norm": 0.7725684458732396, "kl": 0.020674433559179306, "learning_rate": 1.8591528092579524e-08, "loss": -0.0725, "num_tokens": 96673490.0, "reward": 0.3125, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.432611107826233, "sampling/importance_sampling_ratio/mean": 1.0000653266906738, "sampling/importance_sampling_ratio/min": 0.6356883645057678, "sampling/sampling_logp_difference/max": 0.4530467987060547, "sampling/sampling_logp_difference/mean": 0.014653357677161694, "step": 2260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1452.0, "completions/max_terminated_length": 1452.0, "completions/mean_length": 469.296875, "completions/mean_terminated_length": 469.296875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.5663557052612305, "epoch": 2.7708333333333335, "frac_reward_zero_std": 0.5, "grad_norm": 0.8572469386376823, "kl": 0.024551039561629295, "learning_rate": 1.8399564198139707e-08, "loss": -0.0788, "num_tokens": 96725669.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5744256973266602, "sampling/importance_sampling_ratio/mean": 1.0000231266021729, "sampling/importance_sampling_ratio/min": 0.6140844821929932, "sampling/sampling_logp_difference/max": 0.4876227378845215, "sampling/sampling_logp_difference/mean": 0.014862902462482452, "step": 2261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 920.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 383.078125, "completions/mean_terminated_length": 383.078125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.3205503225326538, "epoch": 2.7720588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.007720240865422439, "kl": 0.013812142424285412, "learning_rate": 1.8208577922074308e-08, "loss": 0.0001, "num_tokens": 96766666.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5151963233947754, "sampling/importance_sampling_ratio/mean": 1.000304102897644, "sampling/importance_sampling_ratio/min": 0.646678626537323, "sampling/sampling_logp_difference/max": 0.43590593338012695, "sampling/sampling_logp_difference/mean": 0.010663479566574097, "step": 2262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 363.859375, "completions/mean_terminated_length": 363.859375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.40220189094543457, "epoch": 2.7732843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 0.5171718827509798, "kl": 0.016130173578858376, "learning_rate": 1.8018569652073378e-08, "loss": -0.0493, "num_tokens": 96812145.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.5848008394241333, "sampling/importance_sampling_ratio/mean": 0.9999322295188904, "sampling/importance_sampling_ratio/min": 0.634609580039978, "sampling/sampling_logp_difference/max": 0.46045875549316406, "sampling/sampling_logp_difference/mean": 0.012966087087988853, "step": 2263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 292.375, "completions/mean_terminated_length": 292.375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.4095951020717621, "epoch": 2.7745098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.014031917810403322, "kl": 0.015747925266623497, "learning_rate": 1.7829539773841608e-08, "loss": 0.0002, "num_tokens": 96845801.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5467315912246704, "sampling/importance_sampling_ratio/mean": 1.0001325607299805, "sampling/importance_sampling_ratio/min": 0.6428225040435791, "sampling/sampling_logp_difference/max": 0.44188666343688965, "sampling/sampling_logp_difference/mean": 0.01330617256462574, "step": 2264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1408.0, "completions/max_terminated_length": 1408.0, "completions/mean_length": 506.984375, "completions/mean_terminated_length": 506.984375, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.385444700717926, "epoch": 2.775735294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.43544961621155887, "kl": 0.013301155529916286, "learning_rate": 1.7641488671097606e-08, "loss": 0.0271, "num_tokens": 96897096.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.5131417512893677, "sampling/importance_sampling_ratio/mean": 1.0000059604644775, "sampling/importance_sampling_ratio/min": 0.6564568877220154, "sampling/sampling_logp_difference/max": 0.42089831829071045, "sampling/sampling_logp_difference/mean": 0.011556504294276237, "step": 2265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 301.25, "completions/mean_terminated_length": 301.25, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.4516142010688782, "epoch": 2.7769607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 0.8243771775562277, "kl": 0.02160021662712097, "learning_rate": 1.745441672557335e-08, "loss": -0.0083, "num_tokens": 96933160.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.367070198059082, "sampling/importance_sampling_ratio/mean": 0.999740481376648, "sampling/importance_sampling_ratio/min": 0.6816330552101135, "sampling/sampling_logp_difference/max": 0.38326382637023926, "sampling/sampling_logp_difference/mean": 0.015528872609138489, "step": 2266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1529.0, "completions/max_terminated_length": 1529.0, "completions/mean_length": 376.96875, "completions/mean_terminated_length": 376.96875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.36675822734832764, "epoch": 2.778186274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.013689828838833122, "kl": 0.022631928324699402, "learning_rate": 1.7268324317012973e-08, "loss": 0.0002, "num_tokens": 96980774.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.8760662078857422, "sampling/importance_sampling_ratio/mean": 0.999826192855835, "sampling/importance_sampling_ratio/min": 0.677149772644043, "sampling/sampling_logp_difference/max": 0.6291770935058594, "sampling/sampling_logp_difference/mean": 0.011925946921110153, "step": 2267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 427.625, "completions/mean_terminated_length": 427.625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.4877200126647949, "epoch": 2.7794117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 0.601032070289227, "kl": 0.024858999997377396, "learning_rate": 1.7083211823172184e-08, "loss": -0.0773, "num_tokens": 97037758.0, "reward": -0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.8595178127288818, "sampling/importance_sampling_ratio/mean": 1.000018835067749, "sampling/importance_sampling_ratio/min": 0.5088993310928345, "sampling/sampling_logp_difference/max": 0.6755050420761108, "sampling/sampling_logp_difference/mean": 0.014273425564169884, "step": 2268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 359.265625, "completions/mean_terminated_length": 359.265625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.37527838349342346, "epoch": 2.780637254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.6170204196000414, "kl": 0.018174685537815094, "learning_rate": 1.6899079619817792e-08, "loss": 0.0172, "num_tokens": 97081359.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.3078186511993408, "sampling/importance_sampling_ratio/mean": 1.0000073909759521, "sampling/importance_sampling_ratio/min": 0.7469223737716675, "sampling/sampling_logp_difference/max": 0.2917940616607666, "sampling/sampling_logp_difference/mean": 0.012103160843253136, "step": 2269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1373.0, "completions/max_terminated_length": 1373.0, "completions/mean_length": 354.828125, "completions/mean_terminated_length": 354.828125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.42550235986709595, "epoch": 2.781862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.8129055811583845, "kl": 0.014433644711971283, "learning_rate": 1.6715928080726415e-08, "loss": -0.0898, "num_tokens": 97118500.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4665822982788086, "sampling/importance_sampling_ratio/mean": 1.0001648664474487, "sampling/importance_sampling_ratio/min": 0.6959490776062012, "sampling/sampling_logp_difference/max": 0.38293468952178955, "sampling/sampling_logp_difference/mean": 0.013791700825095177, "step": 2270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1194.0, "completions/max_terminated_length": 1194.0, "completions/mean_length": 466.765625, "completions/mean_terminated_length": 466.765625, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.4183560013771057, "epoch": 2.7830882352941178, "frac_reward_zero_std": 0.5, "grad_norm": 0.6302919976576429, "kl": 0.02139405533671379, "learning_rate": 1.653375757768405e-08, "loss": -0.0033, "num_tokens": 97170789.0, "reward": 0.71875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.420342206954956, "sampling/importance_sampling_ratio/mean": 0.9996213912963867, "sampling/importance_sampling_ratio/min": 0.5260736346244812, "sampling/sampling_logp_difference/max": 0.642314076423645, "sampling/sampling_logp_difference/mean": 0.013347708620131016, "step": 2271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 349.625, "completions/mean_terminated_length": 349.625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.43921154737472534, "epoch": 2.784313725490196, "frac_reward_zero_std": 0.5, "grad_norm": 1.0207761671465125, "kl": 0.019040122628211975, "learning_rate": 1.6352568480485275e-08, "loss": 0.0117, "num_tokens": 97212621.0, "reward": 0.4375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.952823519706726, "sampling/importance_sampling_ratio/mean": 0.9999309778213501, "sampling/importance_sampling_ratio/min": 0.6030860543251038, "sampling/sampling_logp_difference/max": 0.669276237487793, "sampling/sampling_logp_difference/mean": 0.014630784280598164, "step": 2272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1874.0, "completions/max_terminated_length": 1874.0, "completions/mean_length": 491.6875, "completions/mean_terminated_length": 491.6875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.4779222011566162, "epoch": 2.7855392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.012493994074525358, "kl": 0.012597102671861649, "learning_rate": 1.6172361156932547e-08, "loss": 0.0001, "num_tokens": 97263289.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5555369853973389, "sampling/importance_sampling_ratio/mean": 0.9999121427536011, "sampling/importance_sampling_ratio/min": 0.6548989415168762, "sampling/sampling_logp_difference/max": 0.4418208599090576, "sampling/sampling_logp_difference/mean": 0.014069056138396263, "step": 2273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 329.3125, "completions/mean_terminated_length": 329.3125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.41843926906585693, "epoch": 2.786764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.012250712376341786, "kl": 0.021816179156303406, "learning_rate": 1.5993135972835303e-08, "loss": 0.0002, "num_tokens": 97297933.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9196676015853882, "sampling/importance_sampling_ratio/mean": 0.9999901056289673, "sampling/importance_sampling_ratio/min": 0.6348850131034851, "sampling/sampling_logp_difference/max": 0.6521520614624023, "sampling/sampling_logp_difference/mean": 0.014871243387460709, "step": 2274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 904.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 336.921875, "completions/mean_terminated_length": 336.921875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.39420658349990845, "epoch": 2.7879901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.7065386055150625, "kl": 0.01681577041745186, "learning_rate": 1.581489329200919e-08, "loss": 0.0095, "num_tokens": 97337016.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6141853332519531, "sampling/importance_sampling_ratio/mean": 1.0002472400665283, "sampling/importance_sampling_ratio/min": 0.5459088087081909, "sampling/sampling_logp_difference/max": 0.6053032875061035, "sampling/sampling_logp_difference/mean": 0.01435629092156887, "step": 2275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 322.59375, "completions/mean_terminated_length": 322.59375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.511810302734375, "epoch": 2.7892156862745097, "frac_reward_zero_std": 0.75, "grad_norm": 0.5176035645665078, "kl": 0.05244290828704834, "learning_rate": 1.5637633476275724e-08, "loss": -0.0057, "num_tokens": 97373694.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.5073862075805664, "sampling/importance_sampling_ratio/mean": 0.9995853900909424, "sampling/importance_sampling_ratio/min": 0.5197300910949707, "sampling/sampling_logp_difference/max": 0.6544456481933594, "sampling/sampling_logp_difference/mean": 0.016356725245714188, "step": 2276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 349.765625, "completions/mean_terminated_length": 349.765625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.34796786308288574, "epoch": 2.7904411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 0.4746245667979969, "kl": 0.017916081473231316, "learning_rate": 1.5461356885461075e-08, "loss": 0.0058, "num_tokens": 97410399.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.301538348197937, "sampling/importance_sampling_ratio/mean": 1.0000250339508057, "sampling/importance_sampling_ratio/min": 0.7840486764907837, "sampling/sampling_logp_difference/max": 0.2635469436645508, "sampling/sampling_logp_difference/mean": 0.010538973845541477, "step": 2277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1636.0, "completions/max_terminated_length": 1636.0, "completions/mean_length": 507.421875, "completions/mean_terminated_length": 507.421875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.5067821741104126, "epoch": 2.7916666666666665, "frac_reward_zero_std": 0.5, "grad_norm": 0.7597814757844825, "kl": 0.03960362821817398, "learning_rate": 1.528606387739545e-08, "loss": -0.0661, "num_tokens": 97468298.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999412894248962, "sampling/importance_sampling_ratio/min": 0.4454651176929474, "sampling/sampling_logp_difference/max": 0.8086364269256592, "sampling/sampling_logp_difference/mean": 0.01438069250434637, "step": 2278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 805.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 405.796875, "completions/mean_terminated_length": 405.796875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.36340296268463135, "epoch": 2.792892156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.01067495899946391, "kl": 0.016636431217193604, "learning_rate": 1.5111754807912546e-08, "loss": 0.0002, "num_tokens": 97509517.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5535234212875366, "sampling/importance_sampling_ratio/mean": 0.9998453855514526, "sampling/importance_sampling_ratio/min": 0.527134358882904, "sampling/sampling_logp_difference/max": 0.6402997970581055, "sampling/sampling_logp_difference/mean": 0.01156933605670929, "step": 2279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 310.625, "completions/mean_terminated_length": 310.625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.3183882534503937, "epoch": 2.7941176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.011400796481070171, "kl": 0.013020694255828857, "learning_rate": 1.493843003084888e-08, "loss": 0.0001, "num_tokens": 97551285.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002479553222656, "sampling/importance_sampling_ratio/min": 0.4624389111995697, "sampling/sampling_logp_difference/max": 0.771240770816803, "sampling/sampling_logp_difference/mean": 0.011775432154536247, "step": 2280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 293.875, "completions/mean_terminated_length": 293.875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.4269462823867798, "epoch": 2.795343137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.020621968160624806, "kl": 0.020844824612140656, "learning_rate": 1.4766089898042677e-08, "loss": 0.0002, "num_tokens": 97586381.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6170742511749268, "sampling/importance_sampling_ratio/mean": 0.9999589920043945, "sampling/importance_sampling_ratio/min": 0.6511200070381165, "sampling/sampling_logp_difference/max": 0.4806184768676758, "sampling/sampling_logp_difference/mean": 0.015155099332332611, "step": 2281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 358.03125, "completions/mean_terminated_length": 358.03125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.3454877436161041, "epoch": 2.7965686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.009528960996515295, "kl": 0.012483001686632633, "learning_rate": 1.4594734759333482e-08, "loss": 0.0001, "num_tokens": 97627999.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6088953018188477, "sampling/importance_sampling_ratio/mean": 0.9999604225158691, "sampling/importance_sampling_ratio/min": 0.6152472496032715, "sampling/sampling_logp_difference/max": 0.48573100566864014, "sampling/sampling_logp_difference/mean": 0.011105488054454327, "step": 2282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 300.421875, "completions/mean_terminated_length": 300.421875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.4105415344238281, "epoch": 2.797794117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.01294475223829163, "kl": 0.01676008105278015, "learning_rate": 1.4424364962561386e-08, "loss": 0.0002, "num_tokens": 97665546.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001001358032227, "sampling/importance_sampling_ratio/min": 0.2664828300476074, "sampling/sampling_logp_difference/max": 1.3224455118179321, "sampling/sampling_logp_difference/mean": 0.013162605464458466, "step": 2283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1776.0, "completions/max_terminated_length": 1776.0, "completions/mean_length": 472.953125, "completions/mean_terminated_length": 472.953125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.5404666662216187, "epoch": 2.799019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.3767692445498446, "kl": 0.06489947438240051, "learning_rate": 1.4254980853566246e-08, "loss": 0.0038, "num_tokens": 97714087.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.585126519203186, "sampling/importance_sampling_ratio/mean": 0.9997423887252808, "sampling/importance_sampling_ratio/min": 0.6814550757408142, "sampling/sampling_logp_difference/max": 0.4606642723083496, "sampling/sampling_logp_difference/mean": 0.016523078083992004, "step": 2284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 282.25, "completions/mean_terminated_length": 282.25, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.4240421652793884, "epoch": 2.8002450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.6999763288039279, "kl": 0.026525920256972313, "learning_rate": 1.4086582776187239e-08, "loss": 0.0131, "num_tokens": 97752439.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4015910625457764, "sampling/importance_sampling_ratio/mean": 0.9999055862426758, "sampling/importance_sampling_ratio/min": 0.6447655558586121, "sampling/sampling_logp_difference/max": 0.43886852264404297, "sampling/sampling_logp_difference/mean": 0.014359844848513603, "step": 2285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 387.125, "completions/mean_terminated_length": 387.125, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.5487446784973145, "epoch": 2.8014705882352944, "frac_reward_zero_std": 0.5, "grad_norm": 0.9160085516738242, "kl": 0.021151436492800713, "learning_rate": 1.3919171072261537e-08, "loss": 0.0004, "num_tokens": 97798175.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999063014984131, "sampling/importance_sampling_ratio/min": 0.6507179737091064, "sampling/sampling_logp_difference/max": 0.7287652492523193, "sampling/sampling_logp_difference/mean": 0.015698589384555817, "step": 2286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 399.25, "completions/mean_terminated_length": 399.25, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.4556509852409363, "epoch": 2.8026960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.011885264265432322, "kl": 0.013961758464574814, "learning_rate": 1.3752746081624467e-08, "loss": 0.0001, "num_tokens": 97843903.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5808663368225098, "sampling/importance_sampling_ratio/mean": 0.9998446702957153, "sampling/importance_sampling_ratio/min": 0.6279445290565491, "sampling/sampling_logp_difference/max": 0.4653034210205078, "sampling/sampling_logp_difference/mean": 0.013935624621808529, "step": 2287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 352.75, "completions/mean_terminated_length": 352.75, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.38470837473869324, "epoch": 2.803921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.02028324087791383, "kl": 0.02083289436995983, "learning_rate": 1.3587308142108178e-08, "loss": 0.0002, "num_tokens": 97885055.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6009936332702637, "sampling/importance_sampling_ratio/mean": 1.0004875659942627, "sampling/importance_sampling_ratio/min": 0.6281483173370361, "sampling/sampling_logp_difference/max": 0.4706244468688965, "sampling/sampling_logp_difference/mean": 0.013269072398543358, "step": 2288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 284.296875, "completions/mean_terminated_length": 284.296875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.47720056772232056, "epoch": 2.8051470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.013642286744546055, "kl": 0.016555706039071083, "learning_rate": 1.3422857589541148e-08, "loss": 0.0002, "num_tokens": 97917602.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.616188406944275, "sampling/importance_sampling_ratio/mean": 1.0004477500915527, "sampling/importance_sampling_ratio/min": 0.7021438479423523, "sampling/sampling_logp_difference/max": 0.4800705909729004, "sampling/sampling_logp_difference/mean": 0.015622569248080254, "step": 2289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1258.0, "completions/max_terminated_length": 1258.0, "completions/mean_length": 347.828125, "completions/mean_terminated_length": 347.828125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.5155289173126221, "epoch": 2.806372549019608, "frac_reward_zero_std": 0.75, "grad_norm": 0.5445323971371663, "kl": 0.04575018584728241, "learning_rate": 1.3259394757747677e-08, "loss": -0.003, "num_tokens": 97954007.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999682903289795, "sampling/importance_sampling_ratio/min": 0.6940925717353821, "sampling/sampling_logp_difference/max": 0.7287757396697998, "sampling/sampling_logp_difference/mean": 0.015975426882505417, "step": 2290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 372.15625, "completions/mean_terminated_length": 372.15625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.4574153423309326, "epoch": 2.8075980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.5933933785441612, "kl": 0.017646905034780502, "learning_rate": 1.3096919978546838e-08, "loss": 0.0186, "num_tokens": 97993009.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5615544319152832, "sampling/importance_sampling_ratio/mean": 0.9998899698257446, "sampling/importance_sampling_ratio/min": 0.7756643295288086, "sampling/sampling_logp_difference/max": 0.4456818103790283, "sampling/sampling_logp_difference/mean": 0.014717020094394684, "step": 2291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 311.359375, "completions/mean_terminated_length": 311.359375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.43595579266548157, "epoch": 2.8088235294117645, "frac_reward_zero_std": 0.75, "grad_norm": 0.6799639227387436, "kl": 0.020388400182127953, "learning_rate": 1.2935433581752365e-08, "loss": 0.0338, "num_tokens": 98030072.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5174883604049683, "sampling/importance_sampling_ratio/mean": 0.9999021887779236, "sampling/importance_sampling_ratio/min": 0.6547018885612488, "sampling/sampling_logp_difference/max": 0.4235752820968628, "sampling/sampling_logp_difference/mean": 0.014474814757704735, "step": 2292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1365.0, "completions/max_terminated_length": 1365.0, "completions/mean_length": 389.34375, "completions/mean_terminated_length": 389.34375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.3275129497051239, "epoch": 2.810049019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.030580390456212494, "kl": 0.01333676092326641, "learning_rate": 1.2774935895171091e-08, "loss": 0.0001, "num_tokens": 98070926.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.944157600402832, "sampling/importance_sampling_ratio/mean": 1.0001038312911987, "sampling/importance_sampling_ratio/min": 0.480814665555954, "sampling/sampling_logp_difference/max": 0.7322734594345093, "sampling/sampling_logp_difference/mean": 0.011473847553133965, "step": 2293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 313.171875, "completions/mean_terminated_length": 313.171875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.3319823145866394, "epoch": 2.811274509803922, "frac_reward_zero_std": 1.0, "grad_norm": 0.01374943675007819, "kl": 0.014328519813716412, "learning_rate": 1.2615427244603405e-08, "loss": 0.0002, "num_tokens": 98105833.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3249837160110474, "sampling/importance_sampling_ratio/mean": 0.999545156955719, "sampling/importance_sampling_ratio/min": 0.6279207468032837, "sampling/sampling_logp_difference/max": 0.46534132957458496, "sampling/sampling_logp_difference/mean": 0.011459492146968842, "step": 2294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 407.46875, "completions/mean_terminated_length": 407.46875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.35231176018714905, "epoch": 2.8125, "frac_reward_zero_std": 0.75, "grad_norm": 0.3938595700261743, "kl": 0.014360626228153706, "learning_rate": 1.2456907953841633e-08, "loss": -0.0169, "num_tokens": 98148023.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4347676038742065, "sampling/importance_sampling_ratio/mean": 0.9998478889465332, "sampling/importance_sampling_ratio/min": 0.611872136592865, "sampling/sampling_logp_difference/max": 0.49123191833496094, "sampling/sampling_logp_difference/mean": 0.01123404037207365, "step": 2295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 458.921875, "completions/mean_terminated_length": 458.921875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.44471871852874756, "epoch": 2.813725490196078, "frac_reward_zero_std": 0.75, "grad_norm": 0.587322745471673, "kl": 0.019295576959848404, "learning_rate": 1.2299378344669986e-08, "loss": 0.0347, "num_tokens": 98192290.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.4299647808074951, "sampling/importance_sampling_ratio/mean": 0.9998105764389038, "sampling/importance_sampling_ratio/min": 0.5905094146728516, "sampling/sampling_logp_difference/max": 0.5267696380615234, "sampling/sampling_logp_difference/mean": 0.013226993381977081, "step": 2296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2138.0, "completions/max_terminated_length": 2138.0, "completions/mean_length": 439.078125, "completions/mean_terminated_length": 439.078125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.5945037007331848, "epoch": 2.814950980392157, "frac_reward_zero_std": 0.25, "grad_norm": 0.9817318553635704, "kl": 0.029464326798915863, "learning_rate": 1.2142838736863559e-08, "loss": 0.0458, "num_tokens": 98235703.0, "reward": 0.46875, "reward_std": 0.7297805547714233, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.7972594499588013, "sampling/importance_sampling_ratio/mean": 0.9998276829719543, "sampling/importance_sampling_ratio/min": 0.5097754597663879, "sampling/sampling_logp_difference/max": 0.6737848520278931, "sampling/sampling_logp_difference/mean": 0.017316032201051712, "step": 2297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 323.328125, "completions/mean_terminated_length": 323.328125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.5051720142364502, "epoch": 2.8161764705882355, "frac_reward_zero_std": 0.75, "grad_norm": 0.6692194665551068, "kl": 0.025575894862413406, "learning_rate": 1.1987289448187777e-08, "loss": -0.0038, "num_tokens": 98274268.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999088048934937, "sampling/importance_sampling_ratio/min": 0.5002569556236267, "sampling/sampling_logp_difference/max": 0.6933512687683105, "sampling/sampling_logp_difference/mean": 0.015143569558858871, "step": 2298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1310.0, "completions/max_terminated_length": 1310.0, "completions/mean_length": 435.96875, "completions/mean_terminated_length": 435.96875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.49756312370300293, "epoch": 2.8174019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.611432076862268, "kl": 0.018120862543582916, "learning_rate": 1.183273079439795e-08, "loss": 0.0127, "num_tokens": 98324586.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4592761993408203, "sampling/importance_sampling_ratio/mean": 1.0000847578048706, "sampling/importance_sampling_ratio/min": 0.5341841578483582, "sampling/sampling_logp_difference/max": 0.6270146369934082, "sampling/sampling_logp_difference/mean": 0.015510563738644123, "step": 2299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 369.234375, "completions/mean_terminated_length": 369.234375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.5668011903762817, "epoch": 2.818627450980392, "frac_reward_zero_std": 0.75, "grad_norm": 0.5955040947397798, "kl": 0.02755962498486042, "learning_rate": 1.167916308923822e-08, "loss": -0.0022, "num_tokens": 98370601.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5185694694519043, "sampling/importance_sampling_ratio/mean": 0.9998612403869629, "sampling/importance_sampling_ratio/min": 0.41604337096214294, "sampling/sampling_logp_difference/max": 0.8769657611846924, "sampling/sampling_logp_difference/mean": 0.016695063561201096, "step": 2300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 741.0, "completions/max_terminated_length": 741.0, "completions/mean_length": 389.296875, "completions/mean_terminated_length": 389.296875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.3395036458969116, "epoch": 2.8198529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.009493391358593138, "kl": 0.012065887451171875, "learning_rate": 1.152658664444145e-08, "loss": 0.0001, "num_tokens": 98414092.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5341918468475342, "sampling/importance_sampling_ratio/mean": 1.0000970363616943, "sampling/importance_sampling_ratio/min": 0.5516166090965271, "sampling/sampling_logp_difference/max": 0.5949020385742188, "sampling/sampling_logp_difference/mean": 0.010766386054456234, "step": 2301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 293.359375, "completions/mean_terminated_length": 293.359375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.5148122906684875, "epoch": 2.821078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.01298827445120404, "kl": 0.017756663262844086, "learning_rate": 1.1375001769727999e-08, "loss": 0.0002, "num_tokens": 98452115.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.37113356590271, "sampling/importance_sampling_ratio/mean": 1.0003986358642578, "sampling/importance_sampling_ratio/min": 0.5036870837211609, "sampling/sampling_logp_difference/max": 0.6858000755310059, "sampling/sampling_logp_difference/mean": 0.016669882461428642, "step": 2302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1415.0, "completions/max_terminated_length": 1415.0, "completions/mean_length": 426.546875, "completions/mean_terminated_length": 426.546875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.4853014051914215, "epoch": 2.8223039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 0.5914937166339957, "kl": 0.016054287552833557, "learning_rate": 1.1224408772805671e-08, "loss": -0.0287, "num_tokens": 98498614.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.4352011680603027, "sampling/importance_sampling_ratio/mean": 0.99949711561203, "sampling/importance_sampling_ratio/min": 0.617037296295166, "sampling/sampling_logp_difference/max": 0.48282575607299805, "sampling/sampling_logp_difference/mean": 0.015181169845163822, "step": 2303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 359.53125, "completions/mean_terminated_length": 359.53125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.5661686658859253, "epoch": 2.8235294117647056, "frac_reward_zero_std": 0.5, "grad_norm": 1.075040200026358, "kl": 0.03556272014975548, "learning_rate": 1.1074807959368715e-08, "loss": -0.0353, "num_tokens": 98537992.0, "reward": 0.0625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4613381624221802, "sampling/importance_sampling_ratio/mean": 0.9999798536300659, "sampling/importance_sampling_ratio/min": 0.5319331288337708, "sampling/sampling_logp_difference/max": 0.6312375068664551, "sampling/sampling_logp_difference/mean": 0.016982875764369965, "step": 2304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 342.5, "completions/mean_terminated_length": 342.5, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.5324392318725586, "epoch": 2.8247549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 0.5355309633454466, "kl": 0.02553066797554493, "learning_rate": 1.0926199633097154e-08, "loss": 0.0077, "num_tokens": 98577672.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.5931557416915894, "sampling/importance_sampling_ratio/mean": 1.0000958442687988, "sampling/importance_sampling_ratio/min": 0.29577216506004333, "sampling/sampling_logp_difference/max": 1.2181658744812012, "sampling/sampling_logp_difference/mean": 0.0168350487947464, "step": 2305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 823.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 347.921875, "completions/mean_terminated_length": 347.921875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.4672725796699524, "epoch": 2.825980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.6255443086375738, "kl": 0.022190898656845093, "learning_rate": 1.0778584095656685e-08, "loss": 0.0114, "num_tokens": 98613795.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6260939836502075, "sampling/importance_sampling_ratio/mean": 1.0000793933868408, "sampling/importance_sampling_ratio/min": 0.5272733569145203, "sampling/sampling_logp_difference/max": 0.6400362253189087, "sampling/sampling_logp_difference/mean": 0.01528177410364151, "step": 2306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 352.578125, "completions/mean_terminated_length": 352.578125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.6756695508956909, "epoch": 2.827205882352941, "frac_reward_zero_std": 0.25, "grad_norm": 0.9764309084102635, "kl": 0.038510970771312714, "learning_rate": 1.0631961646697384e-08, "loss": -0.0049, "num_tokens": 98659768.0, "reward": 0.4375, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.4768412113189697, "sampling/importance_sampling_ratio/mean": 1.000158429145813, "sampling/importance_sampling_ratio/min": 0.5823366045951843, "sampling/sampling_logp_difference/max": 0.5407066345214844, "sampling/sampling_logp_difference/mean": 0.018721967935562134, "step": 2307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1213.0, "completions/max_terminated_length": 1213.0, "completions/mean_length": 458.453125, "completions/mean_terminated_length": 458.453125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.5936477184295654, "epoch": 2.8284313725490198, "frac_reward_zero_std": 0.25, "grad_norm": 0.7421663970386747, "kl": 0.02952830120921135, "learning_rate": 1.0486332583853564e-08, "loss": 0.0182, "num_tokens": 98710853.0, "reward": -0.40625, "reward_std": 0.6802700161933899, "rewards/decision_reward_func/mean": -0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5021940469741821, "sampling/importance_sampling_ratio/mean": 0.9997568726539612, "sampling/importance_sampling_ratio/min": 0.6651408076286316, "sampling/sampling_logp_difference/max": 0.4077564477920532, "sampling/sampling_logp_difference/mean": 0.015607387758791447, "step": 2308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1473.0, "completions/max_terminated_length": 1473.0, "completions/mean_length": 367.328125, "completions/mean_terminated_length": 367.328125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.5146651268005371, "epoch": 2.829656862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.7156490668029205, "kl": 0.021086594089865685, "learning_rate": 1.0341697202742971e-08, "loss": -0.064, "num_tokens": 98751066.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.497835636138916, "sampling/importance_sampling_ratio/mean": 1.0002806186676025, "sampling/importance_sampling_ratio/min": 0.626653254032135, "sampling/sampling_logp_difference/max": 0.4673619270324707, "sampling/sampling_logp_difference/mean": 0.015949347987771034, "step": 2309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1036.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 541.765625, "completions/mean_terminated_length": 541.765625, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "entropy": 0.44725680351257324, "epoch": 2.8308823529411766, "frac_reward_zero_std": 0.75, "grad_norm": 0.41781187726453556, "kl": 0.017987266182899475, "learning_rate": 1.0198055796966253e-08, "loss": 0.001, "num_tokens": 98810875.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.543195366859436, "sampling/importance_sampling_ratio/mean": 0.9999432563781738, "sampling/importance_sampling_ratio/min": 0.6367940902709961, "sampling/sampling_logp_difference/max": 0.4513089656829834, "sampling/sampling_logp_difference/mean": 0.012258771806955338, "step": 2310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1082.0, "completions/max_terminated_length": 1082.0, "completions/mean_length": 417.65625, "completions/mean_terminated_length": 417.65625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.464903861284256, "epoch": 2.832107843137255, "frac_reward_zero_std": 0.75, "grad_norm": 0.5944327522448521, "kl": 0.01926577463746071, "learning_rate": 1.0055408658106446e-08, "loss": 0.0326, "num_tokens": 98855349.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5713741779327393, "sampling/importance_sampling_ratio/mean": 0.9999210834503174, "sampling/importance_sampling_ratio/min": 0.6057241559028625, "sampling/sampling_logp_difference/max": 0.5013306140899658, "sampling/sampling_logp_difference/mean": 0.014996242709457874, "step": 2311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 863.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 425.234375, "completions/mean_terminated_length": 425.234375, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.48893052339553833, "epoch": 2.8333333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 0.5731457397517967, "kl": 0.014500105753540993, "learning_rate": 9.913756075728086e-09, "loss": -0.0088, "num_tokens": 98900020.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.8615775108337402, "sampling/importance_sampling_ratio/mean": 0.9997873306274414, "sampling/importance_sampling_ratio/min": 0.6286540031433105, "sampling/sampling_logp_difference/max": 0.6214241981506348, "sampling/sampling_logp_difference/mean": 0.01545223779976368, "step": 2312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2097.0, "completions/max_terminated_length": 2097.0, "completions/mean_length": 462.140625, "completions/mean_terminated_length": 462.140625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.5341820120811462, "epoch": 2.8345588235294117, "frac_reward_zero_std": 0.25, "grad_norm": 0.8680384135227862, "kl": 0.029454823583364487, "learning_rate": 9.77309833737705e-09, "loss": 0.0095, "num_tokens": 98948669.0, "reward": 0.34375, "reward_std": 0.5809217691421509, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.5063995122909546, "sampling/importance_sampling_ratio/mean": 0.9999097585678101, "sampling/importance_sampling_ratio/min": 0.5636377930641174, "sampling/sampling_logp_difference/max": 0.5733433961868286, "sampling/sampling_logp_difference/mean": 0.01586289331316948, "step": 2313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 916.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 447.875, "completions/mean_terminated_length": 447.875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.3972395956516266, "epoch": 2.8357843137254903, "frac_reward_zero_std": 1.0, "grad_norm": 0.01074747171305059, "kl": 0.017175063490867615, "learning_rate": 9.633435728579553e-09, "loss": 0.0002, "num_tokens": 99002853.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5480619668960571, "sampling/importance_sampling_ratio/mean": 0.9994499683380127, "sampling/importance_sampling_ratio/min": 0.5493482947349548, "sampling/sampling_logp_difference/max": 0.599022626876831, "sampling/sampling_logp_difference/mean": 0.012614995241165161, "step": 2314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 286.28125, "completions/mean_terminated_length": 286.28125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.38804054260253906, "epoch": 2.8370098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 0.717957588785395, "kl": 0.014255549758672714, "learning_rate": 9.494768532841868e-09, "loss": -0.0013, "num_tokens": 99036839.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5501561164855957, "sampling/importance_sampling_ratio/mean": 1.0002764463424683, "sampling/importance_sampling_ratio/min": 0.7103763222694397, "sampling/sampling_logp_difference/max": 0.4383556842803955, "sampling/sampling_logp_difference/mean": 0.012422539293766022, "step": 2315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 375.453125, "completions/mean_terminated_length": 375.453125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.5286999940872192, "epoch": 2.838235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.4512141510085936, "kl": 0.030523039400577545, "learning_rate": 9.357097031649664e-09, "loss": -0.0016, "num_tokens": 99084436.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6496436595916748, "sampling/importance_sampling_ratio/mean": 0.9999242424964905, "sampling/importance_sampling_ratio/min": 0.6273241639137268, "sampling/sampling_logp_difference/max": 0.5005593299865723, "sampling/sampling_logp_difference/mean": 0.01667173206806183, "step": 2316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 539.40625, "completions/mean_terminated_length": 539.40625, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "entropy": 0.47758686542510986, "epoch": 2.8394607843137254, "frac_reward_zero_std": 0.5, "grad_norm": 0.6304857127871722, "kl": 0.019506722688674927, "learning_rate": 9.22042150446728e-09, "loss": -0.026, "num_tokens": 99139454.0, "reward": -0.03125, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5554311275482178, "sampling/importance_sampling_ratio/mean": 0.9999026656150818, "sampling/importance_sampling_ratio/min": 0.458811491727829, "sampling/sampling_logp_difference/max": 0.7791159152984619, "sampling/sampling_logp_difference/mean": 0.01343061774969101, "step": 2317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 262.875, "completions/mean_terminated_length": 262.875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.35200202465057373, "epoch": 2.840686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.013507667316504685, "kl": 0.015242555178701878, "learning_rate": 9.084742228737564e-09, "loss": 0.0002, "num_tokens": 99172246.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001285076141357, "sampling/importance_sampling_ratio/min": 0.6772170662879944, "sampling/sampling_logp_difference/max": 0.8386523723602295, "sampling/sampling_logp_difference/mean": 0.012661127373576164, "step": 2318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1064.0, "completions/max_terminated_length": 1064.0, "completions/mean_length": 457.375, "completions/mean_terminated_length": 457.375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.38087040185928345, "epoch": 2.8419117647058822, "frac_reward_zero_std": 0.75, "grad_norm": 0.5782595715673867, "kl": 0.014521822333335876, "learning_rate": 8.95005947988059e-09, "loss": 0.0141, "num_tokens": 99221966.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4352259635925293, "sampling/importance_sampling_ratio/mean": 0.9998088479042053, "sampling/importance_sampling_ratio/min": 0.6254516839981079, "sampling/sampling_logp_difference/max": 0.4692811965942383, "sampling/sampling_logp_difference/mean": 0.012403838336467743, "step": 2319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 414.203125, "completions/mean_terminated_length": 414.203125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.3957589268684387, "epoch": 2.843137254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.5416853701139166, "kl": 0.015463759191334248, "learning_rate": 8.816373531293941e-09, "loss": -0.0155, "num_tokens": 99276171.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.7181044816970825, "sampling/importance_sampling_ratio/mean": 1.0001823902130127, "sampling/importance_sampling_ratio/min": 0.6121931672096252, "sampling/sampling_logp_difference/max": 0.5412216186523438, "sampling/sampling_logp_difference/mean": 0.012339849025011063, "step": 2320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 321.96875, "completions/mean_terminated_length": 321.96875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.5164297819137573, "epoch": 2.844362745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.7753380723288013, "kl": 0.04737868160009384, "learning_rate": 8.683684654351597e-09, "loss": 0.0426, "num_tokens": 99315081.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000040054321289, "sampling/importance_sampling_ratio/min": 0.3994150459766388, "sampling/sampling_logp_difference/max": 0.9177541732788086, "sampling/sampling_logp_difference/mean": 0.016855638474225998, "step": 2321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 292.8125, "completions/mean_terminated_length": 292.8125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.3598868250846863, "epoch": 2.8455882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 0.6853734554501659, "kl": 0.02380228228867054, "learning_rate": 8.551993118403656e-09, "loss": 0.0006, "num_tokens": 99357021.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.4400900602340698, "sampling/importance_sampling_ratio/mean": 1.0000972747802734, "sampling/importance_sampling_ratio/min": 0.7186180949211121, "sampling/sampling_logp_difference/max": 0.36470556259155273, "sampling/sampling_logp_difference/mean": 0.011588184162974358, "step": 2322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1870.0, "completions/max_terminated_length": 1870.0, "completions/mean_length": 445.046875, "completions/mean_terminated_length": 445.046875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.42661774158477783, "epoch": 2.846813725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.01968534649301764, "kl": 0.020005742087960243, "learning_rate": 8.4212991907755e-09, "loss": 0.0002, "num_tokens": 99404912.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.621926188468933, "sampling/importance_sampling_ratio/mean": 1.0000298023223877, "sampling/importance_sampling_ratio/min": 0.07632189989089966, "sampling/sampling_logp_difference/max": 2.5727953910827637, "sampling/sampling_logp_difference/mean": 0.012173660099506378, "step": 2323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 350.4375, "completions/mean_terminated_length": 350.4375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.510948121547699, "epoch": 2.8480392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.01507904291362503, "kl": 0.02694392018020153, "learning_rate": 8.291603136767521e-09, "loss": 0.0003, "num_tokens": 99443148.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4844279289245605, "sampling/importance_sampling_ratio/mean": 1.0005927085876465, "sampling/importance_sampling_ratio/min": 0.6905835866928101, "sampling/sampling_logp_difference/max": 0.39502954483032227, "sampling/sampling_logp_difference/mean": 0.015955043956637383, "step": 2324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 252.6875, "completions/mean_terminated_length": 252.6875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.4211464524269104, "epoch": 2.849264705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.7336080134410482, "kl": 0.022172071039676666, "learning_rate": 8.16290521965457e-09, "loss": 0.0084, "num_tokens": 99473432.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.6556731462478638, "sampling/importance_sampling_ratio/mean": 0.9997988343238831, "sampling/importance_sampling_ratio/min": 0.6071981191635132, "sampling/sampling_logp_difference/max": 0.5042076110839844, "sampling/sampling_logp_difference/mean": 0.01485019363462925, "step": 2325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 238.90625, "completions/mean_terminated_length": 238.90625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.43450140953063965, "epoch": 2.8504901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.921285524698859, "kl": 0.02928030863404274, "learning_rate": 8.035205700685165e-09, "loss": 0.0119, "num_tokens": 99505410.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002546310424805, "sampling/importance_sampling_ratio/min": 0.6121808290481567, "sampling/sampling_logp_difference/max": 1.0292255878448486, "sampling/sampling_logp_difference/mean": 0.01571078598499298, "step": 2326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 341.671875, "completions/mean_terminated_length": 341.671875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.4671202600002289, "epoch": 2.8517156862745097, "frac_reward_zero_std": 0.75, "grad_norm": 0.5027744064763844, "kl": 0.017127471044659615, "learning_rate": 7.908504839081342e-09, "loss": 0.0241, "num_tokens": 99544525.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4756743907928467, "sampling/importance_sampling_ratio/mean": 1.0001815557479858, "sampling/importance_sampling_ratio/min": 0.0630580261349678, "sampling/sampling_logp_difference/max": 2.763700008392334, "sampling/sampling_logp_difference/mean": 0.014796015806496143, "step": 2327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 269.78125, "completions/mean_terminated_length": 269.78125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.44914090633392334, "epoch": 2.8529411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 0.6359593078435175, "kl": 0.024053193628787994, "learning_rate": 7.7828028920377e-09, "loss": -0.0157, "num_tokens": 99583567.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.621127963066101, "sampling/importance_sampling_ratio/mean": 1.0001006126403809, "sampling/importance_sampling_ratio/min": 0.406927227973938, "sampling/sampling_logp_difference/max": 0.8991209268569946, "sampling/sampling_logp_difference/mean": 0.01529725082218647, "step": 2328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 344.578125, "completions/mean_terminated_length": 344.578125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.4130982756614685, "epoch": 2.8541666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 0.6055425997964272, "kl": 0.017590370029211044, "learning_rate": 7.658100114721344e-09, "loss": 0.0012, "num_tokens": 99622884.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.799952745437622, "sampling/importance_sampling_ratio/mean": 1.0002295970916748, "sampling/importance_sampling_ratio/min": 0.40359652042388916, "sampling/sampling_logp_difference/max": 0.9073396921157837, "sampling/sampling_logp_difference/mean": 0.012896635569632053, "step": 2329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 405.65625, "completions/mean_terminated_length": 405.65625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.6202362179756165, "epoch": 2.855392156862745, "frac_reward_zero_std": 0.75, "grad_norm": 0.5689235822143718, "kl": 0.023454347625374794, "learning_rate": 7.534396760270956e-09, "loss": 0.072, "num_tokens": 99670494.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.724532127380371, "sampling/importance_sampling_ratio/mean": 0.9998660087585449, "sampling/importance_sampling_ratio/min": 0.6398389339447021, "sampling/sampling_logp_difference/max": 0.5449557304382324, "sampling/sampling_logp_difference/mean": 0.017109744250774384, "step": 2330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 386.109375, "completions/mean_terminated_length": 386.109375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.45018285512924194, "epoch": 2.8566176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.010074391272218878, "kl": 0.014103177934885025, "learning_rate": 7.411693079796499e-09, "loss": 0.0001, "num_tokens": 99712741.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.370207667350769, "sampling/importance_sampling_ratio/mean": 0.9996811747550964, "sampling/importance_sampling_ratio/min": 0.7097377777099609, "sampling/sampling_logp_difference/max": 0.34285974502563477, "sampling/sampling_logp_difference/mean": 0.013637324795126915, "step": 2331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 285.296875, "completions/mean_terminated_length": 285.296875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.49419939517974854, "epoch": 2.857843137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.01959768752046551, "kl": 0.033901408314704895, "learning_rate": 7.289989322378731e-09, "loss": 0.0003, "num_tokens": 99748952.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.744460940361023, "sampling/importance_sampling_ratio/mean": 1.0000650882720947, "sampling/importance_sampling_ratio/min": 0.6808500289916992, "sampling/sampling_logp_difference/max": 0.5564455986022949, "sampling/sampling_logp_difference/mean": 0.015564914792776108, "step": 2332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1535.0, "completions/max_terminated_length": 1535.0, "completions/mean_length": 527.59375, "completions/mean_terminated_length": 527.59375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.39338135719299316, "epoch": 2.8590686274509802, "frac_reward_zero_std": 0.75, "grad_norm": 0.4546005266574657, "kl": 0.01623457856476307, "learning_rate": 7.169285735068531e-09, "loss": 0.0739, "num_tokens": 99802622.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.562448263168335, "sampling/importance_sampling_ratio/mean": 1.0001939535140991, "sampling/importance_sampling_ratio/min": 0.5363268852233887, "sampling/sampling_logp_difference/max": 0.6230114698410034, "sampling/sampling_logp_difference/mean": 0.011559432372450829, "step": 2333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 294.1875, "completions/mean_terminated_length": 294.1875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.5333156585693359, "epoch": 2.860294117647059, "frac_reward_zero_std": 0.75, "grad_norm": 0.662064147221071, "kl": 0.03392498195171356, "learning_rate": 7.049582562886513e-09, "loss": 0.0162, "num_tokens": 99835002.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.9966349601745605, "sampling/importance_sampling_ratio/mean": 0.9996651411056519, "sampling/importance_sampling_ratio/min": 0.6713066101074219, "sampling/sampling_logp_difference/max": 0.6914632320404053, "sampling/sampling_logp_difference/mean": 0.017372693866491318, "step": 2334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2020.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 582.0625, "completions/mean_terminated_length": 582.0625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.5065618753433228, "epoch": 2.861519607843137, "frac_reward_zero_std": 0.25, "grad_norm": 0.7365426523025286, "kl": 0.017472386360168457, "learning_rate": 6.930880048822529e-09, "loss": -0.056, "num_tokens": 99889294.0, "reward": -0.03125, "reward_std": 0.5281128883361816, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.464892864227295, "sampling/importance_sampling_ratio/mean": 1.0000815391540527, "sampling/importance_sampling_ratio/min": 0.6075552105903625, "sampling/sampling_logp_difference/max": 0.49831223487854004, "sampling/sampling_logp_difference/mean": 0.015307561494410038, "step": 2335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 255.6875, "completions/mean_terminated_length": 255.6875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.5882181525230408, "epoch": 2.8627450980392157, "frac_reward_zero_std": 0.5, "grad_norm": 1.0925238685884742, "kl": 0.06095802038908005, "learning_rate": 6.813178433835221e-09, "loss": -0.0461, "num_tokens": 99917674.0, "reward": 0.84375, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6273785829544067, "sampling/importance_sampling_ratio/mean": 1.0000228881835938, "sampling/importance_sampling_ratio/min": 0.6445044279098511, "sampling/sampling_logp_difference/max": 0.4869704246520996, "sampling/sampling_logp_difference/mean": 0.017400918528437614, "step": 2336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1258.0, "completions/max_terminated_length": 1258.0, "completions/mean_length": 357.296875, "completions/mean_terminated_length": 357.296875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.43740779161453247, "epoch": 2.8639705882352944, "frac_reward_zero_std": 0.75, "grad_norm": 0.6524698104145088, "kl": 0.01961132138967514, "learning_rate": 6.696477956851354e-09, "loss": -0.0152, "num_tokens": 99961325.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.608422875404358, "sampling/importance_sampling_ratio/mean": 0.9996290802955627, "sampling/importance_sampling_ratio/min": 0.6288003325462341, "sampling/sampling_logp_difference/max": 0.4752540588378906, "sampling/sampling_logp_difference/mean": 0.013892512768507004, "step": 2337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 325.171875, "completions/mean_terminated_length": 325.171875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.44927918910980225, "epoch": 2.8651960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 0.5406981012732434, "kl": 0.02062533050775528, "learning_rate": 6.580778854765489e-09, "loss": 0.0361, "num_tokens": 100002568.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.3665498495101929, "sampling/importance_sampling_ratio/mean": 0.9999678134918213, "sampling/importance_sampling_ratio/min": 0.6255471110343933, "sampling/sampling_logp_difference/max": 0.4691286087036133, "sampling/sampling_logp_difference/mean": 0.013860903680324554, "step": 2338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 926.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 425.078125, "completions/mean_terminated_length": 425.078125, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "entropy": 0.44974464178085327, "epoch": 2.866421568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.5408818861956278, "kl": 0.013742710463702679, "learning_rate": 6.4660813624395905e-09, "loss": 0.0073, "num_tokens": 100050397.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.3389277458190918, "sampling/importance_sampling_ratio/mean": 0.9998536705970764, "sampling/importance_sampling_ratio/min": 0.6056217551231384, "sampling/sampling_logp_difference/max": 0.5014996528625488, "sampling/sampling_logp_difference/mean": 0.013618985190987587, "step": 2339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1432.0, "completions/max_terminated_length": 1432.0, "completions/mean_length": 451.984375, "completions/mean_terminated_length": 451.984375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.4387035369873047, "epoch": 2.8676470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.5103908519241876, "kl": 0.017873995006084442, "learning_rate": 6.3523857127021905e-09, "loss": 0.0188, "num_tokens": 100098364.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.875913143157959, "sampling/importance_sampling_ratio/mean": 1.000211477279663, "sampling/importance_sampling_ratio/min": 0.6210402846336365, "sampling/sampling_logp_difference/max": 0.6290955543518066, "sampling/sampling_logp_difference/mean": 0.01354280672967434, "step": 2340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 362.03125, "completions/mean_terminated_length": 362.03125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.5200740098953247, "epoch": 2.868872549019608, "frac_reward_zero_std": 0.5, "grad_norm": 0.7694818875084687, "kl": 0.028647197410464287, "learning_rate": 6.239692136348284e-09, "loss": 0.0034, "num_tokens": 100142094.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4080097675323486, "sampling/importance_sampling_ratio/mean": 1.0002400875091553, "sampling/importance_sampling_ratio/min": 0.710779070854187, "sampling/sampling_logp_difference/max": 0.342177152633667, "sampling/sampling_logp_difference/mean": 0.015282141044735909, "step": 2341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1586.0, "completions/max_terminated_length": 1586.0, "completions/mean_length": 421.015625, "completions/mean_terminated_length": 421.015625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.4624200761318207, "epoch": 2.8700980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.0090672371654275, "kl": 0.015695903450250626, "learning_rate": 6.12800086213866e-09, "loss": 0.0002, "num_tokens": 100188399.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3267065286636353, "sampling/importance_sampling_ratio/mean": 1.0001215934753418, "sampling/importance_sampling_ratio/min": 0.736450731754303, "sampling/sampling_logp_difference/max": 0.30591297149658203, "sampling/sampling_logp_difference/mean": 0.013913612812757492, "step": 2342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1036.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 456.6875, "completions/mean_terminated_length": 456.6875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.6845259666442871, "epoch": 2.8713235294117645, "frac_reward_zero_std": 0.25, "grad_norm": 0.8633232556574345, "kl": 0.03195241838693619, "learning_rate": 6.017312116799566e-09, "loss": -0.0593, "num_tokens": 100234667.0, "reward": 0.59375, "reward_std": 0.5457825064659119, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.50296950340271, "sampling/importance_sampling_ratio/mean": 0.9998782873153687, "sampling/importance_sampling_ratio/min": 0.582205057144165, "sampling/sampling_logp_difference/max": 0.5409326553344727, "sampling/sampling_logp_difference/mean": 0.016785066574811935, "step": 2343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 212.5, "completions/mean_terminated_length": 212.5, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.39177605509757996, "epoch": 2.872549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.01707036476892173, "kl": 0.021115006878972054, "learning_rate": 5.907626125022158e-09, "loss": 0.0002, "num_tokens": 100264779.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8070577383041382, "sampling/importance_sampling_ratio/mean": 1.0006194114685059, "sampling/importance_sampling_ratio/min": 0.6545250415802002, "sampling/sampling_logp_difference/max": 0.5916999578475952, "sampling/sampling_logp_difference/mean": 0.016103532165288925, "step": 2344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 312.03125, "completions/mean_terminated_length": 312.03125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.4210081696510315, "epoch": 2.873774509803922, "frac_reward_zero_std": 1.0, "grad_norm": 0.011607031717638593, "kl": 0.015602907165884972, "learning_rate": 5.798943109461995e-09, "loss": 0.0002, "num_tokens": 100301069.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3744248151779175, "sampling/importance_sampling_ratio/mean": 1.0001614093780518, "sampling/importance_sampling_ratio/min": 0.6600196361541748, "sampling/sampling_logp_difference/max": 0.4154857397079468, "sampling/sampling_logp_difference/mean": 0.012930480763316154, "step": 2345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 296.125, "completions/mean_terminated_length": 296.125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.4276294708251953, "epoch": 2.875, "frac_reward_zero_std": 1.0, "grad_norm": 0.021682279370563926, "kl": 0.020578129217028618, "learning_rate": 5.691263290738824e-09, "loss": 0.0002, "num_tokens": 100339621.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5989035367965698, "sampling/importance_sampling_ratio/mean": 0.999875545501709, "sampling/importance_sampling_ratio/min": 0.39150330424308777, "sampling/sampling_logp_difference/max": 0.9377613067626953, "sampling/sampling_logp_difference/mean": 0.015631040558218956, "step": 2346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2054.0, "completions/max_terminated_length": 2054.0, "completions/mean_length": 495.90625, "completions/mean_terminated_length": 495.90625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.4042538106441498, "epoch": 2.876225490196078, "frac_reward_zero_std": 0.75, "grad_norm": 0.5059552801593324, "kl": 0.012142207473516464, "learning_rate": 5.5845868874357385e-09, "loss": -0.0011, "num_tokens": 100392751.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4418950080871582, "sampling/importance_sampling_ratio/mean": 1.0002624988555908, "sampling/importance_sampling_ratio/min": 0.6701191067695618, "sampling/sampling_logp_difference/max": 0.4002997875213623, "sampling/sampling_logp_difference/mean": 0.012515656650066376, "step": 2347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 346.78125, "completions/mean_terminated_length": 346.78125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.5432413816452026, "epoch": 2.877450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.01299272848543247, "kl": 0.020082704722881317, "learning_rate": 5.4789141160991314e-09, "loss": 0.0002, "num_tokens": 100438145.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4132767915725708, "sampling/importance_sampling_ratio/mean": 0.999943196773529, "sampling/importance_sampling_ratio/min": 0.6919863224029541, "sampling/sampling_logp_difference/max": 0.36818909645080566, "sampling/sampling_logp_difference/mean": 0.016694845631718636, "step": 2348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1127.0, "completions/max_terminated_length": 1127.0, "completions/mean_length": 345.796875, "completions/mean_terminated_length": 345.796875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.5404074192047119, "epoch": 2.8786764705882355, "frac_reward_zero_std": 0.25, "grad_norm": 1.0597210224569893, "kl": 0.03607857599854469, "learning_rate": 5.374245191238025e-09, "loss": -0.0058, "num_tokens": 100475252.0, "reward": 0.5, "reward_std": 0.4973389506340027, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6139254570007324, "sampling/importance_sampling_ratio/mean": 0.9997310638427734, "sampling/importance_sampling_ratio/min": 0.6121941804885864, "sampling/sampling_logp_difference/max": 0.49070578813552856, "sampling/sampling_logp_difference/mean": 0.017355889081954956, "step": 2349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1159.0, "completions/max_terminated_length": 1159.0, "completions/mean_length": 372.9375, "completions/mean_terminated_length": 372.9375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.4875870943069458, "epoch": 2.8799019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.405063940045188, "kl": 0.025961831212043762, "learning_rate": 5.270580325323681e-09, "loss": -0.016, "num_tokens": 100518672.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.5383939743041992, "sampling/importance_sampling_ratio/mean": 0.9998830556869507, "sampling/importance_sampling_ratio/min": 0.6055413484573364, "sampling/sampling_logp_difference/max": 0.5016324520111084, "sampling/sampling_logp_difference/mean": 0.012868352234363556, "step": 2350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 379.796875, "completions/mean_terminated_length": 379.796875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.40388914942741394, "epoch": 2.881127450980392, "frac_reward_zero_std": 0.75, "grad_norm": 0.5330628472348512, "kl": 0.017106374725699425, "learning_rate": 5.167919728789271e-09, "loss": 0.0172, "num_tokens": 100559683.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.3669891357421875, "sampling/importance_sampling_ratio/mean": 0.9998323917388916, "sampling/importance_sampling_ratio/min": 0.6620159149169922, "sampling/sampling_logp_difference/max": 0.4124656915664673, "sampling/sampling_logp_difference/mean": 0.012291660532355309, "step": 2351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 398.609375, "completions/mean_terminated_length": 398.609375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.35022321343421936, "epoch": 2.8823529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.010573693993827836, "kl": 0.013440845534205437, "learning_rate": 5.0662636100292086e-09, "loss": 0.0001, "num_tokens": 100599834.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5686079263687134, "sampling/importance_sampling_ratio/mean": 1.0002094507217407, "sampling/importance_sampling_ratio/min": 0.5174283981323242, "sampling/sampling_logp_difference/max": 0.6588841080665588, "sampling/sampling_logp_difference/mean": 0.012268518097698689, "step": 2352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2158.0, "completions/max_terminated_length": 2158.0, "completions/mean_length": 556.4375, "completions/mean_terminated_length": 556.4375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.47329506278038025, "epoch": 2.883578431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.4493309182336163, "kl": 0.015724508091807365, "learning_rate": 4.965612175399092e-09, "loss": -0.0422, "num_tokens": 100659478.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.600734829902649, "sampling/importance_sampling_ratio/mean": 0.9998318552970886, "sampling/importance_sampling_ratio/min": 0.6241998672485352, "sampling/sampling_logp_difference/max": 0.4712846279144287, "sampling/sampling_logp_difference/mean": 0.012900697067379951, "step": 2353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 434.796875, "completions/mean_terminated_length": 434.796875, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "entropy": 0.4567154347896576, "epoch": 2.8848039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 0.5362796552362281, "kl": 0.022682778537273407, "learning_rate": 4.865965629214819e-09, "loss": -0.008, "num_tokens": 100706665.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.9877583980560303, "sampling/importance_sampling_ratio/mean": 0.9999185800552368, "sampling/importance_sampling_ratio/min": 0.4316330850124359, "sampling/sampling_logp_difference/max": 0.840179443359375, "sampling/sampling_logp_difference/mean": 0.01402033306658268, "step": 2354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 786.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 340.875, "completions/mean_terminated_length": 340.875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.420245885848999, "epoch": 2.8860294117647056, "frac_reward_zero_std": 0.75, "grad_norm": 0.5400036775150036, "kl": 0.02046874910593033, "learning_rate": 4.767324173752696e-09, "loss": -0.0233, "num_tokens": 100743713.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.2826886177062988, "sampling/importance_sampling_ratio/mean": 1.0000203847885132, "sampling/importance_sampling_ratio/min": 0.6186538934707642, "sampling/sampling_logp_difference/max": 0.4802093505859375, "sampling/sampling_logp_difference/mean": 0.012464713305234909, "step": 2355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1319.0, "completions/max_terminated_length": 1319.0, "completions/mean_length": 413.765625, "completions/mean_terminated_length": 413.765625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.462006151676178, "epoch": 2.8872549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 0.43969871832106505, "kl": 0.024336105212569237, "learning_rate": 4.669688009248607e-09, "loss": 0.0109, "num_tokens": 100790066.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.5612155199050903, "sampling/importance_sampling_ratio/mean": 0.9998504519462585, "sampling/importance_sampling_ratio/min": 0.6281097531318665, "sampling/sampling_logp_difference/max": 0.4650404453277588, "sampling/sampling_logp_difference/mean": 0.013461563736200333, "step": 2356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 310.96875, "completions/mean_terminated_length": 310.96875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.3789951205253601, "epoch": 2.888480392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.013111065974960652, "kl": 0.01851443015038967, "learning_rate": 4.5730573338976786e-09, "loss": 0.0002, "num_tokens": 100825088.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4256958961486816, "sampling/importance_sampling_ratio/mean": 1.0000756978988647, "sampling/importance_sampling_ratio/min": 0.6782644987106323, "sampling/sampling_logp_difference/max": 0.3882179260253906, "sampling/sampling_logp_difference/mean": 0.013241841457784176, "step": 2357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1642.0, "completions/max_terminated_length": 1642.0, "completions/mean_length": 534.109375, "completions/mean_terminated_length": 534.109375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.43033304810523987, "epoch": 2.889705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 0.5597267115996701, "kl": 0.020393244922161102, "learning_rate": 4.477432343854226e-09, "loss": -0.03, "num_tokens": 100882279.0, "reward": 0.15625, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.7263907194137573, "sampling/importance_sampling_ratio/mean": 1.0000274181365967, "sampling/importance_sampling_ratio/min": 0.6177425384521484, "sampling/sampling_logp_difference/max": 0.5460329055786133, "sampling/sampling_logp_difference/mean": 0.013729047030210495, "step": 2358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 377.453125, "completions/mean_terminated_length": 377.453125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.34829071164131165, "epoch": 2.8909313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 0.5925239704550483, "kl": 0.016305724158883095, "learning_rate": 4.382813233230698e-09, "loss": 0.0088, "num_tokens": 100923060.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6315641403198242, "sampling/importance_sampling_ratio/mean": 0.9998087882995605, "sampling/importance_sampling_ratio/min": 0.6368663311004639, "sampling/sampling_logp_difference/max": 0.48953914642333984, "sampling/sampling_logp_difference/mean": 0.01154109463095665, "step": 2359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1804.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 419.4375, "completions/mean_terminated_length": 419.4375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.47586923837661743, "epoch": 2.892156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.5981641920665849, "kl": 0.01892945170402527, "learning_rate": 4.289200194098119e-09, "loss": 0.0143, "num_tokens": 100969872.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.3300493955612183, "sampling/importance_sampling_ratio/mean": 0.9996912479400635, "sampling/importance_sampling_ratio/min": 0.6285139918327332, "sampling/sampling_logp_difference/max": 0.46439695358276367, "sampling/sampling_logp_difference/mean": 0.014153093099594116, "step": 2360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1665.0, "completions/max_terminated_length": 1665.0, "completions/mean_length": 452.09375, "completions/mean_terminated_length": 452.09375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.6261996030807495, "epoch": 2.8933823529411766, "frac_reward_zero_std": 0.0, "grad_norm": 0.9893130145046235, "kl": 0.050888556987047195, "learning_rate": 4.196593416484873e-09, "loss": 0.0157, "num_tokens": 101015382.0, "reward": 0.21875, "reward_std": 0.6331988573074341, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.4491047859191895, "sampling/importance_sampling_ratio/mean": 1.000075340270996, "sampling/importance_sampling_ratio/min": 0.6289620995521545, "sampling/sampling_logp_difference/max": 0.4636843204498291, "sampling/sampling_logp_difference/mean": 0.016791917383670807, "step": 2361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1078.0, "completions/max_terminated_length": 1078.0, "completions/mean_length": 398.828125, "completions/mean_terminated_length": 398.828125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.45048823952674866, "epoch": 2.894607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 0.4449634449864983, "kl": 0.018764588981866837, "learning_rate": 4.104993088376974e-09, "loss": 0.0164, "num_tokens": 101056091.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6479456424713135, "sampling/importance_sampling_ratio/mean": 1.0000815391540527, "sampling/importance_sampling_ratio/min": 0.7063596844673157, "sampling/sampling_logp_difference/max": 0.49952948093414307, "sampling/sampling_logp_difference/mean": 0.013864700682461262, "step": 2362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 931.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 457.46875, "completions/mean_terminated_length": 457.46875, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.585579514503479, "epoch": 2.8958333333333335, "frac_reward_zero_std": 0.5, "grad_norm": 0.7553140684636795, "kl": 0.02793554589152336, "learning_rate": 4.0143993957171826e-09, "loss": -0.0126, "num_tokens": 101110121.0, "reward": -0.125, "reward_std": 0.481805682182312, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.8437440395355225, "sampling/importance_sampling_ratio/mean": 1.0004198551177979, "sampling/importance_sampling_ratio/min": 0.7053361535072327, "sampling/sampling_logp_difference/max": 0.6117982864379883, "sampling/sampling_logp_difference/mean": 0.015125397592782974, "step": 2363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 323.0625, "completions/mean_terminated_length": 323.0625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.42233872413635254, "epoch": 2.8970588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.012226183860073028, "kl": 0.014871018007397652, "learning_rate": 3.924812522404952e-09, "loss": 0.0002, "num_tokens": 101153421.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9747731685638428, "sampling/importance_sampling_ratio/mean": 1.0001856088638306, "sampling/importance_sampling_ratio/min": 0.655364453792572, "sampling/sampling_logp_difference/max": 0.6804535388946533, "sampling/sampling_logp_difference/mean": 0.014234799891710281, "step": 2364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1277.0, "completions/max_terminated_length": 1277.0, "completions/mean_length": 580.328125, "completions/mean_terminated_length": 580.328125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.4147486686706543, "epoch": 2.8982843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 0.4486423856040495, "kl": 0.01813938468694687, "learning_rate": 3.836232650296034e-09, "loss": -0.0547, "num_tokens": 101209794.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4651553630828857, "sampling/importance_sampling_ratio/mean": 1.0000543594360352, "sampling/importance_sampling_ratio/min": 0.6371139287948608, "sampling/sampling_logp_difference/max": 0.4508068561553955, "sampling/sampling_logp_difference/mean": 0.011888746172189713, "step": 2365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 336.859375, "completions/mean_terminated_length": 336.859375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.5970066785812378, "epoch": 2.8995098039215685, "frac_reward_zero_std": 0.5, "grad_norm": 0.9279403296012516, "kl": 0.03471086919307709, "learning_rate": 3.748659959201928e-09, "loss": 0.0157, "num_tokens": 101248121.0, "reward": 0.6875, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.4275636672973633, "sampling/importance_sampling_ratio/mean": 0.9997364282608032, "sampling/importance_sampling_ratio/min": 0.5897160768508911, "sampling/sampling_logp_difference/max": 0.5281140804290771, "sampling/sampling_logp_difference/mean": 0.017823277041316032, "step": 2366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 299.4375, "completions/mean_terminated_length": 299.4375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.4853855073451996, "epoch": 2.900735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.1259953643801917, "kl": 0.022653069347143173, "learning_rate": 3.6620946268896556e-09, "loss": 0.0017, "num_tokens": 101281813.0, "reward": 0.71875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.6216003894805908, "sampling/importance_sampling_ratio/mean": 0.9999677538871765, "sampling/importance_sampling_ratio/min": 0.6392384171485901, "sampling/sampling_logp_difference/max": 0.4834134578704834, "sampling/sampling_logp_difference/mean": 0.015542672015726566, "step": 2367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 323.125, "completions/mean_terminated_length": 323.125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.3951762914657593, "epoch": 2.9019607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 0.5631749861909406, "kl": 0.017375687137246132, "learning_rate": 3.5765368290813223e-09, "loss": -0.0214, "num_tokens": 101321517.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4413392543792725, "sampling/importance_sampling_ratio/mean": 0.9998010396957397, "sampling/importance_sampling_ratio/min": 0.5128399133682251, "sampling/sampling_logp_difference/max": 0.667791485786438, "sampling/sampling_logp_difference/mean": 0.012127013877034187, "step": 2368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 848.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 372.96875, "completions/mean_terminated_length": 372.96875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.5377576351165771, "epoch": 2.903186274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.5808931204709321, "kl": 0.02451283298432827, "learning_rate": 3.491986739453889e-09, "loss": 0.0074, "num_tokens": 101365547.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5475584268569946, "sampling/importance_sampling_ratio/mean": 0.9999323487281799, "sampling/importance_sampling_ratio/min": 0.5070363879203796, "sampling/sampling_logp_difference/max": 0.6791725158691406, "sampling/sampling_logp_difference/mean": 0.01629979908466339, "step": 2369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 282.328125, "completions/mean_terminated_length": 282.328125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.3158482015132904, "epoch": 2.9044117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.012815212578498793, "kl": 0.014528009109199047, "learning_rate": 3.4084445296386767e-09, "loss": 0.0001, "num_tokens": 101403936.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.455664873123169, "sampling/importance_sampling_ratio/mean": 1.0002117156982422, "sampling/importance_sampling_ratio/min": 0.6630069613456726, "sampling/sampling_logp_difference/max": 0.41096973419189453, "sampling/sampling_logp_difference/mean": 0.012418247759342194, "step": 2370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 347.5, "completions/mean_terminated_length": 347.5, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.45933470129966736, "epoch": 2.905637254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.012337558633712956, "kl": 0.016666121780872345, "learning_rate": 3.3259103692209745e-09, "loss": 0.0002, "num_tokens": 101443536.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.525689959526062, "sampling/importance_sampling_ratio/mean": 0.9997974634170532, "sampling/importance_sampling_ratio/min": 0.7071335911750793, "sampling/sampling_logp_difference/max": 0.42244672775268555, "sampling/sampling_logp_difference/mean": 0.014951761811971664, "step": 2371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1538.0, "completions/max_terminated_length": 1538.0, "completions/mean_length": 472.71875, "completions/mean_terminated_length": 472.71875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.4698610305786133, "epoch": 2.906862745098039, "frac_reward_zero_std": 0.5, "grad_norm": 0.8224801373076821, "kl": 0.02003704383969307, "learning_rate": 3.2443844257400434e-09, "loss": 0.0628, "num_tokens": 101497550.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5686978101730347, "sampling/importance_sampling_ratio/mean": 1.000051498413086, "sampling/importance_sampling_ratio/min": 0.49542471766471863, "sampling/sampling_logp_difference/max": 0.7023398876190186, "sampling/sampling_logp_difference/mean": 0.015213597565889359, "step": 2372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 855.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 358.890625, "completions/mean_terminated_length": 358.890625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.4917612075805664, "epoch": 2.9080882352941178, "frac_reward_zero_std": 0.5, "grad_norm": 0.8436931988869769, "kl": 0.03277352452278137, "learning_rate": 3.163866864688336e-09, "loss": -0.0493, "num_tokens": 101539287.0, "reward": -0.34375, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": -0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.8891140222549438, "sampling/importance_sampling_ratio/mean": 0.9998376965522766, "sampling/importance_sampling_ratio/min": 0.5913247466087341, "sampling/sampling_logp_difference/max": 0.6361079216003418, "sampling/sampling_logp_difference/mean": 0.015099632553756237, "step": 2373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 391.828125, "completions/mean_terminated_length": 391.828125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.43714913725852966, "epoch": 2.909313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.011171177111290356, "kl": 0.016686713322997093, "learning_rate": 3.0843578495113877e-09, "loss": 0.0002, "num_tokens": 101583180.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3313146829605103, "sampling/importance_sampling_ratio/mean": 0.9998319149017334, "sampling/importance_sampling_ratio/min": 0.5100737810134888, "sampling/sampling_logp_difference/max": 0.6731998324394226, "sampling/sampling_logp_difference/mean": 0.013805728405714035, "step": 2374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 318.90625, "completions/mean_terminated_length": 318.90625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.527111291885376, "epoch": 2.9105392156862746, "frac_reward_zero_std": 0.25, "grad_norm": 1.0586789767484042, "kl": 0.05068255960941315, "learning_rate": 3.0058575416073707e-09, "loss": -0.0025, "num_tokens": 101619702.0, "reward": 0.5, "reward_std": 0.7054125070571899, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4035557508468628, "sampling/importance_sampling_ratio/mean": 1.0004234313964844, "sampling/importance_sampling_ratio/min": 0.6519959568977356, "sampling/sampling_logp_difference/max": 0.4277169704437256, "sampling/sampling_logp_difference/mean": 0.016016999259591103, "step": 2375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 311.359375, "completions/mean_terminated_length": 311.359375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.44835978746414185, "epoch": 2.911764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.7982488664982114, "kl": 0.022948503494262695, "learning_rate": 2.9283661003270952e-09, "loss": -0.0181, "num_tokens": 101657757.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.9153026342391968, "sampling/importance_sampling_ratio/mean": 0.9996403455734253, "sampling/importance_sampling_ratio/min": 0.6044558882713318, "sampling/sampling_logp_difference/max": 0.6498756408691406, "sampling/sampling_logp_difference/mean": 0.015696104615926743, "step": 2376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 330.734375, "completions/mean_terminated_length": 330.734375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.44945186376571655, "epoch": 2.9129901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.5640226698231396, "kl": 0.022824298590421677, "learning_rate": 2.851883682973233e-09, "loss": 0.0064, "num_tokens": 101699596.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.4544235467910767, "sampling/importance_sampling_ratio/mean": 0.9998728036880493, "sampling/importance_sampling_ratio/min": 0.37515220046043396, "sampling/sampling_logp_difference/max": 0.9804234504699707, "sampling/sampling_logp_difference/mean": 0.013840720057487488, "step": 2377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 320.46875, "completions/mean_terminated_length": 320.46875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.42190656065940857, "epoch": 2.9142156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.028683388645002274, "kl": 0.029486149549484253, "learning_rate": 2.776410444800148e-09, "loss": 0.0003, "num_tokens": 101740346.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.401697039604187, "sampling/importance_sampling_ratio/mean": 0.9998193383216858, "sampling/importance_sampling_ratio/min": 0.5513423085212708, "sampling/sampling_logp_difference/max": 0.5953993797302246, "sampling/sampling_logp_difference/mean": 0.013330800458788872, "step": 2378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1354.0, "completions/max_terminated_length": 1354.0, "completions/mean_length": 404.96875, "completions/mean_terminated_length": 404.96875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.36291930079460144, "epoch": 2.9154411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.010738855999125503, "kl": 0.0148126520216465, "learning_rate": 2.701946539013844e-09, "loss": 0.0001, "num_tokens": 101783720.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2966948747634888, "sampling/importance_sampling_ratio/mean": 0.9999821186065674, "sampling/importance_sampling_ratio/min": 0.6641217470169067, "sampling/sampling_logp_difference/max": 0.40928971767425537, "sampling/sampling_logp_difference/mean": 0.011929331347346306, "step": 2379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1209.0, "completions/max_terminated_length": 1209.0, "completions/mean_length": 351.671875, "completions/mean_terminated_length": 351.671875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.49954572319984436, "epoch": 2.9166666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.14687159430561308, "kl": 0.019568584859371185, "learning_rate": 2.628492116771297e-09, "loss": 0.0002, "num_tokens": 101824771.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999896287918091, "sampling/importance_sampling_ratio/min": 0.07084600627422333, "sampling/sampling_logp_difference/max": 2.6472465991973877, "sampling/sampling_logp_difference/mean": 0.015569744631648064, "step": 2380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 271.078125, "completions/mean_terminated_length": 271.078125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.6893182396888733, "epoch": 2.917892156862745, "frac_reward_zero_std": 0.25, "grad_norm": 1.3306009198134419, "kl": 0.03732561320066452, "learning_rate": 2.556047327180344e-09, "loss": -0.0261, "num_tokens": 101858136.0, "reward": 0.0, "reward_std": 0.644389271736145, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3179235458374023, "sampling/importance_sampling_ratio/mean": 0.9993493556976318, "sampling/importance_sampling_ratio/min": 0.7305989265441895, "sampling/sampling_logp_difference/max": 0.31389057636260986, "sampling/sampling_logp_difference/mean": 0.02014678716659546, "step": 2381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 284.015625, "completions/mean_terminated_length": 284.015625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.5217045545578003, "epoch": 2.9191176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 0.6211525948275114, "kl": 0.04024787247180939, "learning_rate": 2.484612317299295e-09, "loss": 0.0067, "num_tokens": 101894185.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5015239715576172, "sampling/importance_sampling_ratio/mean": 0.999920129776001, "sampling/importance_sampling_ratio/min": 0.6268008947372437, "sampling/sampling_logp_difference/max": 0.46712636947631836, "sampling/sampling_logp_difference/mean": 0.01714622974395752, "step": 2382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 399.625, "completions/mean_terminated_length": 399.625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.4980427622795105, "epoch": 2.920343137254902, "frac_reward_zero_std": 0.25, "grad_norm": 0.9595886712568085, "kl": 0.03657931089401245, "learning_rate": 2.4141872321367107e-09, "loss": -0.005, "num_tokens": 101935809.0, "reward": 0.09375, "reward_std": 0.6331988573074341, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.8445580005645752, "sampling/importance_sampling_ratio/mean": 0.9997932314872742, "sampling/importance_sampling_ratio/min": 0.7602354884147644, "sampling/sampling_logp_difference/max": 0.6122397184371948, "sampling/sampling_logp_difference/mean": 0.01477772369980812, "step": 2383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 354.640625, "completions/mean_terminated_length": 354.640625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.5037837028503418, "epoch": 2.9215686274509802, "frac_reward_zero_std": 0.75, "grad_norm": 0.7392211488391562, "kl": 0.021039564162492752, "learning_rate": 2.344772214651014e-09, "loss": 0.036, "num_tokens": 101981498.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4343039989471436, "sampling/importance_sampling_ratio/mean": 1.00026535987854, "sampling/importance_sampling_ratio/min": 0.6546977162361145, "sampling/sampling_logp_difference/max": 0.42358171939849854, "sampling/sampling_logp_difference/mean": 0.015002001076936722, "step": 2384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1576.0, "completions/max_terminated_length": 1576.0, "completions/mean_length": 554.21875, "completions/mean_terminated_length": 554.21875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.3302261531352997, "epoch": 2.922794117647059, "frac_reward_zero_std": 0.75, "grad_norm": 0.5185514615906703, "kl": 0.00990782119333744, "learning_rate": 2.2763674057503235e-09, "loss": 0.0378, "num_tokens": 102041208.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5648448467254639, "sampling/importance_sampling_ratio/mean": 1.0000693798065186, "sampling/importance_sampling_ratio/min": 0.6826832294464111, "sampling/sampling_logp_difference/max": 0.44778668880462646, "sampling/sampling_logp_difference/mean": 0.009912814944982529, "step": 2385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 344.65625, "completions/mean_terminated_length": 344.65625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.43048086762428284, "epoch": 2.924019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.5790455889663629, "kl": 0.024607647210359573, "learning_rate": 2.20897294429212e-09, "loss": -0.0252, "num_tokens": 102080690.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3266682624816895, "sampling/importance_sampling_ratio/mean": 1.0001416206359863, "sampling/importance_sampling_ratio/min": 0.5260921716690063, "sampling/sampling_logp_difference/max": 0.6422789096832275, "sampling/sampling_logp_difference/mean": 0.013860352337360382, "step": 2386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 384.015625, "completions/mean_terminated_length": 384.015625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.5160247683525085, "epoch": 2.9252450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.6833996872211724, "kl": 0.02392728254199028, "learning_rate": 2.142588967082748e-09, "loss": 0.0399, "num_tokens": 102125139.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002415180206299, "sampling/importance_sampling_ratio/min": 0.6073929667472839, "sampling/sampling_logp_difference/max": 0.7450170516967773, "sampling/sampling_logp_difference/mean": 0.015752974897623062, "step": 2387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1147.0, "completions/max_terminated_length": 1147.0, "completions/mean_length": 418.75, "completions/mean_terminated_length": 418.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.4918231666088104, "epoch": 2.9264705882352944, "frac_reward_zero_std": 0.75, "grad_norm": 0.41934719220658917, "kl": 0.022952277213335037, "learning_rate": 2.0772156088776913e-09, "loss": -0.0007, "num_tokens": 102168227.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5111219882965088, "sampling/importance_sampling_ratio/mean": 1.000442624092102, "sampling/importance_sampling_ratio/min": 0.7166538834571838, "sampling/sampling_logp_difference/max": 0.41285240650177, "sampling/sampling_logp_difference/mean": 0.01631319150328636, "step": 2388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1132.0, "completions/max_terminated_length": 1132.0, "completions/mean_length": 459.625, "completions/mean_terminated_length": 459.625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.5766803026199341, "epoch": 2.9276960784313726, "frac_reward_zero_std": 0.5, "grad_norm": 0.7130935555023065, "kl": 0.03183315694332123, "learning_rate": 2.0128530023804656e-09, "loss": 0.0393, "num_tokens": 102217307.0, "reward": 0.3125, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.4495574235916138, "sampling/importance_sampling_ratio/mean": 1.000240683555603, "sampling/importance_sampling_ratio/min": 0.6595990657806396, "sampling/sampling_logp_difference/max": 0.4161231517791748, "sampling/sampling_logp_difference/mean": 0.015768205747008324, "step": 2389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1757.0, "completions/max_terminated_length": 1757.0, "completions/mean_length": 413.875, "completions/mean_terminated_length": 413.875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.4791301488876343, "epoch": 2.928921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 0.9133044056268732, "kl": 0.02137903869152069, "learning_rate": 1.9495012782433375e-09, "loss": 0.1817, "num_tokens": 102266403.0, "reward": 0.0625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.490382194519043, "sampling/importance_sampling_ratio/mean": 0.9999435544013977, "sampling/importance_sampling_ratio/min": 0.6361784338951111, "sampling/sampling_logp_difference/max": 0.45227622985839844, "sampling/sampling_logp_difference/mean": 0.01439676620066166, "step": 2390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 307.703125, "completions/mean_terminated_length": 307.703125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.38397425413131714, "epoch": 2.9301470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.01159763092845504, "kl": 0.020407523959875107, "learning_rate": 1.887160565066048e-09, "loss": 0.0002, "num_tokens": 102303376.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001131296157837, "sampling/importance_sampling_ratio/min": 0.6115626692771912, "sampling/sampling_logp_difference/max": 1.0462002754211426, "sampling/sampling_logp_difference/mean": 0.013601155951619148, "step": 2391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 402.765625, "completions/mean_terminated_length": 402.765625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.276860773563385, "epoch": 2.931372549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.009003511909639921, "kl": 0.014357026666402817, "learning_rate": 1.8258309893965374e-09, "loss": 0.0001, "num_tokens": 102351201.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003635883331299, "sampling/importance_sampling_ratio/min": 0.5704283118247986, "sampling/sampling_logp_difference/max": 0.8980274200439453, "sampling/sampling_logp_difference/mean": 0.00964096374809742, "step": 2392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 294.21875, "completions/mean_terminated_length": 294.21875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.3950996696949005, "epoch": 2.9325980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.012135216175438557, "kl": 0.018104471266269684, "learning_rate": 1.7655126757297744e-09, "loss": 0.0002, "num_tokens": 102388591.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.655230164527893, "sampling/importance_sampling_ratio/mean": 1.0004515647888184, "sampling/importance_sampling_ratio/min": 0.6625629663467407, "sampling/sampling_logp_difference/max": 0.5039401054382324, "sampling/sampling_logp_difference/mean": 0.013159911148250103, "step": 2393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 311.953125, "completions/mean_terminated_length": 311.953125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.5026039481163025, "epoch": 2.9338235294117645, "frac_reward_zero_std": 0.75, "grad_norm": 0.6313188514367479, "kl": 0.024290181696414948, "learning_rate": 1.7062057465082046e-09, "loss": -0.0059, "num_tokens": 102426684.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6218171119689941, "sampling/importance_sampling_ratio/mean": 0.999970555305481, "sampling/importance_sampling_ratio/min": 0.6583689451217651, "sampling/sampling_logp_difference/max": 0.4835472106933594, "sampling/sampling_logp_difference/mean": 0.015076215378940105, "step": 2394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 709.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 337.078125, "completions/mean_terminated_length": 337.078125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.4485880136489868, "epoch": 2.935049019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.010504199503866779, "kl": 0.015553511679172516, "learning_rate": 1.6479103221211377e-09, "loss": 0.0002, "num_tokens": 102467633.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998641610145569, "sampling/importance_sampling_ratio/min": 0.5753878355026245, "sampling/sampling_logp_difference/max": 0.9623703956604004, "sampling/sampling_logp_difference/mean": 0.01417599432170391, "step": 2395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 403.09375, "completions/mean_terminated_length": 403.09375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.4495130181312561, "epoch": 2.936274509803922, "frac_reward_zero_std": 0.75, "grad_norm": 0.5774512747027976, "kl": 0.0152653269469738, "learning_rate": 1.5906265209045254e-09, "loss": 0.0196, "num_tokens": 102511079.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.4487274885177612, "sampling/importance_sampling_ratio/mean": 1.0001089572906494, "sampling/importance_sampling_ratio/min": 0.6370663046836853, "sampling/sampling_logp_difference/max": 0.4508814811706543, "sampling/sampling_logp_difference/mean": 0.013456995598971844, "step": 2396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1744.0, "completions/max_terminated_length": 1744.0, "completions/mean_length": 404.375, "completions/mean_terminated_length": 404.375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.3465293049812317, "epoch": 2.9375, "frac_reward_zero_std": 0.75, "grad_norm": 0.6234005886696696, "kl": 0.01594289019703865, "learning_rate": 1.534354459140963e-09, "loss": -0.0563, "num_tokens": 102551151.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4563392400741577, "sampling/importance_sampling_ratio/mean": 0.9999030828475952, "sampling/importance_sampling_ratio/min": 0.5127838253974915, "sampling/sampling_logp_difference/max": 0.6679009199142456, "sampling/sampling_logp_difference/mean": 0.01130159292370081, "step": 2397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 293.984375, "completions/mean_terminated_length": 293.984375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.41846752166748047, "epoch": 2.938725490196078, "frac_reward_zero_std": 1.0, "grad_norm": 0.01722065049955717, "kl": 0.027315281331539154, "learning_rate": 1.4790942510590766e-09, "loss": 0.0003, "num_tokens": 102586878.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4628030061721802, "sampling/importance_sampling_ratio/mean": 1.0002981424331665, "sampling/importance_sampling_ratio/min": 0.6802482008934021, "sampling/sampling_logp_difference/max": 0.3852975368499756, "sampling/sampling_logp_difference/mean": 0.01503305695950985, "step": 2398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1650.0, "completions/max_terminated_length": 1650.0, "completions/mean_length": 352.328125, "completions/mean_terminated_length": 352.328125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.48209497332572937, "epoch": 2.939950980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.9679070845417004, "kl": 0.023282263427972794, "learning_rate": 1.4248460088335801e-09, "loss": -0.0195, "num_tokens": 102625875.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6008986234664917, "sampling/importance_sampling_ratio/mean": 1.0000529289245605, "sampling/importance_sampling_ratio/min": 0.5996454954147339, "sampling/sampling_logp_difference/max": 0.5114166736602783, "sampling/sampling_logp_difference/mean": 0.015698233619332314, "step": 2399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1181.0, "completions/max_terminated_length": 1181.0, "completions/mean_length": 485.796875, "completions/mean_terminated_length": 485.796875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.47003060579299927, "epoch": 2.9411764705882355, "frac_reward_zero_std": 0.75, "grad_norm": 0.5547575061554042, "kl": 0.017660362645983696, "learning_rate": 1.371609842585053e-09, "loss": -0.0055, "num_tokens": 102675494.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002777576446533, "sampling/importance_sampling_ratio/min": 0.5397968292236328, "sampling/sampling_logp_difference/max": 0.8371992111206055, "sampling/sampling_logp_difference/mean": 0.014440581202507019, "step": 2400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1107.0, "completions/max_terminated_length": 1107.0, "completions/mean_length": 420.34375, "completions/mean_terminated_length": 420.34375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.6927508115768433, "epoch": 2.9424019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 0.667382976562069, "kl": 0.03292712941765785, "learning_rate": 1.319385860379496e-09, "loss": -0.0001, "num_tokens": 102724700.0, "reward": 0.03125, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5566211938858032, "sampling/importance_sampling_ratio/mean": 0.9995379447937012, "sampling/importance_sampling_ratio/min": 0.5896794199943542, "sampling/sampling_logp_difference/max": 0.5281763076782227, "sampling/sampling_logp_difference/mean": 0.018674185499548912, "step": 2401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 452.0, "completions/mean_terminated_length": 452.0, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.5410182476043701, "epoch": 2.943627450980392, "frac_reward_zero_std": 0.5, "grad_norm": 0.6970243257201746, "kl": 0.027795832604169846, "learning_rate": 1.2681741682282754e-09, "loss": 0.013, "num_tokens": 102768268.0, "reward": 0.4375, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.4557604789733887, "sampling/importance_sampling_ratio/mean": 0.9999927282333374, "sampling/importance_sampling_ratio/min": 0.7469547390937805, "sampling/sampling_logp_difference/max": 0.3755284547805786, "sampling/sampling_logp_difference/mean": 0.014616057276725769, "step": 2402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1031.0, "completions/max_terminated_length": 1031.0, "completions/mean_length": 439.796875, "completions/mean_terminated_length": 439.796875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.4160004258155823, "epoch": 2.9448529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.657732317522406, "kl": 0.017930006608366966, "learning_rate": 1.217974870087901e-09, "loss": 0.0167, "num_tokens": 102814111.0, "reward": 0.3125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.5371429920196533, "sampling/importance_sampling_ratio/mean": 0.9999366998672485, "sampling/importance_sampling_ratio/min": 0.5414462089538574, "sampling/sampling_logp_difference/max": 0.6135115623474121, "sampling/sampling_logp_difference/mean": 0.012757278978824615, "step": 2403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 334.484375, "completions/mean_terminated_length": 334.484375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.600782573223114, "epoch": 2.946078431372549, "frac_reward_zero_std": 0.5, "grad_norm": 0.836416987353054, "kl": 0.04375313222408295, "learning_rate": 1.1687880678596939e-09, "loss": -0.012, "num_tokens": 102861038.0, "reward": 0.4375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.4565348625183105, "sampling/importance_sampling_ratio/mean": 0.9997093081474304, "sampling/importance_sampling_ratio/min": 0.6984063386917114, "sampling/sampling_logp_difference/max": 0.37606024742126465, "sampling/sampling_logp_difference/mean": 0.017171017825603485, "step": 2404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 297.578125, "completions/mean_terminated_length": 297.578125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.42156916856765747, "epoch": 2.9473039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.011638759678400114, "kl": 0.016041137278079987, "learning_rate": 1.1206138613898962e-09, "loss": 0.0002, "num_tokens": 102895267.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4058191776275635, "sampling/importance_sampling_ratio/mean": 0.9997466206550598, "sampling/importance_sampling_ratio/min": 0.6923624277114868, "sampling/sampling_logp_difference/max": 0.3676457405090332, "sampling/sampling_logp_difference/mean": 0.014277536422014236, "step": 2405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 883.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 346.28125, "completions/mean_terminated_length": 346.28125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.3311030864715576, "epoch": 2.9485294117647056, "frac_reward_zero_std": 1.0, "grad_norm": 0.01122283465911767, "kl": 0.014614081010222435, "learning_rate": 1.0734523484689507e-09, "loss": 0.0001, "num_tokens": 102937189.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999592900276184, "sampling/importance_sampling_ratio/min": 0.6818803548812866, "sampling/sampling_logp_difference/max": 0.702930212020874, "sampling/sampling_logp_difference/mean": 0.011665558442473412, "step": 2406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 355.296875, "completions/mean_terminated_length": 355.296875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.43234983086586, "epoch": 2.9497549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.010583544449507524, "kl": 0.013685423880815506, "learning_rate": 1.0273036248318324e-09, "loss": 0.0001, "num_tokens": 102977896.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5600674152374268, "sampling/importance_sampling_ratio/mean": 0.9999391436576843, "sampling/importance_sampling_ratio/min": 0.6602451801300049, "sampling/sampling_logp_difference/max": 0.4447290897369385, "sampling/sampling_logp_difference/mean": 0.013059508055448532, "step": 2407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1367.0, "completions/max_terminated_length": 1367.0, "completions/mean_length": 468.625, "completions/mean_terminated_length": 468.625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.48551443219184875, "epoch": 2.950980392156863, "frac_reward_zero_std": 0.5, "grad_norm": 0.6141092395832043, "kl": 0.026434605941176414, "learning_rate": 9.82167784157495e-10, "loss": 0.0621, "num_tokens": 103023232.0, "reward": 0.75, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.5290569067001343, "sampling/importance_sampling_ratio/mean": 1.0001602172851562, "sampling/importance_sampling_ratio/min": 0.5877507328987122, "sampling/sampling_logp_difference/max": 0.5314522981643677, "sampling/sampling_logp_difference/mean": 0.013840479776263237, "step": 2408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1154.0, "completions/max_terminated_length": 1154.0, "completions/mean_length": 380.9375, "completions/mean_terminated_length": 380.9375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.5217939615249634, "epoch": 2.952205882352941, "frac_reward_zero_std": 0.75, "grad_norm": 0.6063252408663188, "kl": 0.043998878449201584, "learning_rate": 9.380449180688143e-10, "loss": -0.0179, "num_tokens": 103064428.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6589281558990479, "sampling/importance_sampling_ratio/mean": 0.9998233318328857, "sampling/importance_sampling_ratio/min": 0.6284029483795166, "sampling/sampling_logp_difference/max": 0.506171703338623, "sampling/sampling_logp_difference/mean": 0.014948352240025997, "step": 2409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 958.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 444.234375, "completions/mean_terminated_length": 444.234375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.36924922466278076, "epoch": 2.9534313725490198, "frac_reward_zero_std": 1.0, "grad_norm": 0.008904830811480203, "kl": 0.011585619300603867, "learning_rate": 8.949351161324225e-10, "loss": 0.0001, "num_tokens": 103114715.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5044128894805908, "sampling/importance_sampling_ratio/mean": 1.000239610671997, "sampling/importance_sampling_ratio/min": 0.6341606974601746, "sampling/sampling_logp_difference/max": 0.45545291900634766, "sampling/sampling_logp_difference/mean": 0.01192442700266838, "step": 2410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 262.40625, "completions/mean_terminated_length": 262.40625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.48368459939956665, "epoch": 2.954656862745098, "frac_reward_zero_std": 0.25, "grad_norm": 1.200398886571262, "kl": 0.03357073292136192, "learning_rate": 8.528384658584853e-10, "loss": 0.0246, "num_tokens": 103146245.0, "reward": 0.53125, "reward_std": 0.519389271736145, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.437429428100586, "sampling/importance_sampling_ratio/mean": 0.9998984336853027, "sampling/importance_sampling_ratio/min": 0.6196638941764832, "sampling/sampling_logp_difference/max": 0.4785780906677246, "sampling/sampling_logp_difference/mean": 0.01601744256913662, "step": 2411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 346.015625, "completions/mean_terminated_length": 346.015625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.38142722845077515, "epoch": 2.9558823529411766, "frac_reward_zero_std": 0.75, "grad_norm": 0.5998761516347489, "kl": 0.018139764666557312, "learning_rate": 8.117550527005912e-10, "loss": -0.0369, "num_tokens": 103184454.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999741792678833, "sampling/importance_sampling_ratio/min": 0.4835608899593353, "sampling/sampling_logp_difference/max": 0.8325601816177368, "sampling/sampling_logp_difference/mean": 0.01214473508298397, "step": 2412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 286.265625, "completions/mean_terminated_length": 286.265625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.5261502861976624, "epoch": 2.957107843137255, "frac_reward_zero_std": 0.5, "grad_norm": 0.9837969874983923, "kl": 0.03378202021121979, "learning_rate": 7.716849600554188e-10, "loss": 0.0175, "num_tokens": 103219863.0, "reward": 0.34375, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.3590871095657349, "sampling/importance_sampling_ratio/mean": 0.9997561573982239, "sampling/importance_sampling_ratio/min": 0.6206278800964355, "sampling/sampling_logp_difference/max": 0.4770236015319824, "sampling/sampling_logp_difference/mean": 0.01606847532093525, "step": 2413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 304.921875, "completions/mean_terminated_length": 304.921875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.4422142803668976, "epoch": 2.9583333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 0.5513837822087675, "kl": 0.03219069540500641, "learning_rate": 7.326282692626806e-10, "loss": 0.0144, "num_tokens": 103253714.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.3074384927749634, "sampling/importance_sampling_ratio/mean": 0.9992960691452026, "sampling/importance_sampling_ratio/min": 0.6822564601898193, "sampling/sampling_logp_difference/max": 0.38234972953796387, "sampling/sampling_logp_difference/mean": 0.015003910288214684, "step": 2414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 488.59375, "completions/mean_terminated_length": 488.59375, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "entropy": 0.5303651690483093, "epoch": 2.9595588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 0.4369563118022752, "kl": 0.020159168168902397, "learning_rate": 6.945850596050684e-10, "loss": 0.0239, "num_tokens": 103301304.0, "reward": -0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": -0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.8319705724716187, "sampling/importance_sampling_ratio/mean": 1.0001235008239746, "sampling/importance_sampling_ratio/min": 0.4949535131454468, "sampling/sampling_logp_difference/max": 0.7032914161682129, "sampling/sampling_logp_difference/mean": 0.015811029821634293, "step": 2415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1245.0, "completions/max_terminated_length": 1245.0, "completions/mean_length": 326.671875, "completions/mean_terminated_length": 326.671875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.39553672075271606, "epoch": 2.9607843137254903, "frac_reward_zero_std": 1.0, "grad_norm": 0.011644260051958096, "kl": 0.01648957096040249, "learning_rate": 6.575554083078083e-10, "loss": 0.0001, "num_tokens": 103338691.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6129579544067383, "sampling/importance_sampling_ratio/mean": 0.9999135732650757, "sampling/importance_sampling_ratio/min": 0.4054082930088043, "sampling/sampling_logp_difference/max": 0.9028606414794922, "sampling/sampling_logp_difference/mean": 0.013375054113566875, "step": 2416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1108.0, "completions/max_terminated_length": 1108.0, "completions/mean_length": 352.75, "completions/mean_terminated_length": 352.75, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.359274297952652, "epoch": 2.9620098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.01717407098236982, "kl": 0.016016457229852676, "learning_rate": 6.215393905388278e-10, "loss": 0.0001, "num_tokens": 103377443.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6554591655731201, "sampling/importance_sampling_ratio/mean": 1.0001201629638672, "sampling/importance_sampling_ratio/min": 0.5910095572471619, "sampling/sampling_logp_difference/max": 0.5259230136871338, "sampling/sampling_logp_difference/mean": 0.012759308330714703, "step": 2417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 781.0, "completions/max_terminated_length": 781.0, "completions/mean_length": 378.09375, "completions/mean_terminated_length": 378.09375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.3821951150894165, "epoch": 2.963235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.663811882430282, "kl": 0.015591103583574295, "learning_rate": 5.865370794082558e-10, "loss": 0.0076, "num_tokens": 103419689.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.845828890800476, "sampling/importance_sampling_ratio/mean": 1.0002837181091309, "sampling/importance_sampling_ratio/min": 0.7399192452430725, "sampling/sampling_logp_difference/max": 0.6129283905029297, "sampling/sampling_logp_difference/mean": 0.011869165115058422, "step": 2418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 268.90625, "completions/mean_terminated_length": 268.90625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.538526177406311, "epoch": 2.9644607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.01859660298388912, "kl": 0.02467973902821541, "learning_rate": 5.525485459687007e-10, "loss": 0.0002, "num_tokens": 103454323.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4777252674102783, "sampling/importance_sampling_ratio/mean": 1.0002177953720093, "sampling/importance_sampling_ratio/min": 0.7340207099914551, "sampling/sampling_logp_difference/max": 0.39050400257110596, "sampling/sampling_logp_difference/mean": 0.017662692815065384, "step": 2419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3266.0, "completions/max_terminated_length": 3266.0, "completions/mean_length": 652.03125, "completions/mean_terminated_length": 652.03125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.4671842157840729, "epoch": 2.965686274509804, "frac_reward_zero_std": 0.5, "grad_norm": 0.5758205367649151, "kl": 0.016002994030714035, "learning_rate": 5.195738592145838e-10, "loss": -0.0261, "num_tokens": 103525925.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6088860034942627, "sampling/importance_sampling_ratio/mean": 1.0001780986785889, "sampling/importance_sampling_ratio/min": 0.5217247009277344, "sampling/sampling_logp_difference/max": 0.6506153345108032, "sampling/sampling_logp_difference/mean": 0.014098657295107841, "step": 2420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 410.328125, "completions/mean_terminated_length": 410.328125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.41944339871406555, "epoch": 2.9669117647058822, "frac_reward_zero_std": 0.75, "grad_norm": 0.5763386125906939, "kl": 0.02698737010359764, "learning_rate": 4.876130860825278e-10, "loss": -0.0049, "num_tokens": 103573130.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.804695963859558, "sampling/importance_sampling_ratio/mean": 0.9999831914901733, "sampling/importance_sampling_ratio/min": 0.47729650139808655, "sampling/sampling_logp_difference/max": 0.7396173477172852, "sampling/sampling_logp_difference/mean": 0.013173812068998814, "step": 2421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 396.953125, "completions/mean_terminated_length": 396.953125, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "entropy": 0.38728001713752747, "epoch": 2.968137254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.009335645742827066, "kl": 0.01070772111415863, "learning_rate": 4.566662914508579e-10, "loss": 0.0001, "num_tokens": 103615735.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.438698410987854, "sampling/importance_sampling_ratio/mean": 0.9996020197868347, "sampling/importance_sampling_ratio/min": 0.6852322816848755, "sampling/sampling_logp_difference/max": 0.37799739837646484, "sampling/sampling_logp_difference/mean": 0.011463727802038193, "step": 2422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 918.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 394.734375, "completions/mean_terminated_length": 394.734375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.3877512812614441, "epoch": 2.969362745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.580902459446398, "kl": 0.019785141572356224, "learning_rate": 4.267335381396564e-10, "loss": 0.0147, "num_tokens": 103665622.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.2986265420913696, "sampling/importance_sampling_ratio/mean": 0.9995995759963989, "sampling/importance_sampling_ratio/min": 0.575842022895813, "sampling/sampling_logp_difference/max": 0.5519219636917114, "sampling/sampling_logp_difference/mean": 0.01216872874647379, "step": 2423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 359.546875, "completions/mean_terminated_length": 359.546875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.5381530523300171, "epoch": 2.9705882352941178, "frac_reward_zero_std": 0.25, "grad_norm": 0.9342399225895607, "kl": 0.04863797873258591, "learning_rate": 3.978148869103748e-10, "loss": -0.0326, "num_tokens": 103707241.0, "reward": 0.15625, "reward_std": 0.6751632690429688, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.3785536289215088, "sampling/importance_sampling_ratio/mean": 1.000213384628296, "sampling/importance_sampling_ratio/min": 0.6056081652641296, "sampling/sampling_logp_difference/max": 0.5015220642089844, "sampling/sampling_logp_difference/mean": 0.015870394185185432, "step": 2424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 959.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 343.75, "completions/mean_terminated_length": 343.75, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.39134925603866577, "epoch": 2.971813725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.013076180789399807, "kl": 0.014393514953553677, "learning_rate": 3.699103964661665e-10, "loss": 0.0001, "num_tokens": 103761193.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3989907503128052, "sampling/importance_sampling_ratio/mean": 0.9995557069778442, "sampling/importance_sampling_ratio/min": 0.5497682094573975, "sampling/sampling_logp_difference/max": 0.5982584953308105, "sampling/sampling_logp_difference/mean": 0.013529048301279545, "step": 2425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 277.90625, "completions/mean_terminated_length": 277.90625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.4316456913948059, "epoch": 2.9730392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.014311635196568862, "kl": 0.02448146417737007, "learning_rate": 3.430201234513874e-10, "loss": 0.0002, "num_tokens": 103792563.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3889349699020386, "sampling/importance_sampling_ratio/mean": 0.9991505146026611, "sampling/importance_sampling_ratio/min": 0.31458115577697754, "sampling/sampling_logp_difference/max": 1.1565132141113281, "sampling/sampling_logp_difference/mean": 0.016002381220459938, "step": 2426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 310.15625, "completions/mean_terminated_length": 310.15625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.5005419254302979, "epoch": 2.974264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.02856323945088062, "kl": 0.03115483745932579, "learning_rate": 3.171441224514848e-10, "loss": 0.0003, "num_tokens": 103830813.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.327806830406189, "sampling/importance_sampling_ratio/mean": 1.0000410079956055, "sampling/importance_sampling_ratio/min": 0.6854231357574463, "sampling/sampling_logp_difference/max": 0.3777189254760742, "sampling/sampling_logp_difference/mean": 0.015531059354543686, "step": 2427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 324.265625, "completions/mean_terminated_length": 324.265625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.42049896717071533, "epoch": 2.9754901960784315, "frac_reward_zero_std": 0.5, "grad_norm": 1.254796679764665, "kl": 0.017740968614816666, "learning_rate": 2.922824459931639e-10, "loss": -0.0374, "num_tokens": 103872142.0, "reward": 0.0, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5685762166976929, "sampling/importance_sampling_ratio/mean": 1.0000388622283936, "sampling/importance_sampling_ratio/min": 0.6456836462020874, "sampling/sampling_logp_difference/max": 0.4501683712005615, "sampling/sampling_logp_difference/mean": 0.0129752978682518, "step": 2428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 326.4375, "completions/mean_terminated_length": 326.4375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.523917555809021, "epoch": 2.9767156862745097, "frac_reward_zero_std": 0.75, "grad_norm": 0.7736320844628303, "kl": 0.023589540272951126, "learning_rate": 2.684351445440547e-10, "loss": -0.016, "num_tokens": 103915994.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4283696413040161, "sampling/importance_sampling_ratio/mean": 0.9996488094329834, "sampling/importance_sampling_ratio/min": 0.5496720671653748, "sampling/sampling_logp_difference/max": 0.5984333753585815, "sampling/sampling_logp_difference/mean": 0.01530832052230835, "step": 2429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1156.0, "completions/max_terminated_length": 1156.0, "completions/mean_length": 377.140625, "completions/mean_terminated_length": 377.140625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.5998996496200562, "epoch": 2.9779411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 0.9083571688284148, "kl": 0.038709856569767, "learning_rate": 2.456022665127122e-10, "loss": 0.0212, "num_tokens": 103964259.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6507984399795532, "sampling/importance_sampling_ratio/mean": 0.9999931454658508, "sampling/importance_sampling_ratio/min": 0.5399987101554871, "sampling/sampling_logp_difference/max": 0.6161885261535645, "sampling/sampling_logp_difference/mean": 0.01765395700931549, "step": 2430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 5000.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 369.453125, "completions/mean_terminated_length": 295.952392578125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.4235979914665222, "epoch": 2.9791666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 0.4886477880634822, "kl": 0.026752591133117676, "learning_rate": 2.2378385824833866e-10, "loss": 0.4858, "num_tokens": 104008352.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5858746767044067, "sampling/importance_sampling_ratio/mean": 1.0001643896102905, "sampling/importance_sampling_ratio/min": 0.6429020166397095, "sampling/sampling_logp_difference/max": 0.4611361026763916, "sampling/sampling_logp_difference/mean": 0.013758325949311256, "step": 2431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1059.0, "completions/max_terminated_length": 1059.0, "completions/mean_length": 368.5625, "completions/mean_terminated_length": 368.5625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.5122309327125549, "epoch": 2.980392156862745, "frac_reward_zero_std": 0.75, "grad_norm": 0.7800376650465053, "kl": 0.02252240478992462, "learning_rate": 2.0297996404095018e-10, "loss": -0.0163, "num_tokens": 104049012.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.447912573814392, "sampling/importance_sampling_ratio/mean": 1.0000489950180054, "sampling/importance_sampling_ratio/min": 0.0311092771589756, "sampling/sampling_logp_difference/max": 3.4702491760253906, "sampling/sampling_logp_difference/mean": 0.015487036667764187, "step": 2432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1321.0, "completions/max_terminated_length": 1321.0, "completions/mean_length": 378.359375, "completions/mean_terminated_length": 378.359375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.48667195439338684, "epoch": 2.9816176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 0.5571890862263098, "kl": 0.01618039608001709, "learning_rate": 1.8319062612115467e-10, "loss": -0.0105, "num_tokens": 104091163.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.393373727798462, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 0.6890442371368408, "sampling/sampling_logp_difference/max": 0.3724498748779297, "sampling/sampling_logp_difference/mean": 0.014352986589074135, "step": 2433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 354.96875, "completions/mean_terminated_length": 354.96875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.36588239669799805, "epoch": 2.982843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.5128644307469641, "kl": 0.018944894894957542, "learning_rate": 1.6441588466009627e-10, "loss": 0.0113, "num_tokens": 104130457.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.4708377122879028, "sampling/importance_sampling_ratio/mean": 0.9997839331626892, "sampling/importance_sampling_ratio/min": 0.6068662405014038, "sampling/sampling_logp_difference/max": 0.4994468688964844, "sampling/sampling_logp_difference/mean": 0.012597290799021721, "step": 2434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 863.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 323.0625, "completions/mean_terminated_length": 323.0625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.5146043300628662, "epoch": 2.9840686274509802, "frac_reward_zero_std": 0.75, "grad_norm": 0.5796030123780048, "kl": 0.05912617966532707, "learning_rate": 1.4665577776923343e-10, "loss": 0.0123, "num_tokens": 104170461.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.486657977104187, "sampling/importance_sampling_ratio/mean": 1.0002340078353882, "sampling/importance_sampling_ratio/min": 0.617142379283905, "sampling/sampling_logp_difference/max": 0.48265552520751953, "sampling/sampling_logp_difference/mean": 0.015730474144220352, "step": 2435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1674.0, "completions/max_terminated_length": 1674.0, "completions/mean_length": 360.5, "completions/mean_terminated_length": 360.5, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.48123329877853394, "epoch": 2.985294117647059, "frac_reward_zero_std": 0.75, "grad_norm": 0.48694303780543663, "kl": 0.02737082540988922, "learning_rate": 1.2991034150050538e-10, "loss": 0.017, "num_tokens": 104209421.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.4277199506759644, "sampling/importance_sampling_ratio/mean": 1.0003471374511719, "sampling/importance_sampling_ratio/min": 0.5914263725280762, "sampling/sampling_logp_difference/max": 0.5252180099487305, "sampling/sampling_logp_difference/mean": 0.014916026033461094, "step": 2436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 390.78125, "completions/mean_terminated_length": 390.78125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.433618426322937, "epoch": 2.986519607843137, "frac_reward_zero_std": 0.5, "grad_norm": 0.7329219924909325, "kl": 0.020404143258929253, "learning_rate": 1.1417960984605457e-10, "loss": -0.0396, "num_tokens": 104250095.0, "reward": 0.3125, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.5744415521621704, "sampling/importance_sampling_ratio/mean": 1.00022292137146, "sampling/importance_sampling_ratio/min": 0.43439018726348877, "sampling/sampling_logp_difference/max": 0.8338121175765991, "sampling/sampling_logp_difference/mean": 0.013894622214138508, "step": 2437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 450.796875, "completions/mean_terminated_length": 450.796875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.4159921407699585, "epoch": 2.9877450980392157, "frac_reward_zero_std": 0.5, "grad_norm": 0.6295326062797157, "kl": 0.015380298718810081, "learning_rate": 9.946361473822662e-11, "loss": 0.0509, "num_tokens": 104298306.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.37203049659729, "sampling/importance_sampling_ratio/mean": 0.999995231628418, "sampling/importance_sampling_ratio/min": 0.6571097373962402, "sampling/sampling_logp_difference/max": 0.4199042320251465, "sampling/sampling_logp_difference/mean": 0.012551460415124893, "step": 2438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1584.0, "completions/max_terminated_length": 1584.0, "completions/mean_length": 488.75, "completions/mean_terminated_length": 488.75, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.4693135619163513, "epoch": 2.9889705882352944, "frac_reward_zero_std": 1.0, "grad_norm": 0.015506641818515876, "kl": 0.02453150786459446, "learning_rate": 8.576238604968144e-11, "loss": 0.0003, "num_tokens": 104349762.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.792182445526123, "sampling/importance_sampling_ratio/mean": 1.0001132488250732, "sampling/importance_sampling_ratio/min": 0.6258996725082397, "sampling/sampling_logp_difference/max": 0.5834341049194336, "sampling/sampling_logp_difference/mean": 0.013868569396436214, "step": 2439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 804.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 338.953125, "completions/mean_terminated_length": 338.953125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.4967162013053894, "epoch": 2.9901960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 0.47505557633957723, "kl": 0.04145974665880203, "learning_rate": 7.307595159300461e-11, "loss": 0.0009, "num_tokens": 104391359.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.4034730195999146, "sampling/importance_sampling_ratio/mean": 0.9998819828033447, "sampling/importance_sampling_ratio/min": 0.6622384190559387, "sampling/sampling_logp_difference/max": 0.41212964057922363, "sampling/sampling_logp_difference/mean": 0.015240881592035294, "step": 2440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2398.0, "completions/max_terminated_length": 2398.0, "completions/mean_length": 544.015625, "completions/mean_terminated_length": 544.015625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.5080316662788391, "epoch": 2.991421568627451, "frac_reward_zero_std": 0.5, "grad_norm": 0.5701131206155765, "kl": 0.022138230502605438, "learning_rate": 6.140433712076287e-11, "loss": -0.0202, "num_tokens": 104447088.0, "reward": 0.21875, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.5643929243087769, "sampling/importance_sampling_ratio/mean": 1.000231146812439, "sampling/importance_sampling_ratio/min": 0.5799168348312378, "sampling/sampling_logp_difference/max": 0.5448706150054932, "sampling/sampling_logp_difference/mean": 0.014064192771911621, "step": 2441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1029.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 368.640625, "completions/mean_terminated_length": 368.640625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.5020058155059814, "epoch": 2.9926470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.6260493561258109, "kl": 0.04164016991853714, "learning_rate": 5.074756632572619e-11, "loss": -0.0329, "num_tokens": 104487737.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.4676384925842285, "sampling/importance_sampling_ratio/mean": 1.0002955198287964, "sampling/importance_sampling_ratio/min": 0.6905480623245239, "sampling/sampling_logp_difference/max": 0.3836545944213867, "sampling/sampling_logp_difference/mean": 0.015381896868348122, "step": 2442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1439.0, "completions/max_terminated_length": 1439.0, "completions/mean_length": 407.21875, "completions/mean_terminated_length": 407.21875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.4917651116847992, "epoch": 2.993872549019608, "frac_reward_zero_std": 0.75, "grad_norm": 0.5483114678875235, "kl": 0.012788360938429832, "learning_rate": 4.110566084036815e-11, "loss": 0.0072, "num_tokens": 104530583.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4725840091705322, "sampling/importance_sampling_ratio/mean": 0.9994935393333435, "sampling/importance_sampling_ratio/min": 0.6284888386726379, "sampling/sampling_logp_difference/max": 0.46443700790405273, "sampling/sampling_logp_difference/mean": 0.0151821319013834, "step": 2443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 353.578125, "completions/mean_terminated_length": 353.578125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.5791712403297424, "epoch": 2.9950980392156863, "frac_reward_zero_std": 0.25, "grad_norm": 1.0715757852338859, "kl": 0.02879153937101364, "learning_rate": 3.247864023719904e-11, "loss": -0.0114, "num_tokens": 104569052.0, "reward": 0.8125, "reward_std": 0.5123475193977356, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.5531516075134277, "sampling/importance_sampling_ratio/mean": 0.9995863437652588, "sampling/importance_sampling_ratio/min": 0.6652275323867798, "sampling/sampling_logp_difference/max": 0.44028615951538086, "sampling/sampling_logp_difference/mean": 0.016299044713377953, "step": 2444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 230.765625, "completions/mean_terminated_length": 230.765625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.4961737394332886, "epoch": 2.9963235294117645, "frac_reward_zero_std": 0.75, "grad_norm": 0.838834466439169, "kl": 0.03988753631711006, "learning_rate": 2.4866522028488268e-11, "loss": -0.0002, "num_tokens": 104600653.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.445585012435913, "sampling/importance_sampling_ratio/mean": 1.0002175569534302, "sampling/importance_sampling_ratio/min": 0.6874000430107117, "sampling/sampling_logp_difference/max": 0.37483882904052734, "sampling/sampling_logp_difference/mean": 0.016711801290512085, "step": 2445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 263.9375, "completions/mean_terminated_length": 263.9375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.60142982006073, "epoch": 2.997549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 0.7612344483823126, "kl": 0.04584861546754837, "learning_rate": 1.8269321666375403e-11, "loss": 0.046, "num_tokens": 104634793.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.434432864189148, "sampling/importance_sampling_ratio/mean": 1.000119924545288, "sampling/importance_sampling_ratio/min": 0.487101674079895, "sampling/sampling_logp_difference/max": 0.7192823886871338, "sampling/sampling_logp_difference/mean": 0.018483776599168777, "step": 2446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 323.921875, "completions/mean_terminated_length": 323.921875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.5273995995521545, "epoch": 2.998774509803922, "frac_reward_zero_std": 1.0, "grad_norm": 0.014869506683263445, "kl": 0.023216627538204193, "learning_rate": 1.2687052542759147e-11, "loss": 0.0002, "num_tokens": 104674372.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4924073219299316, "sampling/importance_sampling_ratio/mean": 1.0001485347747803, "sampling/importance_sampling_ratio/min": 0.5986575484275818, "sampling/sampling_logp_difference/max": 0.5130655765533447, "sampling/sampling_logp_difference/mean": 0.017101943492889404, "step": 2447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 380.09375, "completions/mean_terminated_length": 380.09375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.4392942190170288, "epoch": 3.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.012818967908857438, "kl": 0.01786920800805092, "learning_rate": 8.119725989241822e-12, "loss": 0.0002, "num_tokens": 104713322.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.393803358078003, "sampling/importance_sampling_ratio/mean": 1.0000274181365967, "sampling/importance_sampling_ratio/min": 0.6583831906318665, "sampling/sampling_logp_difference/max": 0.41796815395355225, "sampling/sampling_logp_difference/mean": 0.01417489256709814, "step": 2448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1246.0, "completions/max_terminated_length": 1246.0, "completions/mean_length": 451.375, "completions/mean_terminated_length": 451.375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.48131370544433594, "epoch": 3.0012254901960786, "frac_reward_zero_std": 0.5, "grad_norm": 0.6817050967535603, "kl": 0.0274809543043375, "learning_rate": 4.5673512772959055e-12, "loss": 0.0562, "num_tokens": 104763442.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6322232484817505, "sampling/importance_sampling_ratio/mean": 1.0003581047058105, "sampling/importance_sampling_ratio/min": 0.4455420970916748, "sampling/sampling_logp_difference/max": 0.8084635734558105, "sampling/sampling_logp_difference/mean": 0.015051858499646187, "step": 2449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 241.171875, "completions/mean_terminated_length": 241.171875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.5539578199386597, "epoch": 3.002450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.8726634199554918, "kl": 0.038823604583740234, "learning_rate": 2.0299356179309666e-12, "loss": 0.0176, "num_tokens": 104802493.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.9951317310333252, "sampling/importance_sampling_ratio/mean": 0.9996297955513, "sampling/importance_sampling_ratio/min": 0.5364373922348022, "sampling/sampling_logp_difference/max": 0.6907100677490234, "sampling/sampling_logp_difference/mean": 0.01797829009592533, "step": 2450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 324.921875, "completions/mean_terminated_length": 324.921875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.40754568576812744, "epoch": 3.0036764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.010643639005979226, "kl": 0.015995265915989876, "learning_rate": 5.074841620267278e-13, "loss": 0.0002, "num_tokens": 104838568.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4191851615905762, "sampling/importance_sampling_ratio/mean": 1.0001311302185059, "sampling/importance_sampling_ratio/min": 0.5588198900222778, "sampling/sampling_logp_difference/max": 0.581928014755249, "sampling/sampling_logp_difference/mean": 0.012411881238222122, "step": 2451 } ], "logging_steps": 1, "max_steps": 2451, "num_input_tokens_seen": 104838568, "num_train_epochs": 4, "save_steps": 817, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }