{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.107095046854083, "eval_steps": 500, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 150.625, "completions/mean_terminated_length": 150.625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.6267418265342712, "epoch": 0.0002677376171352075, "frac_reward_zero_std": 0.0, "grad_norm": 3.4215750694274902, "learning_rate": 3e-06, "loss": 0.5336, "num_tokens": 2381.0, "reward": 0.5625, "reward_std": 0.875, "rewards/reward_fn/mean": 0.5625, "rewards/reward_fn/std": 0.8210403323173523, "sampling/importance_sampling_ratio/max": 1.315043568611145, "sampling/importance_sampling_ratio/mean": 1.0001176595687866, "sampling/importance_sampling_ratio/min": 0.7038920521736145, "sampling/sampling_logp_difference/max": 0.35113024711608887, "sampling/sampling_logp_difference/mean": 0.013675408437848091, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 82.5, "completions/mean_terminated_length": 82.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.517684455960989, "epoch": 0.000535475234270415, "frac_reward_zero_std": 0.5, "grad_norm": 2.49469256401062, "learning_rate": 2.994e-06, "loss": -0.0037, "num_tokens": 4217.0, "reward": -0.375, "reward_std": 0.4330126941204071, "rewards/reward_fn/mean": -0.375, "rewards/reward_fn/std": 0.5824823975563049, "sampling/importance_sampling_ratio/max": 1.1338849067687988, "sampling/importance_sampling_ratio/mean": 0.9985398650169373, "sampling/importance_sampling_ratio/min": 0.7438814640045166, "sampling/sampling_logp_difference/max": 0.29587364196777344, "sampling/sampling_logp_difference/mean": 0.0117102125659585, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 94.875, "completions/mean_terminated_length": 94.875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.6537462696433067, "epoch": 0.0008032128514056225, "frac_reward_zero_std": 0.0, "grad_norm": 2.606494903564453, "learning_rate": 2.988e-06, "loss": -0.1565, "num_tokens": 6148.0, "reward": -0.0625, "reward_std": 0.9062550067901611, "rewards/reward_fn/mean": -0.0625, "rewards/reward_fn/std": 0.9038608074188232, "sampling/importance_sampling_ratio/max": 1.2161964178085327, "sampling/importance_sampling_ratio/mean": 0.9991714358329773, "sampling/importance_sampling_ratio/min": 0.6878055930137634, "sampling/sampling_logp_difference/max": 0.3742489814758301, "sampling/sampling_logp_difference/mean": 0.010562321171164513, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 69.25, "completions/mean_terminated_length": 69.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.650386281311512, "epoch": 0.00107095046854083, "frac_reward_zero_std": 0.0, "grad_norm": 4.485692501068115, "learning_rate": 2.982e-06, "loss": -0.3087, "num_tokens": 7806.0, "reward": -0.25, "reward_std": 0.6403881907463074, "rewards/reward_fn/mean": -0.25, "rewards/reward_fn/std": 0.8017837405204773, "sampling/importance_sampling_ratio/max": 1.3703899383544922, "sampling/importance_sampling_ratio/mean": 1.0018362998962402, "sampling/importance_sampling_ratio/min": 0.7810623049736023, "sampling/sampling_logp_difference/max": 0.3150954246520996, "sampling/sampling_logp_difference/mean": 0.013195307925343513, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 94.0, "completions/mean_terminated_length": 94.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.505305927246809, "epoch": 0.0013386880856760374, "frac_reward_zero_std": 0.0, "grad_norm": 3.150624990463257, "learning_rate": 2.976e-06, "loss": 0.1292, "num_tokens": 9742.0, "reward": 0.16249999403953552, "reward_std": 0.8606266975402832, "rewards/reward_fn/mean": 0.16249999403953552, "rewards/reward_fn/std": 0.8601287603378296, "sampling/importance_sampling_ratio/max": 1.2443231344223022, "sampling/importance_sampling_ratio/mean": 1.000427007675171, "sampling/importance_sampling_ratio/min": 0.8292719125747681, "sampling/sampling_logp_difference/max": 0.21859169006347656, "sampling/sampling_logp_difference/mean": 0.011499120853841305, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 128.5, "completions/mean_terminated_length": 128.5, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.501326372846961, "epoch": 0.001606425702811245, "frac_reward_zero_std": 0.0, "grad_norm": 3.072824001312256, "learning_rate": 2.97e-06, "loss": 0.0425, "num_tokens": 12098.0, "reward": 0.5, "reward_std": 0.6403881907463074, "rewards/reward_fn/mean": 0.5, "rewards/reward_fn/std": 0.8017837405204773, "sampling/importance_sampling_ratio/max": 1.3843379020690918, "sampling/importance_sampling_ratio/mean": 0.9997293949127197, "sampling/importance_sampling_ratio/min": 0.799883246421814, "sampling/sampling_logp_difference/max": 0.3252220153808594, "sampling/sampling_logp_difference/mean": 0.01259581744670868, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 113.75, "completions/mean_terminated_length": 113.75, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.5197930932044983, "epoch": 0.0018741633199464524, "frac_reward_zero_std": 0.0, "grad_norm": 2.8480682373046875, "learning_rate": 2.964e-06, "loss": -0.1574, "num_tokens": 14180.0, "reward": 0.15625, "reward_std": 0.7340351343154907, "rewards/reward_fn/mean": 0.15625, "rewards/reward_fn/std": 0.7432734370231628, "sampling/importance_sampling_ratio/max": 1.201647162437439, "sampling/importance_sampling_ratio/mean": 0.998772382736206, "sampling/importance_sampling_ratio/min": 0.7077836394309998, "sampling/sampling_logp_difference/max": 0.34561681747436523, "sampling/sampling_logp_difference/mean": 0.015118096955120564, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 92.0, "completions/mean_terminated_length": 92.0, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.43194472044706345, "epoch": 0.00214190093708166, "frac_reward_zero_std": 0.0, "grad_norm": 3.927507162094116, "learning_rate": 2.958e-06, "loss": 0.3579, "num_tokens": 16108.0, "reward": 0.5625, "reward_std": 0.7285534143447876, "rewards/reward_fn/mean": 0.5625, "rewards/reward_fn/std": 0.6781013607978821, "sampling/importance_sampling_ratio/max": 1.1596660614013672, "sampling/importance_sampling_ratio/mean": 0.9985054731369019, "sampling/importance_sampling_ratio/min": 0.6453264355659485, "sampling/sampling_logp_difference/max": 0.43799901008605957, "sampling/sampling_logp_difference/mean": 0.01435327809303999, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 112.875, "completions/mean_terminated_length": 112.875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.5163675956428051, "epoch": 0.0024096385542168677, "frac_reward_zero_std": 0.0, "grad_norm": 3.3013086318969727, "learning_rate": 2.952e-06, "loss": 0.3342, "num_tokens": 18067.0, "reward": 0.0, "reward_std": 0.7318945527076721, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.744023859500885, "sampling/importance_sampling_ratio/max": 1.279534935951233, "sampling/importance_sampling_ratio/mean": 1.002169132232666, "sampling/importance_sampling_ratio/min": 0.8131555318832397, "sampling/sampling_logp_difference/max": 0.24649667739868164, "sampling/sampling_logp_difference/mean": 0.011817368678748608, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 125.5, "completions/mean_terminated_length": 125.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.595271673053503, "epoch": 0.002677376171352075, "frac_reward_zero_std": 0.5, "grad_norm": 2.944575309753418, "learning_rate": 2.946e-06, "loss": -0.0954, "num_tokens": 20399.0, "reward": 0.34375, "reward_std": 0.1875, "rewards/reward_fn/mean": 0.34375, "rewards/reward_fn/std": 0.7432734370231628, "sampling/importance_sampling_ratio/max": 1.278196096420288, "sampling/importance_sampling_ratio/mean": 0.999613881111145, "sampling/importance_sampling_ratio/min": 0.7013292908668518, "sampling/sampling_logp_difference/max": 0.35477781295776367, "sampling/sampling_logp_difference/mean": 0.015448040328919888, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 134.5, "completions/mean_terminated_length": 134.5, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.5094028487801552, "epoch": 0.0029451137884872825, "frac_reward_zero_std": 0.0, "grad_norm": 4.294503211975098, "learning_rate": 2.9400000000000002e-06, "loss": 0.0713, "num_tokens": 22883.0, "reward": 0.625, "reward_std": 0.75, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 0.6943650841712952, "sampling/importance_sampling_ratio/max": 1.3651939630508423, "sampling/importance_sampling_ratio/mean": 0.9996961951255798, "sampling/importance_sampling_ratio/min": 0.7285423874855042, "sampling/sampling_logp_difference/max": 0.3167095184326172, "sampling/sampling_logp_difference/mean": 0.013134236447513103, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 104.625, "completions/mean_terminated_length": 104.625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.2925115432590246, "epoch": 0.00321285140562249, "frac_reward_zero_std": 0.5, "grad_norm": 4.128819465637207, "learning_rate": 2.934e-06, "loss": 0.0039, "num_tokens": 24812.0, "reward": 0.90625, "reward_std": 0.1875, "rewards/reward_fn/mean": 0.90625, "rewards/reward_fn/std": 0.2651650309562683, "sampling/importance_sampling_ratio/max": 1.2545214891433716, "sampling/importance_sampling_ratio/mean": 0.9988386034965515, "sampling/importance_sampling_ratio/min": 0.5556535720825195, "sampling/sampling_logp_difference/max": 0.5876102447509766, "sampling/sampling_logp_difference/mean": 0.011243902146816254, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 149.625, "completions/mean_terminated_length": 149.625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.32133930921554565, "epoch": 0.0034805890227576973, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.928e-06, "loss": 0.0, "num_tokens": 27361.0, "reward": 0.25, "reward_std": 0.0, "rewards/reward_fn/mean": 0.25, "rewards/reward_fn/std": 0.8017837405204773, "sampling/importance_sampling_ratio/max": 1.8010644912719727, "sampling/importance_sampling_ratio/mean": 1.0008350610733032, "sampling/importance_sampling_ratio/min": 0.8277317881584167, "sampling/sampling_logp_difference/max": 0.588377833366394, "sampling/sampling_logp_difference/mean": 0.009649225510656834, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 92.125, "completions/mean_terminated_length": 92.125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.35989174898713827, "epoch": 0.003748326639892905, "frac_reward_zero_std": 0.0, "grad_norm": 6.6780900955200195, "learning_rate": 2.922e-06, "loss": -0.1952, "num_tokens": 29290.0, "reward": 0.16249999403953552, "reward_std": 0.921846330165863, "rewards/reward_fn/mean": 0.16249999403953552, "rewards/reward_fn/std": 0.8601287603378296, "sampling/importance_sampling_ratio/max": 1.534264326095581, "sampling/importance_sampling_ratio/mean": 1.0005788803100586, "sampling/importance_sampling_ratio/min": 0.6528303623199463, "sampling/sampling_logp_difference/max": 0.4280509948730469, "sampling/sampling_logp_difference/mean": 0.013318805955350399, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1965.0, "completions/max_terminated_length": 1965.0, "completions/mean_length": 366.5, "completions/mean_terminated_length": 366.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.2599536660127342, "epoch": 0.004016064257028112, "frac_reward_zero_std": 0.0, "grad_norm": 0.9902627468109131, "learning_rate": 2.916e-06, "loss": 0.8216, "num_tokens": 33454.0, "reward": 0.4375, "reward_std": 0.8080127239227295, "rewards/reward_fn/mean": 0.4375, "rewards/reward_fn/std": 0.7763237953186035, "sampling/importance_sampling_ratio/max": 1.2708953619003296, "sampling/importance_sampling_ratio/mean": 1.0001496076583862, "sampling/importance_sampling_ratio/min": 0.6887672543525696, "sampling/sampling_logp_difference/max": 0.3728518486022949, "sampling/sampling_logp_difference/mean": 0.003686662996187806, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 152.5, "completions/mean_terminated_length": 152.5, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.41192612797021866, "epoch": 0.00428380187416332, "frac_reward_zero_std": 0.5, "grad_norm": 4.546972751617432, "learning_rate": 2.91e-06, "loss": 0.0223, "num_tokens": 35954.0, "reward": -0.3125, "reward_std": 0.375, "rewards/reward_fn/mean": -0.3125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.001716136932373, "sampling/importance_sampling_ratio/min": 0.6547219753265381, "sampling/sampling_logp_difference/max": 1.000260829925537, "sampling/sampling_logp_difference/mean": 0.017907731235027313, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 109.625, "completions/mean_terminated_length": 109.625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.29076643474400043, "epoch": 0.004551539491298527, "frac_reward_zero_std": 0.5, "grad_norm": 3.674966812133789, "learning_rate": 2.904e-06, "loss": -0.0778, "num_tokens": 38083.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 0.7071067690849304, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0025073289871216, "sampling/importance_sampling_ratio/min": 0.6131579875946045, "sampling/sampling_logp_difference/max": 0.8842039108276367, "sampling/sampling_logp_difference/mean": 0.014615166001021862, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 82.875, "completions/mean_terminated_length": 82.875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.2971731638535857, "epoch": 0.004819277108433735, "frac_reward_zero_std": 0.0, "grad_norm": 5.08901309967041, "learning_rate": 2.898e-06, "loss": 0.4084, "num_tokens": 39862.0, "reward": 0.1875, "reward_std": 0.9484008550643921, "rewards/reward_fn/mean": 0.1875, "rewards/reward_fn/std": 0.883883535861969, "sampling/importance_sampling_ratio/max": 1.3198846578598022, "sampling/importance_sampling_ratio/mean": 1.0004154443740845, "sampling/importance_sampling_ratio/min": 0.7257490754127502, "sampling/sampling_logp_difference/max": 0.32055091857910156, "sampling/sampling_logp_difference/mean": 0.01244804635643959, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 82.375, "completions/mean_terminated_length": 82.375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 0.30348968878388405, "epoch": 0.0050870147255689425, "frac_reward_zero_std": 0.5, "grad_norm": 6.898182392120361, "learning_rate": 2.892e-06, "loss": 0.2766, "num_tokens": 41893.0, "reward": 0.375, "reward_std": 0.3061862289905548, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.7791936993598938, "sampling/importance_sampling_ratio/max": 1.2820645570755005, "sampling/importance_sampling_ratio/mean": 0.9992855191230774, "sampling/importance_sampling_ratio/min": 0.7099716663360596, "sampling/sampling_logp_difference/max": 0.3425302505493164, "sampling/sampling_logp_difference/mean": 0.01685711182653904, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 87.75, "completions/mean_terminated_length": 87.75, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.5225945934653282, "epoch": 0.00535475234270415, "frac_reward_zero_std": 0.0, "grad_norm": 9.742513656616211, "learning_rate": 2.886e-06, "loss": 0.2378, "num_tokens": 44043.0, "reward": -0.1875, "reward_std": 0.5580127239227295, "rewards/reward_fn/mean": -0.1875, "rewards/reward_fn/std": 0.7529703378677368, "sampling/importance_sampling_ratio/max": 1.8927215337753296, "sampling/importance_sampling_ratio/mean": 1.0019861459732056, "sampling/importance_sampling_ratio/min": 0.631376326084137, "sampling/sampling_logp_difference/max": 0.6380157470703125, "sampling/sampling_logp_difference/mean": 0.02639644406735897, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 205.375, "completions/mean_terminated_length": 205.375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.42385215125977993, "epoch": 0.005622489959839358, "frac_reward_zero_std": 0.5, "grad_norm": 3.6943769454956055, "learning_rate": 2.88e-06, "loss": 0.7809, "num_tokens": 47086.0, "reward": 0.1875, "reward_std": 0.125, "rewards/reward_fn/mean": 0.1875, "rewards/reward_fn/std": 0.883883535861969, "sampling/importance_sampling_ratio/max": 1.3941782712936401, "sampling/importance_sampling_ratio/mean": 1.0003106594085693, "sampling/importance_sampling_ratio/min": 0.5512010455131531, "sampling/sampling_logp_difference/max": 0.5956556797027588, "sampling/sampling_logp_difference/mean": 0.01745516248047352, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 140.125, "completions/mean_terminated_length": 140.125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.2805237462744117, "epoch": 0.005890227576974565, "frac_reward_zero_std": 0.0, "grad_norm": 5.563684940338135, "learning_rate": 2.874e-06, "loss": 0.1792, "num_tokens": 49351.0, "reward": 0.125, "reward_std": 0.8482423424720764, "rewards/reward_fn/mean": 0.125, "rewards/reward_fn/std": 0.9543135166168213, "sampling/importance_sampling_ratio/max": 1.6055837869644165, "sampling/importance_sampling_ratio/mean": 1.0001506805419922, "sampling/importance_sampling_ratio/min": 0.5748274326324463, "sampling/sampling_logp_difference/max": 0.5536854267120361, "sampling/sampling_logp_difference/mean": 0.01618441566824913, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 84.25, "completions/mean_terminated_length": 84.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.2759034838527441, "epoch": 0.006157965194109772, "frac_reward_zero_std": 0.0, "grad_norm": 7.797671794891357, "learning_rate": 2.868e-06, "loss": -0.088, "num_tokens": 51289.0, "reward": 0.07499998807907104, "reward_std": 0.9020542502403259, "rewards/reward_fn/mean": 0.07499998807907104, "rewards/reward_fn/std": 0.9051440954208374, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0022414922714233, "sampling/importance_sampling_ratio/min": 0.6872978806495667, "sampling/sampling_logp_difference/max": 0.7279520034790039, "sampling/sampling_logp_difference/mean": 0.019796477630734444, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 100.0, "completions/mean_terminated_length": 100.0, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.27160007879137993, "epoch": 0.00642570281124498, "frac_reward_zero_std": 0.0, "grad_norm": 8.578551292419434, "learning_rate": 2.862e-06, "loss": 0.0706, "num_tokens": 53221.0, "reward": 0.1875, "reward_std": 0.875, "rewards/reward_fn/mean": 0.1875, "rewards/reward_fn/std": 0.883883535861969, "sampling/importance_sampling_ratio/max": 1.7799687385559082, "sampling/importance_sampling_ratio/mean": 0.9976699352264404, "sampling/importance_sampling_ratio/min": 0.620381772518158, "sampling/sampling_logp_difference/max": 0.5765957832336426, "sampling/sampling_logp_difference/mean": 0.014642905443906784, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 174.0, "completions/mean_terminated_length": 174.0, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.3051988519728184, "epoch": 0.006693440428380187, "frac_reward_zero_std": 0.0, "grad_norm": 2.9776344299316406, "learning_rate": 2.856e-06, "loss": 0.0892, "num_tokens": 55969.0, "reward": -0.125, "reward_std": 0.64433753490448, "rewards/reward_fn/mean": -0.125, "rewards/reward_fn/std": 0.9543135166168213, "sampling/importance_sampling_ratio/max": 1.2057379484176636, "sampling/importance_sampling_ratio/mean": 0.9998922944068909, "sampling/importance_sampling_ratio/min": 0.38125699758529663, "sampling/sampling_logp_difference/max": 0.9642816185951233, "sampling/sampling_logp_difference/mean": 0.014070211909711361, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 107.25, "completions/mean_terminated_length": 107.25, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.2979002930223942, "epoch": 0.0069611780455153946, "frac_reward_zero_std": 0.0, "grad_norm": 8.11337947845459, "learning_rate": 2.85e-06, "loss": 0.1724, "num_tokens": 58075.0, "reward": 0.4375, "reward_std": 0.8080127239227295, "rewards/reward_fn/mean": 0.4375, "rewards/reward_fn/std": 0.7763237953186035, "sampling/importance_sampling_ratio/max": 1.583893060684204, "sampling/importance_sampling_ratio/mean": 1.0014485120773315, "sampling/importance_sampling_ratio/min": 0.5956977605819702, "sampling/sampling_logp_difference/max": 0.518021821975708, "sampling/sampling_logp_difference/mean": 0.02049558237195015, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1052.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 221.875, "completions/mean_terminated_length": 221.875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.5073911547660828, "epoch": 0.007228915662650603, "frac_reward_zero_std": 0.5, "grad_norm": 2.764246702194214, "learning_rate": 2.844e-06, "loss": 0.5914, "num_tokens": 61214.0, "reward": -0.46875, "reward_std": 0.2576940953731537, "rewards/reward_fn/mean": -0.46875, "rewards/reward_fn/std": 0.33905068039894104, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0010087490081787, "sampling/importance_sampling_ratio/min": 0.5527787208557129, "sampling/sampling_logp_difference/max": 0.7732399702072144, "sampling/sampling_logp_difference/mean": 0.017739886417984962, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 184.25, "completions/mean_terminated_length": 184.25, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.18406004644930363, "epoch": 0.00749665327978581, "frac_reward_zero_std": 0.0, "grad_norm": 4.100404262542725, "learning_rate": 2.8379999999999998e-06, "loss": 0.3547, "num_tokens": 63916.0, "reward": 0.4375, "reward_std": 0.7023502588272095, "rewards/reward_fn/mean": 0.4375, "rewards/reward_fn/std": 0.9038608074188232, "sampling/importance_sampling_ratio/max": 1.4564456939697266, "sampling/importance_sampling_ratio/mean": 1.0008630752563477, "sampling/importance_sampling_ratio/min": 0.6782644987106323, "sampling/sampling_logp_difference/max": 0.3882179260253906, "sampling/sampling_logp_difference/mean": 0.010364357382059097, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 123.625, "completions/mean_terminated_length": 123.625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 0.220602598041296, "epoch": 0.007764390896921017, "frac_reward_zero_std": 0.0, "grad_norm": 6.565753936767578, "learning_rate": 2.8319999999999997e-06, "loss": 0.4192, "num_tokens": 66257.0, "reward": 0.3499999940395355, "reward_std": 0.9062018990516663, "rewards/reward_fn/mean": 0.3499999940395355, "rewards/reward_fn/std": 0.8585702776908875, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0027188062667847, "sampling/importance_sampling_ratio/min": 0.6068937182426453, "sampling/sampling_logp_difference/max": 0.9231102466583252, "sampling/sampling_logp_difference/mean": 0.012978318147361279, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 100.625, "completions/mean_terminated_length": 100.625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.19841119088232517, "epoch": 0.008032128514056224, "frac_reward_zero_std": 0.0, "grad_norm": 5.201574325561523, "learning_rate": 2.8259999999999997e-06, "loss": -0.3327, "num_tokens": 68158.0, "reward": 0.6000000238418579, "reward_std": 0.48301270604133606, "rewards/reward_fn/mean": 0.6000000238418579, "rewards/reward_fn/std": 0.6824326515197754, "sampling/importance_sampling_ratio/max": 1.433454990386963, "sampling/importance_sampling_ratio/mean": 0.9981546998023987, "sampling/importance_sampling_ratio/min": 0.6369554996490479, "sampling/sampling_logp_difference/max": 0.45105552673339844, "sampling/sampling_logp_difference/mean": 0.014056000858545303, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 194.375, "completions/mean_terminated_length": 194.375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.47254959121346474, "epoch": 0.008299866131191432, "frac_reward_zero_std": 0.0, "grad_norm": 3.896742820739746, "learning_rate": 2.82e-06, "loss": 0.506, "num_tokens": 70773.0, "reward": -0.30000001192092896, "reward_std": 0.5196152329444885, "rewards/reward_fn/mean": -0.30000001192092896, "rewards/reward_fn/std": 0.7131419777870178, "sampling/importance_sampling_ratio/max": 1.444220781326294, "sampling/importance_sampling_ratio/mean": 0.9993738532066345, "sampling/importance_sampling_ratio/min": 0.5432922840118408, "sampling/sampling_logp_difference/max": 0.6101078987121582, "sampling/sampling_logp_difference/mean": 0.016888868063688278, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 130.25, "completions/mean_terminated_length": 130.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.27014038152992725, "epoch": 0.00856760374832664, "frac_reward_zero_std": 0.5, "grad_norm": 3.3009936809539795, "learning_rate": 2.814e-06, "loss": 0.3798, "num_tokens": 72931.0, "reward": 0.5625, "reward_std": 0.5153881907463074, "rewards/reward_fn/mean": 0.5625, "rewards/reward_fn/std": 0.8210403323173523, "sampling/importance_sampling_ratio/max": 1.4877067804336548, "sampling/importance_sampling_ratio/mean": 1.0000128746032715, "sampling/importance_sampling_ratio/min": 0.5045390129089355, "sampling/sampling_logp_difference/max": 0.684110164642334, "sampling/sampling_logp_difference/mean": 0.013844412751495838, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 84.0, "completions/mean_terminated_length": 84.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.24295945279300213, "epoch": 0.008835341365461847, "frac_reward_zero_std": 0.0, "grad_norm": 6.447144508361816, "learning_rate": 2.808e-06, "loss": -0.0342, "num_tokens": 74807.0, "reward": 0.1875, "reward_std": 0.7285534143447876, "rewards/reward_fn/mean": 0.1875, "rewards/reward_fn/std": 0.7529703378677368, "sampling/importance_sampling_ratio/max": 1.602846384048462, "sampling/importance_sampling_ratio/mean": 0.9980352520942688, "sampling/importance_sampling_ratio/min": 0.33858928084373474, "sampling/sampling_logp_difference/max": 1.0829675197601318, "sampling/sampling_logp_difference/mean": 0.01759566366672516, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 145.75, "completions/mean_terminated_length": 145.75, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.19305756781250238, "epoch": 0.009103078982597055, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.802e-06, "loss": 0.0, "num_tokens": 77117.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4088163375854492, "sampling/importance_sampling_ratio/mean": 0.9993428587913513, "sampling/importance_sampling_ratio/min": 0.7875460386276245, "sampling/sampling_logp_difference/max": 0.34274983406066895, "sampling/sampling_logp_difference/mean": 0.007951664738357067, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 94.125, "completions/mean_terminated_length": 94.125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.2284493986517191, "epoch": 0.009370816599732263, "frac_reward_zero_std": 0.0, "grad_norm": 5.592118740081787, "learning_rate": 2.7960000000000004e-06, "loss": 0.1393, "num_tokens": 78958.0, "reward": 0.0625, "reward_std": 0.9718236923217773, "rewards/reward_fn/mean": 0.0625, "rewards/reward_fn/std": 0.9038608074188232, "sampling/importance_sampling_ratio/max": 1.301113247871399, "sampling/importance_sampling_ratio/mean": 0.9990594387054443, "sampling/importance_sampling_ratio/min": 0.6874405741691589, "sampling/sampling_logp_difference/max": 0.37477993965148926, "sampling/sampling_logp_difference/mean": 0.010045681148767471, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 88.5, "completions/mean_terminated_length": 88.5, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.2426813654601574, "epoch": 0.00963855421686747, "frac_reward_zero_std": 0.5, "grad_norm": 2.471020460128784, "learning_rate": 2.7900000000000004e-06, "loss": -0.0096, "num_tokens": 80958.0, "reward": 0.9750000238418579, "reward_std": 0.05000000074505806, "rewards/reward_fn/mean": 0.9750000238418579, "rewards/reward_fn/std": 0.0707106739282608, "sampling/importance_sampling_ratio/max": 1.3818279504776, "sampling/importance_sampling_ratio/mean": 1.0000183582305908, "sampling/importance_sampling_ratio/min": 0.6580262780189514, "sampling/sampling_logp_difference/max": 0.41851043701171875, "sampling/sampling_logp_difference/mean": 0.013279739767313004, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 72.75, "completions/mean_terminated_length": 72.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.205844821408391, "epoch": 0.009906291834002677, "frac_reward_zero_std": 0.5, "grad_norm": 8.913527488708496, "learning_rate": 2.7840000000000004e-06, "loss": 0.0098, "num_tokens": 82656.0, "reward": 0.34375, "reward_std": 0.1875, "rewards/reward_fn/mean": 0.34375, "rewards/reward_fn/std": 0.7432734370231628, "sampling/importance_sampling_ratio/max": 1.51563560962677, "sampling/importance_sampling_ratio/mean": 1.001777172088623, "sampling/importance_sampling_ratio/min": 0.5709339380264282, "sampling/sampling_logp_difference/max": 0.5604817867279053, "sampling/sampling_logp_difference/mean": 0.01768590696156025, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 72.75, "completions/mean_terminated_length": 72.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.2590854074805975, "epoch": 0.010174029451137885, "frac_reward_zero_std": 0.5, "grad_norm": 3.992624044418335, "learning_rate": 2.7780000000000003e-06, "loss": -0.0761, "num_tokens": 84498.0, "reward": 0.4375, "reward_std": 0.375, "rewards/reward_fn/mean": 0.4375, "rewards/reward_fn/std": 0.7763237953186035, "sampling/importance_sampling_ratio/max": 1.4400634765625, "sampling/importance_sampling_ratio/mean": 1.0030049085617065, "sampling/importance_sampling_ratio/min": 0.629483699798584, "sampling/sampling_logp_difference/max": 0.46285533905029297, "sampling/sampling_logp_difference/mean": 0.014058367349207401, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 142.125, "completions/mean_terminated_length": 142.125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2687618378549814, "epoch": 0.010441767068273093, "frac_reward_zero_std": 0.0, "grad_norm": 3.561344861984253, "learning_rate": 2.7720000000000003e-06, "loss": -0.0719, "num_tokens": 86931.0, "reward": 0.6000000238418579, "reward_std": 0.5362532734870911, "rewards/reward_fn/mean": 0.6000000238418579, "rewards/reward_fn/std": 0.5522680282592773, "sampling/importance_sampling_ratio/max": 1.3151479959487915, "sampling/importance_sampling_ratio/mean": 0.9993753433227539, "sampling/importance_sampling_ratio/min": 0.7292529344558716, "sampling/sampling_logp_difference/max": 0.3157346248626709, "sampling/sampling_logp_difference/mean": 0.00986116286367178, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 94.125, "completions/mean_terminated_length": 94.125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.2173369862139225, "epoch": 0.0107095046854083, "frac_reward_zero_std": 0.5, "grad_norm": 3.4309144020080566, "learning_rate": 2.7660000000000003e-06, "loss": -0.1356, "num_tokens": 88848.0, "reward": 0.5625, "reward_std": 0.375, "rewards/reward_fn/mean": 0.5625, "rewards/reward_fn/std": 0.6781013607978821, "sampling/importance_sampling_ratio/max": 1.2830878496170044, "sampling/importance_sampling_ratio/mean": 1.0000488758087158, "sampling/importance_sampling_ratio/min": 0.7855908274650574, "sampling/sampling_logp_difference/max": 0.2492694854736328, "sampling/sampling_logp_difference/mean": 0.010238570161163807, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 171.5, "completions/mean_terminated_length": 171.5, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.2571833338588476, "epoch": 0.010977242302543508, "frac_reward_zero_std": 0.0, "grad_norm": 4.099114418029785, "learning_rate": 2.7600000000000003e-06, "loss": -0.2245, "num_tokens": 91540.0, "reward": 0.7875000238418579, "reward_std": 0.42500001192092896, "rewards/reward_fn/mean": 0.7875000238418579, "rewards/reward_fn/std": 0.5249149799346924, "sampling/importance_sampling_ratio/max": 1.7131301164627075, "sampling/importance_sampling_ratio/mean": 1.0008857250213623, "sampling/importance_sampling_ratio/min": 0.4850271940231323, "sampling/sampling_logp_difference/max": 0.7235503196716309, "sampling/sampling_logp_difference/mean": 0.00923075806349516, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 96.375, "completions/mean_terminated_length": 96.375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.22775377612560987, "epoch": 0.011244979919678716, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7540000000000002e-06, "loss": 0.0, "num_tokens": 93415.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2203527688980103, "sampling/importance_sampling_ratio/mean": 0.999375581741333, "sampling/importance_sampling_ratio/min": 0.6933332085609436, "sampling/sampling_logp_difference/max": 0.3662445545196533, "sampling/sampling_logp_difference/mean": 0.008490525186061859, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 112.5, "completions/mean_terminated_length": 112.5, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.26890758890658617, "epoch": 0.011512717536813922, "frac_reward_zero_std": 0.0, "grad_norm": 4.725134372711182, "learning_rate": 2.748e-06, "loss": -0.1069, "num_tokens": 95767.0, "reward": 0.0, "reward_std": 0.5, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.5345224738121033, "sampling/importance_sampling_ratio/max": 1.5795071125030518, "sampling/importance_sampling_ratio/mean": 0.9999008774757385, "sampling/importance_sampling_ratio/min": 0.5512589812278748, "sampling/sampling_logp_difference/max": 0.595550537109375, "sampling/sampling_logp_difference/mean": 0.014199670404195786, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 68.625, "completions/mean_terminated_length": 68.625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.25742135755717754, "epoch": 0.01178045515394913, "frac_reward_zero_std": 0.5, "grad_norm": 4.877482891082764, "learning_rate": 2.742e-06, "loss": 0.194, "num_tokens": 97444.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.2213696241378784, "sampling/importance_sampling_ratio/mean": 1.0017311573028564, "sampling/importance_sampling_ratio/min": 0.7508642077445984, "sampling/sampling_logp_difference/max": 0.2865304946899414, "sampling/sampling_logp_difference/mean": 0.010486504063010216, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 105.75, "completions/mean_terminated_length": 105.75, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.24515704717487097, "epoch": 0.012048192771084338, "frac_reward_zero_std": 0.5, "grad_norm": 3.969291925430298, "learning_rate": 2.736e-06, "loss": -0.0624, "num_tokens": 99474.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 0.7071067690849304, "sampling/importance_sampling_ratio/max": 1.974042534828186, "sampling/importance_sampling_ratio/mean": 1.0003314018249512, "sampling/importance_sampling_ratio/min": 0.6212564706802368, "sampling/sampling_logp_difference/max": 0.6800835132598877, "sampling/sampling_logp_difference/mean": 0.012072471901774406, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 161.25, "completions/mean_terminated_length": 161.25, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.1946712341159582, "epoch": 0.012315930388219544, "frac_reward_zero_std": 0.5, "grad_norm": 2.1209957599639893, "learning_rate": 2.73e-06, "loss": 0.3199, "num_tokens": 102120.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 0.7071067690849304, "sampling/importance_sampling_ratio/max": 1.3876272439956665, "sampling/importance_sampling_ratio/mean": 0.9997404217720032, "sampling/importance_sampling_ratio/min": 0.4378144145011902, "sampling/sampling_logp_difference/max": 0.8259601593017578, "sampling/sampling_logp_difference/mean": 0.009542742744088173, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 87.375, "completions/mean_terminated_length": 87.375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.3646039832383394, "epoch": 0.012583668005354752, "frac_reward_zero_std": 0.0, "grad_norm": 5.446310520172119, "learning_rate": 2.724e-06, "loss": -0.0862, "num_tokens": 104075.0, "reward": 0.7875000238418579, "reward_std": 0.42500001192092896, "rewards/reward_fn/mean": 0.7875000238418579, "rewards/reward_fn/std": 0.5249149799346924, "sampling/importance_sampling_ratio/max": 1.5235729217529297, "sampling/importance_sampling_ratio/mean": 0.9988483190536499, "sampling/importance_sampling_ratio/min": 0.6516712307929993, "sampling/sampling_logp_difference/max": 0.42821502685546875, "sampling/sampling_logp_difference/mean": 0.014683050103485584, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 131.875, "completions/mean_terminated_length": 131.875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.24918334744870663, "epoch": 0.01285140562248996, "frac_reward_zero_std": 0.5, "grad_norm": 2.8937058448791504, "learning_rate": 2.718e-06, "loss": 0.3915, "num_tokens": 106566.0, "reward": 0.5625, "reward_std": 0.5153881907463074, "rewards/reward_fn/mean": 0.5625, "rewards/reward_fn/std": 0.8210403323173523, "sampling/importance_sampling_ratio/max": 1.509895920753479, "sampling/importance_sampling_ratio/mean": 0.9997563362121582, "sampling/importance_sampling_ratio/min": 0.71351557970047, "sampling/sampling_logp_difference/max": 0.41204071044921875, "sampling/sampling_logp_difference/mean": 0.010215841233730316, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 112.75, "completions/mean_terminated_length": 112.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.22544598206877708, "epoch": 0.013119143239625167, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.712e-06, "loss": 0.0, "num_tokens": 108748.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2738362550735474, "sampling/importance_sampling_ratio/mean": 1.0013220310211182, "sampling/importance_sampling_ratio/min": 0.8442186713218689, "sampling/sampling_logp_difference/max": 0.2420330047607422, "sampling/sampling_logp_difference/mean": 0.007016330026090145, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 91.5, "completions/mean_terminated_length": 91.5, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.25372983887791634, "epoch": 0.013386880856760375, "frac_reward_zero_std": 0.5, "grad_norm": 2.5142157077789307, "learning_rate": 2.706e-06, "loss": 0.0612, "num_tokens": 110584.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 0.7071067690849304, "sampling/importance_sampling_ratio/max": 1.2451112270355225, "sampling/importance_sampling_ratio/mean": 1.000658392906189, "sampling/importance_sampling_ratio/min": 0.7807915806770325, "sampling/sampling_logp_difference/max": 0.24744701385498047, "sampling/sampling_logp_difference/mean": 0.009093496948480606, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 79.5, "completions/mean_terminated_length": 79.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.2811067271977663, "epoch": 0.013654618473895583, "frac_reward_zero_std": 0.5, "grad_norm": 6.114200115203857, "learning_rate": 2.7e-06, "loss": 0.163, "num_tokens": 112448.0, "reward": 0.375, "reward_std": 0.4330126941204071, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.876274585723877, "sampling/importance_sampling_ratio/max": 1.476252555847168, "sampling/importance_sampling_ratio/mean": 0.999772310256958, "sampling/importance_sampling_ratio/min": 0.7557889819145203, "sampling/sampling_logp_difference/max": 0.38950681686401367, "sampling/sampling_logp_difference/mean": 0.014083062298595905, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 101.75, "completions/mean_terminated_length": 101.75, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.2739456985145807, "epoch": 0.013922356091030789, "frac_reward_zero_std": 0.0, "grad_norm": 5.896408557891846, "learning_rate": 2.694e-06, "loss": -0.0907, "num_tokens": 114590.0, "reward": 0.25, "reward_std": 0.8660253882408142, "rewards/reward_fn/mean": 0.25, "rewards/reward_fn/std": 0.8017837405204773, "sampling/importance_sampling_ratio/max": 1.3719313144683838, "sampling/importance_sampling_ratio/mean": 0.9967583417892456, "sampling/importance_sampling_ratio/min": 0.6102401614189148, "sampling/sampling_logp_difference/max": 0.49390268325805664, "sampling/sampling_logp_difference/mean": 0.016437195241451263, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 87.75, "completions/mean_terminated_length": 87.75, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.26345325633883476, "epoch": 0.014190093708165997, "frac_reward_zero_std": 0.0, "grad_norm": 5.277225017547607, "learning_rate": 2.688e-06, "loss": -0.0438, "num_tokens": 116408.0, "reward": 0.40625, "reward_std": 0.6660534143447876, "rewards/reward_fn/mean": 0.40625, "rewards/reward_fn/std": 0.6258922219276428, "sampling/importance_sampling_ratio/max": 1.2837468385696411, "sampling/importance_sampling_ratio/mean": 0.9998165369033813, "sampling/importance_sampling_ratio/min": 0.7967652082443237, "sampling/sampling_logp_difference/max": 0.24978303909301758, "sampling/sampling_logp_difference/mean": 0.010942870751023293, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 100.375, "completions/mean_terminated_length": 100.375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.23985232692211866, "epoch": 0.014457831325301205, "frac_reward_zero_std": 0.5, "grad_norm": 4.397423267364502, "learning_rate": 2.6820000000000003e-06, "loss": 0.0371, "num_tokens": 118643.0, "reward": 0.5375000238418579, "reward_std": 0.48883363604545593, "rewards/reward_fn/mean": 0.5375000238418579, "rewards/reward_fn/std": 0.8087689280509949, "sampling/importance_sampling_ratio/max": 1.327457308769226, "sampling/importance_sampling_ratio/mean": 0.9997919797897339, "sampling/importance_sampling_ratio/min": 0.5239003300666809, "sampling/sampling_logp_difference/max": 0.646453857421875, "sampling/sampling_logp_difference/mean": 0.012333258986473083, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 108.875, "completions/mean_terminated_length": 108.875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.20459097530692816, "epoch": 0.014725568942436412, "frac_reward_zero_std": 0.5, "grad_norm": 6.134036064147949, "learning_rate": 2.6760000000000003e-06, "loss": -0.1746, "num_tokens": 120630.0, "reward": 0.53125, "reward_std": 0.3590351641178131, "rewards/reward_fn/mean": 0.53125, "rewards/reward_fn/std": 0.6870940327644348, "sampling/importance_sampling_ratio/max": 1.6952840089797974, "sampling/importance_sampling_ratio/mean": 1.0011820793151855, "sampling/importance_sampling_ratio/min": 0.015252133831381798, "sampling/sampling_logp_difference/max": 4.183035850524902, "sampling/sampling_logp_difference/mean": 0.01615353301167488, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/max_terminated_length": 104.0, "completions/mean_length": 84.75, "completions/mean_terminated_length": 84.75, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.09753922000527382, "epoch": 0.01499330655957162, "frac_reward_zero_std": 0.5, "grad_norm": 1.521569013595581, "learning_rate": 2.6700000000000003e-06, "loss": -0.0105, "num_tokens": 122424.0, "reward": 0.9750000238418579, "reward_std": 0.05000000074505806, "rewards/reward_fn/mean": 0.9750000238418579, "rewards/reward_fn/std": 0.0707106739282608, "sampling/importance_sampling_ratio/max": 1.1818901300430298, "sampling/importance_sampling_ratio/mean": 1.0005300045013428, "sampling/importance_sampling_ratio/min": 0.8713655471801758, "sampling/sampling_logp_difference/max": 0.1671149730682373, "sampling/sampling_logp_difference/mean": 0.004516108427196741, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 96.125, "completions/mean_terminated_length": 96.125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.36157887428998947, "epoch": 0.015261044176706828, "frac_reward_zero_std": 0.0, "grad_norm": 5.185945510864258, "learning_rate": 2.6640000000000002e-06, "loss": -0.1872, "num_tokens": 124357.0, "reward": 0.1875, "reward_std": 0.4451940953731537, "rewards/reward_fn/mean": 0.1875, "rewards/reward_fn/std": 0.7877408862113953, "sampling/importance_sampling_ratio/max": 1.2824736833572388, "sampling/importance_sampling_ratio/mean": 1.0008665323257446, "sampling/importance_sampling_ratio/min": 0.7096433043479919, "sampling/sampling_logp_difference/max": 0.34299278259277344, "sampling/sampling_logp_difference/mean": 0.013522002846002579, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 77.875, "completions/mean_terminated_length": 77.875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.25272761285305023, "epoch": 0.015528781793842034, "frac_reward_zero_std": 0.0, "grad_norm": 4.247508525848389, "learning_rate": 2.6580000000000002e-06, "loss": -0.0313, "num_tokens": 126048.0, "reward": 0.5625, "reward_std": 0.875, "rewards/reward_fn/mean": 0.5625, "rewards/reward_fn/std": 0.8210403323173523, "sampling/importance_sampling_ratio/max": 1.2059277296066284, "sampling/importance_sampling_ratio/mean": 0.9999945163726807, "sampling/importance_sampling_ratio/min": 0.7942938208580017, "sampling/sampling_logp_difference/max": 0.2303018569946289, "sampling/sampling_logp_difference/mean": 0.009428811259567738, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 130.625, "completions/mean_terminated_length": 130.625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.2612511180341244, "epoch": 0.015796519410977244, "frac_reward_zero_std": 0.0, "grad_norm": 4.331783294677734, "learning_rate": 2.652e-06, "loss": 0.1581, "num_tokens": 128217.0, "reward": 0.3125, "reward_std": 1.0153882503509521, "rewards/reward_fn/mean": 0.3125, "rewards/reward_fn/std": 0.9613049626350403, "sampling/importance_sampling_ratio/max": 1.4104771614074707, "sampling/importance_sampling_ratio/mean": 0.9989643096923828, "sampling/importance_sampling_ratio/min": 0.6406038999557495, "sampling/sampling_logp_difference/max": 0.4453439712524414, "sampling/sampling_logp_difference/mean": 0.007658245507627726, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 126.25, "completions/mean_terminated_length": 126.25, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.17846582364290953, "epoch": 0.01606425702811245, "frac_reward_zero_std": 0.5, "grad_norm": 1.8587411642074585, "learning_rate": 2.646e-06, "loss": 0.1075, "num_tokens": 130603.0, "reward": 0.6875, "reward_std": 0.4732423424720764, "rewards/reward_fn/mean": 0.6875, "rewards/reward_fn/std": 0.7039430141448975, "sampling/importance_sampling_ratio/max": 1.1391346454620361, "sampling/importance_sampling_ratio/mean": 0.9991238713264465, "sampling/importance_sampling_ratio/min": 0.7876356840133667, "sampling/sampling_logp_difference/max": 0.238719642162323, "sampling/sampling_logp_difference/mean": 0.00715804286301136, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 146.875, "completions/mean_terminated_length": 146.875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.2997130174189806, "epoch": 0.016331994645247656, "frac_reward_zero_std": 0.5, "grad_norm": 2.792605400085449, "learning_rate": 2.64e-06, "loss": 0.0375, "num_tokens": 133050.0, "reward": -0.21875, "reward_std": 0.3590351641178131, "rewards/reward_fn/mean": -0.21875, "rewards/reward_fn/std": 0.5580178499221802, "sampling/importance_sampling_ratio/max": 1.7371859550476074, "sampling/importance_sampling_ratio/mean": 1.0002954006195068, "sampling/importance_sampling_ratio/min": 0.5776150226593018, "sampling/sampling_logp_difference/max": 0.5522665977478027, "sampling/sampling_logp_difference/mean": 0.015750933438539505, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 97.5, "completions/mean_terminated_length": 97.5, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.22677360847592354, "epoch": 0.016599732262382864, "frac_reward_zero_std": 0.0, "grad_norm": 4.421785831451416, "learning_rate": 2.634e-06, "loss": 0.0918, "num_tokens": 135034.0, "reward": 0.7250000238418579, "reward_std": 0.48642081022262573, "rewards/reward_fn/mean": 0.7250000238418579, "rewards/reward_fn/std": 0.5257647633552551, "sampling/importance_sampling_ratio/max": 1.3608678579330444, "sampling/importance_sampling_ratio/mean": 1.0003385543823242, "sampling/importance_sampling_ratio/min": 0.707626223564148, "sampling/sampling_logp_difference/max": 0.345839262008667, "sampling/sampling_logp_difference/mean": 0.011226016096770763, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 108.875, "completions/mean_terminated_length": 108.875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.1964818872511387, "epoch": 0.016867469879518072, "frac_reward_zero_std": 0.5, "grad_norm": 4.118081569671631, "learning_rate": 2.628e-06, "loss": 0.1052, "num_tokens": 137161.0, "reward": 0.75, "reward_std": 0.3535533845424652, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 0.5345224738121033, "sampling/importance_sampling_ratio/max": 1.4815773963928223, "sampling/importance_sampling_ratio/mean": 1.002424716949463, "sampling/importance_sampling_ratio/min": 0.6341012120246887, "sampling/sampling_logp_difference/max": 0.4555467367172241, "sampling/sampling_logp_difference/mean": 0.011239099316298962, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 243.625, "completions/mean_terminated_length": 243.625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.29449538327753544, "epoch": 0.01713520749665328, "frac_reward_zero_std": 0.0, "grad_norm": 2.0751688480377197, "learning_rate": 2.622e-06, "loss": 0.0933, "num_tokens": 140482.0, "reward": 0.125, "reward_std": 0.8482423424720764, "rewards/reward_fn/mean": 0.125, "rewards/reward_fn/std": 0.9543135166168213, "sampling/importance_sampling_ratio/max": 1.2323927879333496, "sampling/importance_sampling_ratio/mean": 0.9996010661125183, "sampling/importance_sampling_ratio/min": 0.5414842963218689, "sampling/sampling_logp_difference/max": 0.6134412288665771, "sampling/sampling_logp_difference/mean": 0.008634842932224274, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 104.875, "completions/mean_terminated_length": 104.875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.38300235383212566, "epoch": 0.01740294511378849, "frac_reward_zero_std": 0.5, "grad_norm": 2.5706310272216797, "learning_rate": 2.616e-06, "loss": -0.0694, "num_tokens": 142457.0, "reward": 0.15625, "reward_std": 0.1875, "rewards/reward_fn/mean": 0.15625, "rewards/reward_fn/std": 0.7432734370231628, "sampling/importance_sampling_ratio/max": 1.2911205291748047, "sampling/importance_sampling_ratio/mean": 0.9999396800994873, "sampling/importance_sampling_ratio/min": 0.5555576682090759, "sampling/sampling_logp_difference/max": 0.5877828598022461, "sampling/sampling_logp_difference/mean": 0.013837832026183605, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 109.75, "completions/mean_terminated_length": 109.75, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.31897392123937607, "epoch": 0.017670682730923693, "frac_reward_zero_std": 0.5, "grad_norm": 4.0501298904418945, "learning_rate": 2.61e-06, "loss": 0.0796, "num_tokens": 144559.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 0.7071067690849304, "sampling/importance_sampling_ratio/max": 1.3830671310424805, "sampling/importance_sampling_ratio/mean": 1.002236008644104, "sampling/importance_sampling_ratio/min": 0.5980319380760193, "sampling/sampling_logp_difference/max": 0.5141110420227051, "sampling/sampling_logp_difference/mean": 0.015623632818460464, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 112.875, "completions/mean_terminated_length": 112.875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.3415717799216509, "epoch": 0.0179384203480589, "frac_reward_zero_std": 0.5, "grad_norm": 3.6382815837860107, "learning_rate": 2.604e-06, "loss": 0.034, "num_tokens": 146522.0, "reward": 0.84375, "reward_std": 0.1875, "rewards/reward_fn/mean": 0.84375, "rewards/reward_fn/std": 0.29693374037742615, "sampling/importance_sampling_ratio/max": 1.4246344566345215, "sampling/importance_sampling_ratio/mean": 0.9987221360206604, "sampling/importance_sampling_ratio/min": 0.654655933380127, "sampling/sampling_logp_difference/max": 0.4236454963684082, "sampling/sampling_logp_difference/mean": 0.01621454581618309, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 90.75, "completions/mean_terminated_length": 90.75, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.21179581992328167, "epoch": 0.01820615796519411, "frac_reward_zero_std": 0.5, "grad_norm": 2.866631031036377, "learning_rate": 2.598e-06, "loss": 0.0106, "num_tokens": 148376.0, "reward": -0.25, "reward_std": 0.5773502588272095, "rewards/reward_fn/mean": -0.25, "rewards/reward_fn/std": 0.8017837405204773, "sampling/importance_sampling_ratio/max": 1.415460467338562, "sampling/importance_sampling_ratio/mean": 1.0009872913360596, "sampling/importance_sampling_ratio/min": 0.7910581231117249, "sampling/sampling_logp_difference/max": 0.34745490550994873, "sampling/sampling_logp_difference/mean": 0.009363332763314247, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 143.625, "completions/mean_terminated_length": 143.625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.20050527341663837, "epoch": 0.018473895582329317, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.592e-06, "loss": 0.0, "num_tokens": 150837.0, "reward": 0.25, "reward_std": 0.0, "rewards/reward_fn/mean": 0.25, "rewards/reward_fn/std": 0.8017837405204773, "sampling/importance_sampling_ratio/max": 1.552464485168457, "sampling/importance_sampling_ratio/mean": 1.0010474920272827, "sampling/importance_sampling_ratio/min": 0.5466071963310242, "sampling/sampling_logp_difference/max": 0.6040248870849609, "sampling/sampling_logp_difference/mean": 0.012746752239763737, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 100.0, "completions/max_terminated_length": 100.0, "completions/mean_length": 55.75, "completions/mean_terminated_length": 55.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.2765138279646635, "epoch": 0.018741633199464525, "frac_reward_zero_std": 0.0, "grad_norm": 7.493396282196045, "learning_rate": 2.586e-06, "loss": 0.0782, "num_tokens": 152691.0, "reward": 0.25, "reward_std": 0.8660253882408142, "rewards/reward_fn/mean": 0.25, "rewards/reward_fn/std": 0.8017837405204773, "sampling/importance_sampling_ratio/max": 1.2081669569015503, "sampling/importance_sampling_ratio/mean": 0.9967964291572571, "sampling/importance_sampling_ratio/min": 0.7396815419197083, "sampling/sampling_logp_difference/max": 0.30153560638427734, "sampling/sampling_logp_difference/mean": 0.014937438070774078, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 128.75, "completions/mean_terminated_length": 128.75, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.3995748218148947, "epoch": 0.019009370816599733, "frac_reward_zero_std": 0.0, "grad_norm": 5.553465843200684, "learning_rate": 2.58e-06, "loss": -0.0843, "num_tokens": 155081.0, "reward": 0.4375, "reward_std": 0.8145764470100403, "rewards/reward_fn/mean": 0.4375, "rewards/reward_fn/std": 0.7763237953186035, "sampling/importance_sampling_ratio/max": 1.5574434995651245, "sampling/importance_sampling_ratio/mean": 1.0002764463424683, "sampling/importance_sampling_ratio/min": 0.7050127983093262, "sampling/sampling_logp_difference/max": 0.44304561614990234, "sampling/sampling_logp_difference/mean": 0.01915440335869789, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 95.0, "completions/mean_terminated_length": 95.0, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.34289540629833937, "epoch": 0.01927710843373494, "frac_reward_zero_std": 0.5, "grad_norm": 4.288081645965576, "learning_rate": 2.574e-06, "loss": 0.2863, "num_tokens": 157085.0, "reward": 0.5625, "reward_std": 0.5153881907463074, "rewards/reward_fn/mean": 0.5625, "rewards/reward_fn/std": 0.8210403323173523, "sampling/importance_sampling_ratio/max": 1.3903757333755493, "sampling/importance_sampling_ratio/mean": 1.0013322830200195, "sampling/importance_sampling_ratio/min": 0.7349072098731995, "sampling/sampling_logp_difference/max": 0.3295741081237793, "sampling/sampling_logp_difference/mean": 0.013408519327640533, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 89.5, "completions/mean_terminated_length": 89.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 0.25123918429017067, "epoch": 0.019544846050870146, "frac_reward_zero_std": 0.0, "grad_norm": 4.050495147705078, "learning_rate": 2.568e-06, "loss": 0.057, "num_tokens": 159201.0, "reward": 0.6937500238418579, "reward_std": 0.40903517603874207, "rewards/reward_fn/mean": 0.6937500238418579, "rewards/reward_fn/std": 0.5480077862739563, "sampling/importance_sampling_ratio/max": 1.1969836950302124, "sampling/importance_sampling_ratio/mean": 1.001542091369629, "sampling/importance_sampling_ratio/min": 0.7300276160240173, "sampling/sampling_logp_difference/max": 0.31467294692993164, "sampling/sampling_logp_difference/mean": 0.010944816283881664, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 75.875, "completions/mean_terminated_length": 75.875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.31323628686368465, "epoch": 0.019812583668005354, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.562e-06, "loss": 0.0, "num_tokens": 160880.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8903917074203491, "sampling/importance_sampling_ratio/mean": 1.0002018213272095, "sampling/importance_sampling_ratio/min": 0.7874852418899536, "sampling/sampling_logp_difference/max": 0.6367840766906738, "sampling/sampling_logp_difference/mean": 0.015023726038634777, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 142.875, "completions/mean_terminated_length": 142.875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.24280969519168139, "epoch": 0.020080321285140562, "frac_reward_zero_std": 0.0, "grad_norm": 5.37821626663208, "learning_rate": 2.556e-06, "loss": -0.0166, "num_tokens": 163451.0, "reward": 0.4375, "reward_std": 0.8080127239227295, "rewards/reward_fn/mean": 0.4375, "rewards/reward_fn/std": 0.7763237953186035, "sampling/importance_sampling_ratio/max": 1.5987595319747925, "sampling/importance_sampling_ratio/mean": 1.0017035007476807, "sampling/importance_sampling_ratio/min": 0.6664620637893677, "sampling/sampling_logp_difference/max": 0.46922802925109863, "sampling/sampling_logp_difference/mean": 0.012849228456616402, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 119.0, "completions/max_terminated_length": 119.0, "completions/mean_length": 90.75, "completions/mean_terminated_length": 90.75, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.10534905549138784, "epoch": 0.02034805890227577, "frac_reward_zero_std": 0.5, "grad_norm": 2.570324659347534, "learning_rate": 2.55e-06, "loss": -0.0304, "num_tokens": 165437.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.319058895111084, "sampling/importance_sampling_ratio/mean": 1.0003224611282349, "sampling/importance_sampling_ratio/min": 0.7843183875083923, "sampling/sampling_logp_difference/max": 0.27691853046417236, "sampling/sampling_logp_difference/mean": 0.00569203682243824, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 76.625, "completions/mean_terminated_length": 76.625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.24680935312062502, "epoch": 0.020615796519410978, "frac_reward_zero_std": 0.0, "grad_norm": 6.51108455657959, "learning_rate": 2.544e-06, "loss": -0.1456, "num_tokens": 167322.0, "reward": 0.4124999940395355, "reward_std": 0.42500001192092896, "rewards/reward_fn/mean": 0.4124999940395355, "rewards/reward_fn/std": 0.758640706539154, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0016412734985352, "sampling/importance_sampling_ratio/min": 0.6878054738044739, "sampling/sampling_logp_difference/max": 0.8050769567489624, "sampling/sampling_logp_difference/mean": 0.011967209167778492, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 96.5, "completions/mean_terminated_length": 96.5, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.23884586337953806, "epoch": 0.020883534136546186, "frac_reward_zero_std": 0.5, "grad_norm": 4.4442596435546875, "learning_rate": 2.538e-06, "loss": -0.1993, "num_tokens": 169110.0, "reward": 0.3125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.3125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.434143304824829, "sampling/importance_sampling_ratio/mean": 0.9987870454788208, "sampling/importance_sampling_ratio/min": 0.3750075697898865, "sampling/sampling_logp_difference/max": 0.9808090925216675, "sampling/sampling_logp_difference/mean": 0.01440919004380703, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 183.0, "completions/mean_terminated_length": 183.0, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.28300489019602537, "epoch": 0.02115127175368139, "frac_reward_zero_std": 0.0, "grad_norm": 2.5385549068450928, "learning_rate": 2.532e-06, "loss": 0.0275, "num_tokens": 171994.0, "reward": 0.34375, "reward_std": 0.7920478582382202, "rewards/reward_fn/mean": 0.34375, "rewards/reward_fn/std": 0.7432734370231628, "sampling/importance_sampling_ratio/max": 1.3080979585647583, "sampling/importance_sampling_ratio/mean": 0.9999258518218994, "sampling/importance_sampling_ratio/min": 0.7645941376686096, "sampling/sampling_logp_difference/max": 0.2685741186141968, "sampling/sampling_logp_difference/mean": 0.010039621964097023, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 180.875, "completions/mean_terminated_length": 180.875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.20224987342953682, "epoch": 0.0214190093708166, "frac_reward_zero_std": 0.0, "grad_norm": 2.0858154296875, "learning_rate": 2.526e-06, "loss": -0.094, "num_tokens": 174749.0, "reward": -0.125, "reward_std": 0.8273502588272095, "rewards/reward_fn/mean": -0.125, "rewards/reward_fn/std": 0.8345229625701904, "sampling/importance_sampling_ratio/max": 1.230350375175476, "sampling/importance_sampling_ratio/mean": 0.99974524974823, "sampling/importance_sampling_ratio/min": 0.6989374160766602, "sampling/sampling_logp_difference/max": 0.3581939935684204, "sampling/sampling_logp_difference/mean": 0.007732843514531851, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 92.0, "completions/mean_terminated_length": 92.0, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.2627595867961645, "epoch": 0.021686746987951807, "frac_reward_zero_std": 0.5, "grad_norm": 6.057287216186523, "learning_rate": 2.52e-06, "loss": 0.0327, "num_tokens": 176629.0, "reward": 0.375, "reward_std": 0.4330126941204071, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.876274585723877, "sampling/importance_sampling_ratio/max": 1.7755582332611084, "sampling/importance_sampling_ratio/mean": 1.0007256269454956, "sampling/importance_sampling_ratio/min": 0.7762119174003601, "sampling/sampling_logp_difference/max": 0.5741147994995117, "sampling/sampling_logp_difference/mean": 0.012666086666285992, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 135.375, "completions/mean_terminated_length": 135.375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.18980143871158361, "epoch": 0.021954484605087015, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.514e-06, "loss": 0.0, "num_tokens": 179012.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2734839916229248, "sampling/importance_sampling_ratio/mean": 0.9977608323097229, "sampling/importance_sampling_ratio/min": 0.7102720141410828, "sampling/sampling_logp_difference/max": 0.34210729598999023, "sampling/sampling_logp_difference/mean": 0.009089219383895397, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 86.625, "completions/mean_terminated_length": 86.625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.24167847074568272, "epoch": 0.022222222222222223, "frac_reward_zero_std": 0.5, "grad_norm": 3.905122995376587, "learning_rate": 2.508e-06, "loss": -0.0735, "num_tokens": 180785.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.3646670579910278, "sampling/importance_sampling_ratio/mean": 1.0001091957092285, "sampling/importance_sampling_ratio/min": 0.5573540925979614, "sampling/sampling_logp_difference/max": 0.5845545530319214, "sampling/sampling_logp_difference/mean": 0.009825773537158966, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 80.125, "completions/mean_terminated_length": 80.125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.21141298115253448, "epoch": 0.02248995983935743, "frac_reward_zero_std": 0.5, "grad_norm": 3.050516366958618, "learning_rate": 2.502e-06, "loss": -0.0815, "num_tokens": 182490.0, "reward": 0.9375, "reward_std": 0.125, "rewards/reward_fn/mean": 0.9375, "rewards/reward_fn/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 1.172448754310608, "sampling/importance_sampling_ratio/mean": 0.9977245330810547, "sampling/importance_sampling_ratio/min": 0.6509853005409241, "sampling/sampling_logp_difference/max": 0.4292681813240051, "sampling/sampling_logp_difference/mean": 0.010387144982814789, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 106.0, "completions/mean_terminated_length": 106.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.22644580155611038, "epoch": 0.022757697456492636, "frac_reward_zero_std": 0.0, "grad_norm": 3.4553427696228027, "learning_rate": 2.496e-06, "loss": -0.2151, "num_tokens": 184602.0, "reward": 0.625, "reward_std": 0.75, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 0.6943650841712952, "sampling/importance_sampling_ratio/max": 1.256630301475525, "sampling/importance_sampling_ratio/mean": 0.9999874830245972, "sampling/importance_sampling_ratio/min": 0.7259413599967957, "sampling/sampling_logp_difference/max": 0.3202860355377197, "sampling/sampling_logp_difference/mean": 0.010207288898527622, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 109.25, "completions/mean_terminated_length": 109.25, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.25891055539250374, "epoch": 0.023025435073627844, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.49e-06, "loss": 0.0, "num_tokens": 186608.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6286104917526245, "sampling/importance_sampling_ratio/mean": 1.0024967193603516, "sampling/importance_sampling_ratio/min": 0.7893041968345642, "sampling/sampling_logp_difference/max": 0.48772716522216797, "sampling/sampling_logp_difference/mean": 0.010254220105707645, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 117.875, "completions/mean_terminated_length": 117.875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.2685732766985893, "epoch": 0.023293172690763052, "frac_reward_zero_std": 0.5, "grad_norm": 2.779574155807495, "learning_rate": 2.484e-06, "loss": 0.0313, "num_tokens": 188619.0, "reward": 0.4375, "reward_std": 0.375, "rewards/reward_fn/mean": 0.4375, "rewards/reward_fn/std": 0.7763237953186035, "sampling/importance_sampling_ratio/max": 1.2839053869247437, "sampling/importance_sampling_ratio/mean": 1.0002130270004272, "sampling/importance_sampling_ratio/min": 0.594695508480072, "sampling/sampling_logp_difference/max": 0.5197057723999023, "sampling/sampling_logp_difference/mean": 0.010993986390531063, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 107.0, "completions/max_terminated_length": 107.0, "completions/mean_length": 79.625, "completions/mean_terminated_length": 79.625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.3131292425096035, "epoch": 0.02356091030789826, "frac_reward_zero_std": 0.5, "grad_norm": 5.821944713592529, "learning_rate": 2.478e-06, "loss": 0.0528, "num_tokens": 190436.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 0.7071067690849304, "sampling/importance_sampling_ratio/max": 1.255246639251709, "sampling/importance_sampling_ratio/mean": 0.9973951578140259, "sampling/importance_sampling_ratio/min": 0.5817364454269409, "sampling/sampling_logp_difference/max": 0.5417377948760986, "sampling/sampling_logp_difference/mean": 0.015365981496870518, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.0, "completions/max_terminated_length": 112.0, "completions/mean_length": 84.25, "completions/mean_terminated_length": 84.25, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.23585979267954826, "epoch": 0.023828647925033468, "frac_reward_zero_std": 0.5, "grad_norm": 2.5388827323913574, "learning_rate": 2.472e-06, "loss": -0.0523, "num_tokens": 192330.0, "reward": 0.375, "reward_std": 0.4330126941204071, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.876274585723877, "sampling/importance_sampling_ratio/max": 1.2505664825439453, "sampling/importance_sampling_ratio/mean": 0.9987168908119202, "sampling/importance_sampling_ratio/min": 0.8175767660140991, "sampling/sampling_logp_difference/max": 0.22359657287597656, "sampling/sampling_logp_difference/mean": 0.007615532726049423, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 110.625, "completions/mean_terminated_length": 110.625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.19435907527804375, "epoch": 0.024096385542168676, "frac_reward_zero_std": 0.5, "grad_norm": 4.908689975738525, "learning_rate": 2.4659999999999998e-06, "loss": 0.0086, "num_tokens": 194591.0, "reward": 0.9375, "reward_std": 0.125, "rewards/reward_fn/mean": 0.9375, "rewards/reward_fn/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 1.6073299646377563, "sampling/importance_sampling_ratio/mean": 1.0001590251922607, "sampling/importance_sampling_ratio/min": 0.7790418267250061, "sampling/sampling_logp_difference/max": 0.47457432746887207, "sampling/sampling_logp_difference/mean": 0.010009697638452053, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 175.25, "completions/mean_terminated_length": 175.25, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.14038583729416132, "epoch": 0.02436412315930388, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4599999999999997e-06, "loss": 0.0, "num_tokens": 197417.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3682745695114136, "sampling/importance_sampling_ratio/mean": 1.0007840394973755, "sampling/importance_sampling_ratio/min": 0.7976399064064026, "sampling/sampling_logp_difference/max": 0.3135504722595215, "sampling/sampling_logp_difference/mean": 0.005546721164137125, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 156.125, "completions/mean_terminated_length": 156.125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.15062626916915178, "epoch": 0.02463186077643909, "frac_reward_zero_std": 0.5, "grad_norm": 1.321321964263916, "learning_rate": 2.4539999999999997e-06, "loss": -0.0589, "num_tokens": 199946.0, "reward": 0.625, "reward_std": 0.4330126941204071, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 0.6943650841712952, "sampling/importance_sampling_ratio/max": 1.2792158126831055, "sampling/importance_sampling_ratio/mean": 0.9991571307182312, "sampling/importance_sampling_ratio/min": 0.8192594647407532, "sampling/sampling_logp_difference/max": 0.2462472915649414, "sampling/sampling_logp_difference/mean": 0.007396632339805365, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 182.125, "completions/mean_terminated_length": 182.125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.17029361380264163, "epoch": 0.024899598393574297, "frac_reward_zero_std": 0.5, "grad_norm": 1.9197070598602295, "learning_rate": 2.448e-06, "loss": 0.1739, "num_tokens": 202687.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 0.7071067690849304, "sampling/importance_sampling_ratio/max": 1.4206163883209229, "sampling/importance_sampling_ratio/mean": 1.0017296075820923, "sampling/importance_sampling_ratio/min": 0.8054620027542114, "sampling/sampling_logp_difference/max": 0.35109078884124756, "sampling/sampling_logp_difference/mean": 0.007110211998224258, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 133.0, "completions/mean_terminated_length": 133.0, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.185949195176363, "epoch": 0.025167336010709505, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.442e-06, "loss": 0.0, "num_tokens": 205003.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 0.26726123690605164, "sampling/importance_sampling_ratio/max": 1.4351508617401123, "sampling/importance_sampling_ratio/mean": 0.9993758797645569, "sampling/importance_sampling_ratio/min": 0.7852245569229126, "sampling/sampling_logp_difference/max": 0.3612699508666992, "sampling/sampling_logp_difference/mean": 0.007057466544210911, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 185.875, "completions/mean_terminated_length": 185.875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.13264656718820333, "epoch": 0.025435073627844713, "frac_reward_zero_std": 0.5, "grad_norm": 1.6917721033096313, "learning_rate": 2.436e-06, "loss": 0.0519, "num_tokens": 207602.0, "reward": 0.5625, "reward_std": 0.5153881907463074, "rewards/reward_fn/mean": 0.5625, "rewards/reward_fn/std": 0.8210403323173523, "sampling/importance_sampling_ratio/max": 1.2144798040390015, "sampling/importance_sampling_ratio/mean": 0.9996753931045532, "sampling/importance_sampling_ratio/min": 0.7561350464820862, "sampling/sampling_logp_difference/max": 0.27953529357910156, "sampling/sampling_logp_difference/mean": 0.00649234326556325, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 142.25, "completions/mean_terminated_length": 142.25, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.1533768316730857, "epoch": 0.02570281124497992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.43e-06, "loss": 0.0, "num_tokens": 209952.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4895378351211548, "sampling/importance_sampling_ratio/mean": 1.000785231590271, "sampling/importance_sampling_ratio/min": 0.8004284501075745, "sampling/sampling_logp_difference/max": 0.3984658718109131, "sampling/sampling_logp_difference/mean": 0.007067009806632996, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 112.75, "completions/mean_terminated_length": 112.75, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3796777315437794, "epoch": 0.02597054886211513, "frac_reward_zero_std": 0.5, "grad_norm": 2.2682905197143555, "learning_rate": 2.4240000000000004e-06, "loss": 0.0803, "num_tokens": 212038.0, "reward": 0.0625, "reward_std": 0.375, "rewards/reward_fn/mean": 0.0625, "rewards/reward_fn/std": 0.7763237953186035, "sampling/importance_sampling_ratio/max": 1.2537404298782349, "sampling/importance_sampling_ratio/mean": 1.0006046295166016, "sampling/importance_sampling_ratio/min": 0.7864208221435547, "sampling/sampling_logp_difference/max": 0.2402632236480713, "sampling/sampling_logp_difference/mean": 0.013749061152338982, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 120.75, "completions/mean_terminated_length": 120.75, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.20226233545690775, "epoch": 0.026238286479250333, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4180000000000004e-06, "loss": 0.0, "num_tokens": 214200.0, "reward": 0.25, "reward_std": 0.0, "rewards/reward_fn/mean": 0.25, "rewards/reward_fn/std": 0.8017837405204773, "sampling/importance_sampling_ratio/max": 1.5575392246246338, "sampling/importance_sampling_ratio/mean": 1.0004669427871704, "sampling/importance_sampling_ratio/min": 0.7789872884750366, "sampling/sampling_logp_difference/max": 0.44310712814331055, "sampling/sampling_logp_difference/mean": 0.009702039882540703, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 89.875, "completions/mean_terminated_length": 89.875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.2729686088860035, "epoch": 0.02650602409638554, "frac_reward_zero_std": 0.5, "grad_norm": 3.424119234085083, "learning_rate": 2.4120000000000004e-06, "loss": -0.1942, "num_tokens": 216023.0, "reward": 0.375, "reward_std": 0.4330126941204071, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.876274585723877, "sampling/importance_sampling_ratio/max": 1.4507851600646973, "sampling/importance_sampling_ratio/mean": 1.000223994255066, "sampling/importance_sampling_ratio/min": 0.7110286951065063, "sampling/sampling_logp_difference/max": 0.3721048831939697, "sampling/sampling_logp_difference/mean": 0.010684235952794552, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 145.75, "completions/mean_terminated_length": 145.75, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.2266270937398076, "epoch": 0.02677376171352075, "frac_reward_zero_std": 0.0, "grad_norm": 2.948132276535034, "learning_rate": 2.4060000000000003e-06, "loss": 0.1902, "num_tokens": 218549.0, "reward": 0.8812500238418579, "reward_std": 0.23749999701976776, "rewards/reward_fn/mean": 0.8812500238418579, "rewards/reward_fn/std": 0.2644907534122467, "sampling/importance_sampling_ratio/max": 1.438185691833496, "sampling/importance_sampling_ratio/mean": 0.9994520545005798, "sampling/importance_sampling_ratio/min": 0.7251313328742981, "sampling/sampling_logp_difference/max": 0.36338233947753906, "sampling/sampling_logp_difference/mean": 0.009078041650354862, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 136.0, "completions/mean_terminated_length": 136.0, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.23243628442287445, "epoch": 0.027041499330655958, "frac_reward_zero_std": 0.0, "grad_norm": 4.070948123931885, "learning_rate": 2.4000000000000003e-06, "loss": 0.0187, "num_tokens": 220845.0, "reward": 0.887499988079071, "reward_std": 0.18273502588272095, "rewards/reward_fn/mean": 0.887499988079071, "rewards/reward_fn/std": 0.18077215552330017, "sampling/importance_sampling_ratio/max": 1.5512168407440186, "sampling/importance_sampling_ratio/mean": 0.9993858337402344, "sampling/importance_sampling_ratio/min": 0.5511173009872437, "sampling/sampling_logp_difference/max": 0.5958075523376465, "sampling/sampling_logp_difference/mean": 0.013366767205297947, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 185.875, "completions/mean_terminated_length": 185.875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.14303583186119795, "epoch": 0.027309236947791166, "frac_reward_zero_std": 0.5, "grad_norm": 1.2002592086791992, "learning_rate": 2.3940000000000003e-06, "loss": -0.0358, "num_tokens": 223572.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 0.7071067690849304, "sampling/importance_sampling_ratio/max": 1.2535724639892578, "sampling/importance_sampling_ratio/mean": 1.0001161098480225, "sampling/importance_sampling_ratio/min": 0.8326457738876343, "sampling/sampling_logp_difference/max": 0.2259974479675293, "sampling/sampling_logp_difference/mean": 0.005484994500875473, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 166.5, "completions/mean_terminated_length": 166.5, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.13238139264285564, "epoch": 0.027576974564926374, "frac_reward_zero_std": 0.5, "grad_norm": 1.7607340812683105, "learning_rate": 2.3880000000000003e-06, "loss": 0.0468, "num_tokens": 226136.0, "reward": 0.4375, "reward_std": 0.375, "rewards/reward_fn/mean": 0.4375, "rewards/reward_fn/std": 0.7763237953186035, "sampling/importance_sampling_ratio/max": 1.4224809408187866, "sampling/importance_sampling_ratio/mean": 1.000011682510376, "sampling/importance_sampling_ratio/min": 0.7550831437110901, "sampling/sampling_logp_difference/max": 0.3524024486541748, "sampling/sampling_logp_difference/mean": 0.006454914808273315, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 128.625, "completions/mean_terminated_length": 128.625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.20957333222031593, "epoch": 0.027844712182061578, "frac_reward_zero_std": 0.5, "grad_norm": 2.0295491218566895, "learning_rate": 2.3820000000000002e-06, "loss": 0.0099, "num_tokens": 228385.0, "reward": 0.4375, "reward_std": 0.375, "rewards/reward_fn/mean": 0.4375, "rewards/reward_fn/std": 0.7763237953186035, "sampling/importance_sampling_ratio/max": 1.3866550922393799, "sampling/importance_sampling_ratio/mean": 1.0003459453582764, "sampling/importance_sampling_ratio/min": 0.8113100528717041, "sampling/sampling_logp_difference/max": 0.32689452171325684, "sampling/sampling_logp_difference/mean": 0.009721055626869202, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 134.0, "completions/mean_terminated_length": 134.0, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.17423648480325937, "epoch": 0.028112449799196786, "frac_reward_zero_std": 0.5, "grad_norm": 1.4738492965698242, "learning_rate": 2.376e-06, "loss": -0.0696, "num_tokens": 230553.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.4256815910339355, "sampling/importance_sampling_ratio/mean": 1.0007965564727783, "sampling/importance_sampling_ratio/min": 0.7396317720413208, "sampling/sampling_logp_difference/max": 0.35465002059936523, "sampling/sampling_logp_difference/mean": 0.006559280678629875, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 119.75, "completions/mean_terminated_length": 119.75, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.20387152954936028, "epoch": 0.028380187416331994, "frac_reward_zero_std": 0.5, "grad_norm": 3.317683458328247, "learning_rate": 2.37e-06, "loss": -0.0129, "num_tokens": 232667.0, "reward": 0.625, "reward_std": 0.4330126941204071, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 0.6943650841712952, "sampling/importance_sampling_ratio/max": 1.500115156173706, "sampling/importance_sampling_ratio/mean": 0.9998576641082764, "sampling/importance_sampling_ratio/min": 0.7979407906532288, "sampling/sampling_logp_difference/max": 0.40554189682006836, "sampling/sampling_logp_difference/mean": 0.010160931386053562, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 111.5, "completions/mean_terminated_length": 111.5, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.21202905289828777, "epoch": 0.028647925033467202, "frac_reward_zero_std": 0.5, "grad_norm": 1.3823963403701782, "learning_rate": 2.364e-06, "loss": 0.0166, "num_tokens": 234683.0, "reward": 0.8812500238418579, "reward_std": 0.17721809446811676, "rewards/reward_fn/mean": 0.8812500238418579, "rewards/reward_fn/std": 0.2644907534122467, "sampling/importance_sampling_ratio/max": 1.3264269828796387, "sampling/importance_sampling_ratio/mean": 0.9977993369102478, "sampling/importance_sampling_ratio/min": 0.5652405023574829, "sampling/sampling_logp_difference/max": 0.5705039501190186, "sampling/sampling_logp_difference/mean": 0.010206852108240128, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 174.625, "completions/mean_terminated_length": 174.625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.3202286772429943, "epoch": 0.02891566265060241, "frac_reward_zero_std": 0.5, "grad_norm": 2.491598129272461, "learning_rate": 2.358e-06, "loss": 0.0366, "num_tokens": 237416.0, "reward": 0.375, "reward_std": 0.4330126941204071, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.876274585723877, "sampling/importance_sampling_ratio/max": 1.3530874252319336, "sampling/importance_sampling_ratio/mean": 0.9985620379447937, "sampling/importance_sampling_ratio/min": 0.6989337205886841, "sampling/sampling_logp_difference/max": 0.3581993579864502, "sampling/sampling_logp_difference/mean": 0.011670985259115696, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 131.5, "completions/mean_terminated_length": 131.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.14006977854296565, "epoch": 0.02918340026773762, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.352e-06, "loss": 0.0, "num_tokens": 239704.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1224853992462158, "sampling/importance_sampling_ratio/mean": 0.9986975789070129, "sampling/importance_sampling_ratio/min": 0.6985504031181335, "sampling/sampling_logp_difference/max": 0.3587479591369629, "sampling/sampling_logp_difference/mean": 0.007393364794552326, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 130.625, "completions/mean_terminated_length": 130.625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.12173171993345022, "epoch": 0.029451137884872823, "frac_reward_zero_std": 0.5, "grad_norm": 1.8890520334243774, "learning_rate": 2.346e-06, "loss": -0.0493, "num_tokens": 242073.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.400027871131897, "sampling/importance_sampling_ratio/mean": 0.9995740652084351, "sampling/importance_sampling_ratio/min": 0.7493809461593628, "sampling/sampling_logp_difference/max": 0.3364921808242798, "sampling/sampling_logp_difference/mean": 0.006560345645993948, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 120.75, "completions/mean_terminated_length": 120.75, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.26829423382878304, "epoch": 0.02971887550200803, "frac_reward_zero_std": 0.5, "grad_norm": 3.542264699935913, "learning_rate": 2.34e-06, "loss": -0.0686, "num_tokens": 244355.0, "reward": 0.53125, "reward_std": 0.3590351641178131, "rewards/reward_fn/mean": 0.53125, "rewards/reward_fn/std": 0.6870940327644348, "sampling/importance_sampling_ratio/max": 1.5435947179794312, "sampling/importance_sampling_ratio/mean": 1.0001094341278076, "sampling/importance_sampling_ratio/min": 0.5170567035675049, "sampling/sampling_logp_difference/max": 0.6596026420593262, "sampling/sampling_logp_difference/mean": 0.012898379936814308, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 144.25, "completions/mean_terminated_length": 144.25, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.18507568817585707, "epoch": 0.02998661311914324, "frac_reward_zero_std": 0.5, "grad_norm": 2.4346961975097656, "learning_rate": 2.334e-06, "loss": 0.2288, "num_tokens": 246697.0, "reward": 0.1875, "reward_std": 0.125, "rewards/reward_fn/mean": 0.1875, "rewards/reward_fn/std": 0.883883535861969, "sampling/importance_sampling_ratio/max": 1.2774293422698975, "sampling/importance_sampling_ratio/mean": 1.0001187324523926, "sampling/importance_sampling_ratio/min": 0.7958757281303406, "sampling/sampling_logp_difference/max": 0.24484968185424805, "sampling/sampling_logp_difference/mean": 0.008667342364788055, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 156.625, "completions/mean_terminated_length": 156.625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.1882070517167449, "epoch": 0.030254350736278447, "frac_reward_zero_std": 0.5, "grad_norm": 1.6697791814804077, "learning_rate": 2.328e-06, "loss": -0.0402, "num_tokens": 249150.0, "reward": 0.4375, "reward_std": 0.375, "rewards/reward_fn/mean": 0.4375, "rewards/reward_fn/std": 0.7763237953186035, "sampling/importance_sampling_ratio/max": 1.2797223329544067, "sampling/importance_sampling_ratio/mean": 0.9986733198165894, "sampling/importance_sampling_ratio/min": 0.7505319714546204, "sampling/sampling_logp_difference/max": 0.2869729995727539, "sampling/sampling_logp_difference/mean": 0.008141128346323967, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 179.0, "completions/mean_terminated_length": 179.0, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.17339822556823492, "epoch": 0.030522088353413655, "frac_reward_zero_std": 0.0, "grad_norm": 2.249896287918091, "learning_rate": 2.322e-06, "loss": -0.0978, "num_tokens": 252014.0, "reward": 0.7875000238418579, "reward_std": 0.42500001192092896, "rewards/reward_fn/mean": 0.7875000238418579, "rewards/reward_fn/std": 0.5249149799346924, "sampling/importance_sampling_ratio/max": 1.3203942775726318, "sampling/importance_sampling_ratio/mean": 0.9989036321640015, "sampling/importance_sampling_ratio/min": 0.7339946031570435, "sampling/sampling_logp_difference/max": 0.3092536926269531, "sampling/sampling_logp_difference/mean": 0.007623068057000637, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 96.5, "completions/mean_terminated_length": 96.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 0.27563464641571045, "epoch": 0.030789825970548863, "frac_reward_zero_std": 0.5, "grad_norm": 2.578338146209717, "learning_rate": 2.316e-06, "loss": -0.2205, "num_tokens": 253950.0, "reward": 0.4375, "reward_std": 0.375, "rewards/reward_fn/mean": 0.4375, "rewards/reward_fn/std": 0.7763237953186035, "sampling/importance_sampling_ratio/max": 1.277443289756775, "sampling/importance_sampling_ratio/mean": 0.9999399781227112, "sampling/importance_sampling_ratio/min": 0.732779324054718, "sampling/sampling_logp_difference/max": 0.310910701751709, "sampling/sampling_logp_difference/mean": 0.008280403912067413, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 106.625, "completions/mean_terminated_length": 106.625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.12464311625808477, "epoch": 0.031057563587684068, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.31e-06, "loss": 0.0, "num_tokens": 256051.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5931379795074463, "sampling/importance_sampling_ratio/mean": 0.9993311166763306, "sampling/importance_sampling_ratio/min": 0.7802641987800598, "sampling/sampling_logp_difference/max": 0.46570563316345215, "sampling/sampling_logp_difference/mean": 0.00699082063511014, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 113.5, "completions/mean_terminated_length": 113.5, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.12846943363547325, "epoch": 0.03132530120481928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3040000000000003e-06, "loss": 0.0, "num_tokens": 257983.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4204576015472412, "sampling/importance_sampling_ratio/mean": 1.0003553628921509, "sampling/importance_sampling_ratio/min": 0.8497739434242249, "sampling/sampling_logp_difference/max": 0.3509790897369385, "sampling/sampling_logp_difference/mean": 0.005486751440912485, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 203.375, "completions/mean_terminated_length": 203.375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.22829977329820395, "epoch": 0.03159303882195449, "frac_reward_zero_std": 0.5, "grad_norm": 1.9379832744598389, "learning_rate": 2.2980000000000003e-06, "loss": 0.0517, "num_tokens": 260870.0, "reward": 0.90625, "reward_std": 0.1875, "rewards/reward_fn/mean": 0.90625, "rewards/reward_fn/std": 0.2651650309562683, "sampling/importance_sampling_ratio/max": 1.6432201862335205, "sampling/importance_sampling_ratio/mean": 1.0023590326309204, "sampling/importance_sampling_ratio/min": 0.7736832499504089, "sampling/sampling_logp_difference/max": 0.4966578483581543, "sampling/sampling_logp_difference/mean": 0.009948228485882282, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 129.0, "completions/mean_terminated_length": 129.0, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.11198347993195057, "epoch": 0.031860776439089696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2920000000000002e-06, "loss": 0.0, "num_tokens": 263026.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3209507465362549, "sampling/importance_sampling_ratio/mean": 1.000182867050171, "sampling/importance_sampling_ratio/min": 0.8282703161239624, "sampling/sampling_logp_difference/max": 0.2783517837524414, "sampling/sampling_logp_difference/mean": 0.004711005836725235, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 104.0, "completions/mean_terminated_length": 104.0, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.23496036604046822, "epoch": 0.0321285140562249, "frac_reward_zero_std": 0.5, "grad_norm": 2.3917245864868164, "learning_rate": 2.2860000000000002e-06, "loss": 0.0234, "num_tokens": 264930.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.338865876197815, "sampling/importance_sampling_ratio/mean": 1.0001548528671265, "sampling/importance_sampling_ratio/min": 0.8474820256233215, "sampling/sampling_logp_difference/max": 0.2918229103088379, "sampling/sampling_logp_difference/mean": 0.008633099496364594, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 150.5, "completions/mean_terminated_length": 150.5, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.1694166110828519, "epoch": 0.032396251673360105, "frac_reward_zero_std": 0.5, "grad_norm": 4.506484031677246, "learning_rate": 2.28e-06, "loss": -0.1015, "num_tokens": 267554.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.560531735420227, "sampling/importance_sampling_ratio/mean": 1.0024428367614746, "sampling/importance_sampling_ratio/min": 0.7109222412109375, "sampling/sampling_logp_difference/max": 0.4450266361236572, "sampling/sampling_logp_difference/mean": 0.01158700417727232, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 106.75, "completions/mean_terminated_length": 106.75, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.17113200575113297, "epoch": 0.03266398929049531, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.274e-06, "loss": 0.0, "num_tokens": 269516.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1605746746063232, "sampling/importance_sampling_ratio/mean": 0.9981253147125244, "sampling/importance_sampling_ratio/min": 0.7417328357696533, "sampling/sampling_logp_difference/max": 0.2987661361694336, "sampling/sampling_logp_difference/mean": 0.008806992322206497, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 114.75, "completions/mean_terminated_length": 114.75, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.13504554331302643, "epoch": 0.03293172690763052, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.268e-06, "loss": 0.0, "num_tokens": 271494.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2669535875320435, "sampling/importance_sampling_ratio/mean": 0.9995626211166382, "sampling/importance_sampling_ratio/min": 0.6890845894813538, "sampling/sampling_logp_difference/max": 0.3723912239074707, "sampling/sampling_logp_difference/mean": 0.007686463650316, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 192.0, "completions/mean_terminated_length": 192.0, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.13430788833647966, "epoch": 0.03319946452476573, "frac_reward_zero_std": 0.5, "grad_norm": 1.2987910509109497, "learning_rate": 2.262e-06, "loss": -0.0161, "num_tokens": 274278.0, "reward": 0.9375, "reward_std": 0.125, "rewards/reward_fn/mean": 0.9375, "rewards/reward_fn/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 1.3010574579238892, "sampling/importance_sampling_ratio/mean": 0.9996674060821533, "sampling/importance_sampling_ratio/min": 0.8018923997879028, "sampling/sampling_logp_difference/max": 0.26317739486694336, "sampling/sampling_logp_difference/mean": 0.00550411781296134, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 142.0, "completions/mean_terminated_length": 142.0, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.2571197347715497, "epoch": 0.03346720214190094, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.256e-06, "loss": 0.0, "num_tokens": 276590.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2242223024368286, "sampling/importance_sampling_ratio/mean": 1.0005159378051758, "sampling/importance_sampling_ratio/min": 0.7294586896896362, "sampling/sampling_logp_difference/max": 0.31545257568359375, "sampling/sampling_logp_difference/mean": 0.010721201077103615, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 136.5, "completions/mean_terminated_length": 136.5, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.20888752676546574, "epoch": 0.033734939759036145, "frac_reward_zero_std": 0.5, "grad_norm": 2.076802968978882, "learning_rate": 2.25e-06, "loss": -0.121, "num_tokens": 278826.0, "reward": 0.4375, "reward_std": 0.375, "rewards/reward_fn/mean": 0.4375, "rewards/reward_fn/std": 0.7763237953186035, "sampling/importance_sampling_ratio/max": 1.4493030309677124, "sampling/importance_sampling_ratio/mean": 0.9986593723297119, "sampling/importance_sampling_ratio/min": 0.6159276366233826, "sampling/sampling_logp_difference/max": 0.48462581634521484, "sampling/sampling_logp_difference/mean": 0.01020803302526474, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 123.5, "completions/mean_terminated_length": 123.5, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.26402824744582176, "epoch": 0.03400267737617135, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.244e-06, "loss": 0.0, "num_tokens": 281018.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4470518827438354, "sampling/importance_sampling_ratio/mean": 0.9991505146026611, "sampling/importance_sampling_ratio/min": 0.7827907800674438, "sampling/sampling_logp_difference/max": 0.36952829360961914, "sampling/sampling_logp_difference/mean": 0.014181448146700859, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 92.0, "completions/mean_terminated_length": 92.0, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.24097895622253418, "epoch": 0.03427041499330656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.238e-06, "loss": 0.0, "num_tokens": 282958.0, "reward": 0.25, "reward_std": 0.0, "rewards/reward_fn/mean": 0.25, "rewards/reward_fn/std": 0.8017837405204773, "sampling/importance_sampling_ratio/max": 1.3295269012451172, "sampling/importance_sampling_ratio/mean": 1.0019434690475464, "sampling/importance_sampling_ratio/min": 0.656845211982727, "sampling/sampling_logp_difference/max": 0.420306921005249, "sampling/sampling_logp_difference/mean": 0.014589911326766014, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 113.875, "completions/mean_terminated_length": 113.875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.13661128282546997, "epoch": 0.03453815261044177, "frac_reward_zero_std": 0.5, "grad_norm": 2.6774256229400635, "learning_rate": 2.232e-06, "loss": -0.0222, "num_tokens": 285049.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 0.7071067690849304, "sampling/importance_sampling_ratio/max": 1.3431710004806519, "sampling/importance_sampling_ratio/mean": 0.9995543360710144, "sampling/importance_sampling_ratio/min": 0.6321345567703247, "sampling/sampling_logp_difference/max": 0.45865297317504883, "sampling/sampling_logp_difference/mean": 0.005473929923027754, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 141.0, "completions/mean_terminated_length": 141.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.14292040653526783, "epoch": 0.03480589022757698, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.226e-06, "loss": 0.0, "num_tokens": 287373.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.320968508720398, "sampling/importance_sampling_ratio/mean": 1.0010684728622437, "sampling/importance_sampling_ratio/min": 0.8045834302902222, "sampling/sampling_logp_difference/max": 0.2783651351928711, "sampling/sampling_logp_difference/mean": 0.006312296260148287, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 107.125, "completions/mean_terminated_length": 107.125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.33385249972343445, "epoch": 0.035073627844712185, "frac_reward_zero_std": 0.0, "grad_norm": 4.646759986877441, "learning_rate": 2.22e-06, "loss": 0.0407, "num_tokens": 289358.0, "reward": 0.375, "reward_std": 0.9330127239227295, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.876274585723877, "sampling/importance_sampling_ratio/max": 1.1768659353256226, "sampling/importance_sampling_ratio/mean": 0.9978495836257935, "sampling/importance_sampling_ratio/min": 0.7492095232009888, "sampling/sampling_logp_difference/max": 0.28873658180236816, "sampling/sampling_logp_difference/mean": 0.013364615850150585, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 110.25, "completions/mean_terminated_length": 110.25, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.18047708086669445, "epoch": 0.035341365461847386, "frac_reward_zero_std": 0.5, "grad_norm": 3.871786594390869, "learning_rate": 2.214e-06, "loss": -0.0522, "num_tokens": 291476.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 0.7071067690849304, "sampling/importance_sampling_ratio/max": 1.4948492050170898, "sampling/importance_sampling_ratio/mean": 1.0001641511917114, "sampling/importance_sampling_ratio/min": 0.7230113744735718, "sampling/sampling_logp_difference/max": 0.40202534198760986, "sampling/sampling_logp_difference/mean": 0.011675515212118626, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 150.0, "completions/mean_terminated_length": 150.0, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.12612602952867746, "epoch": 0.035609103078982594, "frac_reward_zero_std": 0.5, "grad_norm": 2.328770875930786, "learning_rate": 2.208e-06, "loss": 0.221, "num_tokens": 293832.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.272207498550415, "sampling/importance_sampling_ratio/mean": 0.9995013475418091, "sampling/importance_sampling_ratio/min": 0.8340058326721191, "sampling/sampling_logp_difference/max": 0.2407536506652832, "sampling/sampling_logp_difference/mean": 0.006674936972558498, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 103.875, "completions/mean_terminated_length": 103.875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.18558892887085676, "epoch": 0.0358768406961178, "frac_reward_zero_std": 0.0, "grad_norm": 6.383874893188477, "learning_rate": 2.202e-06, "loss": -0.0588, "num_tokens": 295771.0, "reward": 0.625, "reward_std": 0.5915063619613647, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 0.5669467449188232, "sampling/importance_sampling_ratio/max": 1.318709135055542, "sampling/importance_sampling_ratio/mean": 0.9996612668037415, "sampling/importance_sampling_ratio/min": 0.7021632194519043, "sampling/sampling_logp_difference/max": 0.35358941555023193, "sampling/sampling_logp_difference/mean": 0.01038048043847084, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 133.875, "completions/mean_terminated_length": 133.875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.20175390411168337, "epoch": 0.03614457831325301, "frac_reward_zero_std": 0.5, "grad_norm": 1.9256740808486938, "learning_rate": 2.196e-06, "loss": 0.0103, "num_tokens": 298094.0, "reward": 0.625, "reward_std": 0.4330126941204071, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 0.6943650841712952, "sampling/importance_sampling_ratio/max": 1.2462482452392578, "sampling/importance_sampling_ratio/mean": 1.000543236732483, "sampling/importance_sampling_ratio/min": 0.6306689381599426, "sampling/sampling_logp_difference/max": 0.46097421646118164, "sampling/sampling_logp_difference/mean": 0.010469979606568813, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 88.75, "completions/mean_terminated_length": 88.75, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.22806713357567787, "epoch": 0.03641231593038822, "frac_reward_zero_std": 0.5, "grad_norm": 2.9493393898010254, "learning_rate": 2.19e-06, "loss": -0.1244, "num_tokens": 300076.0, "reward": 0.625, "reward_std": 0.4330126941204071, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 0.6943650841712952, "sampling/importance_sampling_ratio/max": 1.2751857042312622, "sampling/importance_sampling_ratio/mean": 1.0003217458724976, "sampling/importance_sampling_ratio/min": 0.7223330140113831, "sampling/sampling_logp_difference/max": 0.3252689838409424, "sampling/sampling_logp_difference/mean": 0.013457144610583782, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 133.0, "completions/mean_terminated_length": 133.0, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.1383275780826807, "epoch": 0.03668005354752343, "frac_reward_zero_std": 0.5, "grad_norm": 1.3462580442428589, "learning_rate": 2.184e-06, "loss": -0.0337, "num_tokens": 302176.0, "reward": 0.90625, "reward_std": 0.1875, "rewards/reward_fn/mean": 0.90625, "rewards/reward_fn/std": 0.2651650309562683, "sampling/importance_sampling_ratio/max": 1.1955373287200928, "sampling/importance_sampling_ratio/mean": 1.0007842779159546, "sampling/importance_sampling_ratio/min": 0.8246067762374878, "sampling/sampling_logp_difference/max": 0.19284868240356445, "sampling/sampling_logp_difference/mean": 0.005738415289670229, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 124.5, "completions/mean_terminated_length": 124.5, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.174608014523983, "epoch": 0.036947791164658635, "frac_reward_zero_std": 0.0, "grad_norm": 3.3169422149658203, "learning_rate": 2.178e-06, "loss": -0.1312, "num_tokens": 304256.0, "reward": 0.25, "reward_std": 0.75, "rewards/reward_fn/mean": 0.25, "rewards/reward_fn/std": 0.8017837405204773, "sampling/importance_sampling_ratio/max": 1.2528345584869385, "sampling/importance_sampling_ratio/mean": 0.99929279088974, "sampling/importance_sampling_ratio/min": 0.7677478790283203, "sampling/sampling_logp_difference/max": 0.264293909072876, "sampling/sampling_logp_difference/mean": 0.008284189738333225, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 159.375, "completions/mean_terminated_length": 159.375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.21820338629186153, "epoch": 0.03721552878179384, "frac_reward_zero_std": 0.5, "grad_norm": 2.6290009021759033, "learning_rate": 2.172e-06, "loss": -0.0125, "num_tokens": 306651.0, "reward": -0.15000000596046448, "reward_std": 0.40620189905166626, "rewards/reward_fn/mean": -0.15000000596046448, "rewards/reward_fn/std": 0.6502746939659119, "sampling/importance_sampling_ratio/max": 1.520743727684021, "sampling/importance_sampling_ratio/mean": 1.001164436340332, "sampling/importance_sampling_ratio/min": 0.7981969118118286, "sampling/sampling_logp_difference/max": 0.41919946670532227, "sampling/sampling_logp_difference/mean": 0.008044005371630192, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 193.25, "completions/mean_terminated_length": 193.25, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.07171386759728193, "epoch": 0.03748326639892905, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.166e-06, "loss": 0.0, "num_tokens": 309425.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.277637243270874, "sampling/importance_sampling_ratio/mean": 1.0003767013549805, "sampling/importance_sampling_ratio/min": 0.664783239364624, "sampling/sampling_logp_difference/max": 0.4082942008972168, "sampling/sampling_logp_difference/mean": 0.00462719053030014, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 75.75, "completions/mean_terminated_length": 75.75, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.18488798383623362, "epoch": 0.03775100401606426, "frac_reward_zero_std": 0.5, "grad_norm": 4.185678005218506, "learning_rate": 2.16e-06, "loss": -0.1958, "num_tokens": 311183.0, "reward": 0.4375, "reward_std": 0.375, "rewards/reward_fn/mean": 0.4375, "rewards/reward_fn/std": 0.7763237953186035, "sampling/importance_sampling_ratio/max": 1.5449814796447754, "sampling/importance_sampling_ratio/mean": 1.0015133619308472, "sampling/importance_sampling_ratio/min": 0.7386823892593384, "sampling/sampling_logp_difference/max": 0.4350118637084961, "sampling/sampling_logp_difference/mean": 0.012682321481406689, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 101.25, "completions/mean_terminated_length": 101.25, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.14654972217977047, "epoch": 0.03801874163319947, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.154e-06, "loss": 0.0, "num_tokens": 313237.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3118747472763062, "sampling/importance_sampling_ratio/mean": 0.9980568885803223, "sampling/importance_sampling_ratio/min": 0.6781652569770813, "sampling/sampling_logp_difference/max": 0.388364315032959, "sampling/sampling_logp_difference/mean": 0.008574594743549824, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1946.0, "completions/max_terminated_length": 1946.0, "completions/mean_length": 342.125, "completions/mean_terminated_length": 342.125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.22448250371962786, "epoch": 0.038286479250334675, "frac_reward_zero_std": 0.0, "grad_norm": 1.7679530382156372, "learning_rate": 2.148e-06, "loss": 0.5471, "num_tokens": 317198.0, "reward": 0.16249999403953552, "reward_std": 0.7944334745407104, "rewards/reward_fn/mean": 0.16249999403953552, "rewards/reward_fn/std": 0.8601287603378296, "sampling/importance_sampling_ratio/max": 1.6141529083251953, "sampling/importance_sampling_ratio/mean": 1.0003881454467773, "sampling/importance_sampling_ratio/min": 0.7059998512268066, "sampling/sampling_logp_difference/max": 0.47881031036376953, "sampling/sampling_logp_difference/mean": 0.010533890686929226, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 127.625, "completions/mean_terminated_length": 127.625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.11053166631609201, "epoch": 0.03855421686746988, "frac_reward_zero_std": 0.5, "grad_norm": 2.3715879917144775, "learning_rate": 2.142e-06, "loss": 0.0414, "num_tokens": 319407.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.4557615518569946, "sampling/importance_sampling_ratio/mean": 1.000117301940918, "sampling/importance_sampling_ratio/min": 0.7103351950645447, "sampling/sampling_logp_difference/max": 0.3755291700363159, "sampling/sampling_logp_difference/mean": 0.007177350111305714, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 111.875, "completions/mean_terminated_length": 111.875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.12222401611506939, "epoch": 0.038821954484605084, "frac_reward_zero_std": 0.5, "grad_norm": 1.8860392570495605, "learning_rate": 2.136e-06, "loss": 0.0446, "num_tokens": 321378.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.387290358543396, "sampling/importance_sampling_ratio/mean": 1.000460147857666, "sampling/importance_sampling_ratio/min": 0.8024726510047913, "sampling/sampling_logp_difference/max": 0.32735252380371094, "sampling/sampling_logp_difference/mean": 0.005885968916118145, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 103.875, "completions/mean_terminated_length": 103.875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.24429608695209026, "epoch": 0.03908969210174029, "frac_reward_zero_std": 0.5, "grad_norm": 4.499346733093262, "learning_rate": 2.13e-06, "loss": -0.1233, "num_tokens": 323425.0, "reward": 0.0625, "reward_std": 0.375, "rewards/reward_fn/mean": 0.0625, "rewards/reward_fn/std": 0.7763237953186035, "sampling/importance_sampling_ratio/max": 1.3140403032302856, "sampling/importance_sampling_ratio/mean": 1.0014277696609497, "sampling/importance_sampling_ratio/min": 0.5460754632949829, "sampling/sampling_logp_difference/max": 0.6049981117248535, "sampling/sampling_logp_difference/mean": 0.010036547668278217, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 195.625, "completions/mean_terminated_length": 195.625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.26544077787548304, "epoch": 0.0393574297188755, "frac_reward_zero_std": 0.0, "grad_norm": 2.755140542984009, "learning_rate": 2.124e-06, "loss": 0.0515, "num_tokens": 326326.0, "reward": 0.9125000238418579, "reward_std": 0.17499999701976776, "rewards/reward_fn/mean": 0.9125000238418579, "rewards/reward_fn/std": 0.18077215552330017, "sampling/importance_sampling_ratio/max": 1.4127007722854614, "sampling/importance_sampling_ratio/mean": 1.0005333423614502, "sampling/importance_sampling_ratio/min": 0.7674274444580078, "sampling/sampling_logp_difference/max": 0.3455033302307129, "sampling/sampling_logp_difference/mean": 0.010211068205535412, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 103.0, "completions/max_terminated_length": 103.0, "completions/mean_length": 90.0, "completions/mean_terminated_length": 90.0, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.18117266707122326, "epoch": 0.03962516733601071, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.118e-06, "loss": 0.0, "num_tokens": 328162.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.205213189125061, "sampling/importance_sampling_ratio/mean": 1.0007671117782593, "sampling/importance_sampling_ratio/min": 0.7823426723480225, "sampling/sampling_logp_difference/max": 0.24546241760253906, "sampling/sampling_logp_difference/mean": 0.007345684338361025, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 136.5, "completions/mean_terminated_length": 136.5, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.11692395340651274, "epoch": 0.039892904953145916, "frac_reward_zero_std": 0.5, "grad_norm": 1.543579339981079, "learning_rate": 2.112e-06, "loss": -0.1075, "num_tokens": 330590.0, "reward": 0.71875, "reward_std": 0.3590351641178131, "rewards/reward_fn/mean": 0.71875, "rewards/reward_fn/std": 0.5580178499221802, "sampling/importance_sampling_ratio/max": 1.1353919506072998, "sampling/importance_sampling_ratio/mean": 0.9996322393417358, "sampling/importance_sampling_ratio/min": 0.8314812779426575, "sampling/sampling_logp_difference/max": 0.18454647064208984, "sampling/sampling_logp_difference/mean": 0.0041941087692976, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 126.375, "completions/mean_terminated_length": 126.375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.1379283107817173, "epoch": 0.040160642570281124, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.106e-06, "loss": 0.0, "num_tokens": 332741.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2834445238113403, "sampling/importance_sampling_ratio/mean": 0.9999051690101624, "sampling/importance_sampling_ratio/min": 0.8262094259262085, "sampling/sampling_logp_difference/max": 0.24954748153686523, "sampling/sampling_logp_difference/mean": 0.005259039346128702, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 151.0, "completions/mean_terminated_length": 151.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.15097802504897118, "epoch": 0.04042838018741633, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1e-06, "loss": 0.0, "num_tokens": 335017.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2357263565063477, "sampling/importance_sampling_ratio/mean": 0.9999412894248962, "sampling/importance_sampling_ratio/min": 0.7698772549629211, "sampling/sampling_logp_difference/max": 0.2615242004394531, "sampling/sampling_logp_difference/mean": 0.007444350048899651, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 122.25, "completions/mean_terminated_length": 122.25, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.1285302508622408, "epoch": 0.04069611780455154, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0939999999999998e-06, "loss": 0.0, "num_tokens": 337027.0, "reward": 0.625, "reward_std": 0.0, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 0.40089187026023865, "sampling/importance_sampling_ratio/max": 1.2037813663482666, "sampling/importance_sampling_ratio/mean": 1.0000494718551636, "sampling/importance_sampling_ratio/min": 0.7322882413864136, "sampling/sampling_logp_difference/max": 0.3115811347961426, "sampling/sampling_logp_difference/mean": 0.006796116940677166, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 138.125, "completions/mean_terminated_length": 138.125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.1348671531304717, "epoch": 0.04096385542168675, "frac_reward_zero_std": 0.5, "grad_norm": 1.431638479232788, "learning_rate": 2.0879999999999997e-06, "loss": 0.0714, "num_tokens": 339280.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.253427505493164, "sampling/importance_sampling_ratio/mean": 0.9997843503952026, "sampling/importance_sampling_ratio/min": 0.7747681140899658, "sampling/sampling_logp_difference/max": 0.2551915645599365, "sampling/sampling_logp_difference/mean": 0.005940718576312065, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 215.5, "completions/mean_terminated_length": 215.5, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.10917935241013765, "epoch": 0.041231593038821956, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0819999999999997e-06, "loss": 0.0, "num_tokens": 342364.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2318006753921509, "sampling/importance_sampling_ratio/mean": 0.9994737505912781, "sampling/importance_sampling_ratio/min": 0.793952465057373, "sampling/sampling_logp_difference/max": 0.23073172569274902, "sampling/sampling_logp_difference/mean": 0.0052405656315386295, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 131.25, "completions/mean_terminated_length": 131.25, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.17408661730587482, "epoch": 0.041499330655957165, "frac_reward_zero_std": 0.5, "grad_norm": 1.6004939079284668, "learning_rate": 2.0759999999999997e-06, "loss": -0.0465, "num_tokens": 344570.0, "reward": 0.0625, "reward_std": 0.375, "rewards/reward_fn/mean": 0.0625, "rewards/reward_fn/std": 0.7763237953186035, "sampling/importance_sampling_ratio/max": 1.256798505783081, "sampling/importance_sampling_ratio/mean": 1.000610113143921, "sampling/importance_sampling_ratio/min": 0.8510998487472534, "sampling/sampling_logp_difference/max": 0.22856760025024414, "sampling/sampling_logp_difference/mean": 0.006965050473809242, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 224.625, "completions/mean_terminated_length": 224.625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.09486238192766905, "epoch": 0.04176706827309237, "frac_reward_zero_std": 0.5, "grad_norm": 0.8256387710571289, "learning_rate": 2.07e-06, "loss": -0.0799, "num_tokens": 347807.0, "reward": 0.949999988079071, "reward_std": 0.05773502215743065, "rewards/reward_fn/mean": 0.949999988079071, "rewards/reward_fn/std": 0.09258200973272324, "sampling/importance_sampling_ratio/max": 1.278249979019165, "sampling/importance_sampling_ratio/mean": 1.0002652406692505, "sampling/importance_sampling_ratio/min": 0.7606942057609558, "sampling/sampling_logp_difference/max": 0.27352380752563477, "sampling/sampling_logp_difference/mean": 0.004068722017109394, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 151.375, "completions/mean_terminated_length": 151.375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.16872367076575756, "epoch": 0.042034805890227574, "frac_reward_zero_std": 0.0, "grad_norm": 2.133493423461914, "learning_rate": 2.064e-06, "loss": 0.1057, "num_tokens": 350142.0, "reward": 0.0, "reward_std": 0.6830127239227295, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.7071067690849304, "sampling/importance_sampling_ratio/max": 1.1331462860107422, "sampling/importance_sampling_ratio/mean": 0.9998410940170288, "sampling/importance_sampling_ratio/min": 0.7335894703865051, "sampling/sampling_logp_difference/max": 0.3098057508468628, "sampling/sampling_logp_difference/mean": 0.00704143987968564, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 90.0, "completions/mean_terminated_length": 90.0, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.11321096494793892, "epoch": 0.04230254350736278, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.058e-06, "loss": 0.0, "num_tokens": 351930.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.224395990371704, "sampling/importance_sampling_ratio/mean": 0.9996070265769958, "sampling/importance_sampling_ratio/min": 0.8626297116279602, "sampling/sampling_logp_difference/max": 0.20244765281677246, "sampling/sampling_logp_difference/mean": 0.00577831594273448, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 126.625, "completions/mean_terminated_length": 126.625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.1564363380894065, "epoch": 0.04257028112449799, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.052e-06, "loss": 0.0, "num_tokens": 354119.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3430919647216797, "sampling/importance_sampling_ratio/mean": 1.0001670122146606, "sampling/importance_sampling_ratio/min": 0.8146264553070068, "sampling/sampling_logp_difference/max": 0.29497432708740234, "sampling/sampling_logp_difference/mean": 0.007129319477826357, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 184.0, "completions/mean_terminated_length": 184.0, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.1128898486495018, "epoch": 0.0428380187416332, "frac_reward_zero_std": 0.5, "grad_norm": 1.0882681608200073, "learning_rate": 2.0460000000000004e-06, "loss": -0.0361, "num_tokens": 356935.0, "reward": 0.6875, "reward_std": 0.3145764470100403, "rewards/reward_fn/mean": 0.6875, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.256117343902588, "sampling/importance_sampling_ratio/mean": 0.9990549683570862, "sampling/importance_sampling_ratio/min": 0.7040555477142334, "sampling/sampling_logp_difference/max": 0.35089802742004395, "sampling/sampling_logp_difference/mean": 0.0052712177857756615, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 159.625, "completions/mean_terminated_length": 159.625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.2268071947619319, "epoch": 0.043105756358768406, "frac_reward_zero_std": 0.5, "grad_norm": 1.9403034448623657, "learning_rate": 2.0400000000000004e-06, "loss": -0.0062, "num_tokens": 359628.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.2751139402389526, "sampling/importance_sampling_ratio/mean": 0.9981908202171326, "sampling/importance_sampling_ratio/min": 0.5172673463821411, "sampling/sampling_logp_difference/max": 0.6591954231262207, "sampling/sampling_logp_difference/mean": 0.007588887587189674, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 132.875, "completions/mean_terminated_length": 132.875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2016276028007269, "epoch": 0.043373493975903614, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0340000000000003e-06, "loss": 0.0, "num_tokens": 361855.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2196344137191772, "sampling/importance_sampling_ratio/mean": 1.0001060962677002, "sampling/importance_sampling_ratio/min": 0.7881242036819458, "sampling/sampling_logp_difference/max": 0.2380995750427246, "sampling/sampling_logp_difference/mean": 0.006893336307257414, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 158.0, "completions/mean_terminated_length": 158.0, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.16863445285707712, "epoch": 0.04364123159303882, "frac_reward_zero_std": 0.5, "grad_norm": 1.6151620149612427, "learning_rate": 2.0280000000000003e-06, "loss": 0.1104, "num_tokens": 364355.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.5645502805709839, "sampling/importance_sampling_ratio/mean": 1.0007193088531494, "sampling/importance_sampling_ratio/min": 0.7892642617225647, "sampling/sampling_logp_difference/max": 0.4475984573364258, "sampling/sampling_logp_difference/mean": 0.006656427402049303, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 123.25, "completions/mean_terminated_length": 123.25, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.10508401971310377, "epoch": 0.04390896921017403, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0220000000000003e-06, "loss": 0.0, "num_tokens": 366441.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2756962776184082, "sampling/importance_sampling_ratio/mean": 1.0002022981643677, "sampling/importance_sampling_ratio/min": 0.7948617339134216, "sampling/sampling_logp_difference/max": 0.24349212646484375, "sampling/sampling_logp_difference/mean": 0.005061520263552666, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 149.0, "completions/mean_terminated_length": 149.0, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.08289468474686146, "epoch": 0.04417670682730924, "frac_reward_zero_std": 0.5, "grad_norm": 1.0460221767425537, "learning_rate": 2.0160000000000003e-06, "loss": -0.0111, "num_tokens": 368805.0, "reward": 0.90625, "reward_std": 0.1875, "rewards/reward_fn/mean": 0.90625, "rewards/reward_fn/std": 0.2651650309562683, "sampling/importance_sampling_ratio/max": 1.1850723028182983, "sampling/importance_sampling_ratio/mean": 1.000496745109558, "sampling/importance_sampling_ratio/min": 0.8865936994552612, "sampling/sampling_logp_difference/max": 0.16980385780334473, "sampling/sampling_logp_difference/mean": 0.0031509220134466887, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 130.0, "completions/mean_terminated_length": 130.0, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.14756314363330603, "epoch": 0.044444444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0100000000000002e-06, "loss": 0.0, "num_tokens": 371173.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.357284426689148, "sampling/importance_sampling_ratio/mean": 0.9999274611473083, "sampling/importance_sampling_ratio/min": 0.6491801738739014, "sampling/sampling_logp_difference/max": 0.43204498291015625, "sampling/sampling_logp_difference/mean": 0.008031354285776615, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 160.375, "completions/mean_terminated_length": 160.375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.12598482007160783, "epoch": 0.044712182061579654, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.004e-06, "loss": 0.0, "num_tokens": 373596.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1683262586593628, "sampling/importance_sampling_ratio/mean": 1.0001490116119385, "sampling/importance_sampling_ratio/min": 0.7973448038101196, "sampling/sampling_logp_difference/max": 0.22646808624267578, "sampling/sampling_logp_difference/mean": 0.0048505510203540325, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 146.25, "completions/mean_terminated_length": 146.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.1856307704001665, "epoch": 0.04497991967871486, "frac_reward_zero_std": 0.5, "grad_norm": 1.4156043529510498, "learning_rate": 1.998e-06, "loss": -0.093, "num_tokens": 375954.0, "reward": 0.1875, "reward_std": 0.125, "rewards/reward_fn/mean": 0.1875, "rewards/reward_fn/std": 0.883883535861969, "sampling/importance_sampling_ratio/max": 1.3600119352340698, "sampling/importance_sampling_ratio/mean": 0.9993915557861328, "sampling/importance_sampling_ratio/min": 0.8379209041595459, "sampling/sampling_logp_difference/max": 0.3074934482574463, "sampling/sampling_logp_difference/mean": 0.006502452772110701, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 155.0, "completions/mean_terminated_length": 155.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.11742838565260172, "epoch": 0.04524765729585007, "frac_reward_zero_std": 0.5, "grad_norm": 1.3175822496414185, "learning_rate": 1.992e-06, "loss": -0.029, "num_tokens": 378482.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.23760187625885, "sampling/importance_sampling_ratio/mean": 0.9996522665023804, "sampling/importance_sampling_ratio/min": 0.8194323778152466, "sampling/sampling_logp_difference/max": 0.21317553520202637, "sampling/sampling_logp_difference/mean": 0.005590934306383133, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 178.125, "completions/mean_terminated_length": 178.125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.24812849145382643, "epoch": 0.04551539491298527, "frac_reward_zero_std": 0.0, "grad_norm": 2.2669782638549805, "learning_rate": 1.986e-06, "loss": 0.0545, "num_tokens": 381111.0, "reward": 0.36250001192092896, "reward_std": 0.42499998211860657, "rewards/reward_fn/mean": 0.36250001192092896, "rewards/reward_fn/std": 0.7190023064613342, "sampling/importance_sampling_ratio/max": 1.397337555885315, "sampling/importance_sampling_ratio/mean": 0.9994187355041504, "sampling/importance_sampling_ratio/min": 0.6189570426940918, "sampling/sampling_logp_difference/max": 0.4797194004058838, "sampling/sampling_logp_difference/mean": 0.009224156849086285, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 226.625, "completions/mean_terminated_length": 226.625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.2087248731404543, "epoch": 0.04578313253012048, "frac_reward_zero_std": 0.5, "grad_norm": 1.3458194732666016, "learning_rate": 1.98e-06, "loss": -0.0181, "num_tokens": 384404.0, "reward": 0.625, "reward_std": 0.3061862289905548, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 0.5669467449188232, "sampling/importance_sampling_ratio/max": 1.2463483810424805, "sampling/importance_sampling_ratio/mean": 1.0004358291625977, "sampling/importance_sampling_ratio/min": 0.8319239020347595, "sampling/sampling_logp_difference/max": 0.22021794319152832, "sampling/sampling_logp_difference/mean": 0.0074347336776554585, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 161.5, "completions/mean_terminated_length": 161.5, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.15279534831643105, "epoch": 0.04605087014725569, "frac_reward_zero_std": 0.5, "grad_norm": 2.8451249599456787, "learning_rate": 1.974e-06, "loss": 0.2288, "num_tokens": 387040.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 0.7071067690849304, "sampling/importance_sampling_ratio/max": 1.22036612033844, "sampling/importance_sampling_ratio/mean": 0.9996545314788818, "sampling/importance_sampling_ratio/min": 0.6266293525695801, "sampling/sampling_logp_difference/max": 0.46740007400512695, "sampling/sampling_logp_difference/mean": 0.0070432210341095924, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 179.375, "completions/mean_terminated_length": 179.375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.13768529891967773, "epoch": 0.046318607764390896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.968e-06, "loss": 0.0, "num_tokens": 389723.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2940220832824707, "sampling/importance_sampling_ratio/mean": 1.0009649991989136, "sampling/importance_sampling_ratio/min": 0.8115485310554504, "sampling/sampling_logp_difference/max": 0.2577552795410156, "sampling/sampling_logp_difference/mean": 0.005450798198580742, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 132.375, "completions/mean_terminated_length": 132.375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.15965103451162577, "epoch": 0.046586345381526104, "frac_reward_zero_std": 0.5, "grad_norm": 2.5277624130249023, "learning_rate": 1.962e-06, "loss": 0.0605, "num_tokens": 392010.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.2175135612487793, "sampling/importance_sampling_ratio/mean": 0.9987398386001587, "sampling/importance_sampling_ratio/min": 0.7726929783821106, "sampling/sampling_logp_difference/max": 0.25787353515625, "sampling/sampling_logp_difference/mean": 0.005913982167840004, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 159.0, "completions/mean_terminated_length": 159.0, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.14390943851321936, "epoch": 0.04685408299866131, "frac_reward_zero_std": 0.5, "grad_norm": 1.1645482778549194, "learning_rate": 1.956e-06, "loss": 0.1263, "num_tokens": 394550.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.2160688638687134, "sampling/importance_sampling_ratio/mean": 1.0002634525299072, "sampling/importance_sampling_ratio/min": 0.8833932280540466, "sampling/sampling_logp_difference/max": 0.19562339782714844, "sampling/sampling_logp_difference/mean": 0.005250552669167519, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 138.0, "completions/mean_terminated_length": 138.0, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.15502149425446987, "epoch": 0.04712182061579652, "frac_reward_zero_std": 0.0, "grad_norm": 2.823361396789551, "learning_rate": 1.95e-06, "loss": -0.045, "num_tokens": 396814.0, "reward": 0.0625, "reward_std": 0.8080127239227295, "rewards/reward_fn/mean": 0.0625, "rewards/reward_fn/std": 0.7763237953186035, "sampling/importance_sampling_ratio/max": 1.297332763671875, "sampling/importance_sampling_ratio/mean": 0.999724268913269, "sampling/importance_sampling_ratio/min": 0.6343339085578918, "sampling/sampling_logp_difference/max": 0.4551798105239868, "sampling/sampling_logp_difference/mean": 0.00748742138966918, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 223.0, "completions/mean_terminated_length": 223.0, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.1662858659401536, "epoch": 0.04738955823293173, "frac_reward_zero_std": 0.5, "grad_norm": 1.0878444910049438, "learning_rate": 1.944e-06, "loss": 0.0038, "num_tokens": 399870.0, "reward": -0.125, "reward_std": 0.4330126941204071, "rewards/reward_fn/mean": -0.125, "rewards/reward_fn/std": 0.6943650841712952, "sampling/importance_sampling_ratio/max": 1.3510700464248657, "sampling/importance_sampling_ratio/mean": 0.999899685382843, "sampling/importance_sampling_ratio/min": 0.7924180626869202, "sampling/sampling_logp_difference/max": 0.30089688301086426, "sampling/sampling_logp_difference/mean": 0.006145750172436237, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 168.75, "completions/mean_terminated_length": 168.75, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.11201621871441603, "epoch": 0.047657295850066936, "frac_reward_zero_std": 0.5, "grad_norm": 1.7199177742004395, "learning_rate": 1.938e-06, "loss": 0.0671, "num_tokens": 402332.0, "reward": 0.5625, "reward_std": 0.5153881907463074, "rewards/reward_fn/mean": 0.5625, "rewards/reward_fn/std": 0.8210403323173523, "sampling/importance_sampling_ratio/max": 1.3434845209121704, "sampling/importance_sampling_ratio/mean": 1.0000642538070679, "sampling/importance_sampling_ratio/min": 0.7986145615577698, "sampling/sampling_logp_difference/max": 0.29526662826538086, "sampling/sampling_logp_difference/mean": 0.005664341151714325, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 137.5, "completions/mean_terminated_length": 137.5, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.12632913328707218, "epoch": 0.047925033467202144, "frac_reward_zero_std": 0.5, "grad_norm": 1.377305030822754, "learning_rate": 1.9320000000000003e-06, "loss": 0.011, "num_tokens": 404636.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.3724702596664429, "sampling/importance_sampling_ratio/mean": 1.001038670539856, "sampling/importance_sampling_ratio/min": 0.8019245266914368, "sampling/sampling_logp_difference/max": 0.31661224365234375, "sampling/sampling_logp_difference/mean": 0.006172348279505968, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 188.25, "completions/mean_terminated_length": 188.25, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.12593185156583786, "epoch": 0.04819277108433735, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9260000000000003e-06, "loss": 0.0, "num_tokens": 407378.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2438749074935913, "sampling/importance_sampling_ratio/mean": 1.0004035234451294, "sampling/importance_sampling_ratio/min": 0.7642562985420227, "sampling/sampling_logp_difference/max": 0.2688521146774292, "sampling/sampling_logp_difference/mean": 0.004781014751642942, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 150.25, "completions/mean_terminated_length": 150.25, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.22453744895756245, "epoch": 0.04846050870147256, "frac_reward_zero_std": 0.5, "grad_norm": 3.054194927215576, "learning_rate": 1.9200000000000003e-06, "loss": 0.0323, "num_tokens": 409812.0, "reward": 0.625, "reward_std": 0.4330126941204071, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 0.6943650841712952, "sampling/importance_sampling_ratio/max": 1.2631661891937256, "sampling/importance_sampling_ratio/mean": 0.9990540742874146, "sampling/importance_sampling_ratio/min": 0.6897914409637451, "sampling/sampling_logp_difference/max": 0.371366024017334, "sampling/sampling_logp_difference/mean": 0.008767911233007908, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 179.875, "completions/mean_terminated_length": 179.875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.14376072958111763, "epoch": 0.04872824631860776, "frac_reward_zero_std": 0.5, "grad_norm": 1.0632572174072266, "learning_rate": 1.9140000000000002e-06, "loss": 0.0038, "num_tokens": 412311.0, "reward": 0.8187500238418579, "reward_std": 0.16504418849945068, "rewards/reward_fn/mean": 0.8187500238418579, "rewards/reward_fn/std": 0.29024311900138855, "sampling/importance_sampling_ratio/max": 1.5218056440353394, "sampling/importance_sampling_ratio/mean": 1.0007307529449463, "sampling/importance_sampling_ratio/min": 0.7701411247253418, "sampling/sampling_logp_difference/max": 0.41989755630493164, "sampling/sampling_logp_difference/mean": 0.00804669689387083, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 273.0, "completions/mean_terminated_length": 273.0, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.30245892982929945, "epoch": 0.04899598393574297, "frac_reward_zero_std": 0.0, "grad_norm": 1.3696372509002686, "learning_rate": 1.908e-06, "loss": 0.1582, "num_tokens": 415767.0, "reward": -0.4375, "reward_std": 0.9732423424720764, "rewards/reward_fn/mean": -0.4375, "rewards/reward_fn/std": 0.9038608074188232, "sampling/importance_sampling_ratio/max": 1.6243844032287598, "sampling/importance_sampling_ratio/mean": 0.9994616508483887, "sampling/importance_sampling_ratio/min": 0.721356213092804, "sampling/sampling_logp_difference/max": 0.48512887954711914, "sampling/sampling_logp_difference/mean": 0.007613599766045809, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 91.5, "completions/mean_terminated_length": 91.5, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.1733332071453333, "epoch": 0.04926372155287818, "frac_reward_zero_std": 0.5, "grad_norm": 2.360130786895752, "learning_rate": 1.9020000000000002e-06, "loss": 0.1369, "num_tokens": 417535.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.2598021030426025, "sampling/importance_sampling_ratio/mean": 1.0022797584533691, "sampling/importance_sampling_ratio/min": 0.8836413621902466, "sampling/sampling_logp_difference/max": 0.23095464706420898, "sampling/sampling_logp_difference/mean": 0.006996835116297007, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 153.625, "completions/mean_terminated_length": 153.625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.17564447596669197, "epoch": 0.049531459170013385, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8960000000000001e-06, "loss": 0.0, "num_tokens": 420140.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2041821479797363, "sampling/importance_sampling_ratio/mean": 0.9996619820594788, "sampling/importance_sampling_ratio/min": 0.7053455114364624, "sampling/sampling_logp_difference/max": 0.34906744956970215, "sampling/sampling_logp_difference/mean": 0.007546583190560341, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 132.5, "completions/mean_terminated_length": 132.5, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2495424384251237, "epoch": 0.04979919678714859, "frac_reward_zero_std": 0.5, "grad_norm": 1.8864716291427612, "learning_rate": 1.8900000000000001e-06, "loss": 0.0118, "num_tokens": 422532.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.2728646993637085, "sampling/importance_sampling_ratio/mean": 0.9985761642456055, "sampling/importance_sampling_ratio/min": 0.6547141671180725, "sampling/sampling_logp_difference/max": 0.4235565662384033, "sampling/sampling_logp_difference/mean": 0.008066646754741669, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 296.875, "completions/mean_terminated_length": 296.875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.20992303546518087, "epoch": 0.0500669344042838, "frac_reward_zero_std": 0.0, "grad_norm": 1.30056631565094, "learning_rate": 1.884e-06, "loss": -0.0142, "num_tokens": 426459.0, "reward": 0.25, "reward_std": 0.75, "rewards/reward_fn/mean": 0.25, "rewards/reward_fn/std": 0.8017837405204773, "sampling/importance_sampling_ratio/max": 1.2471423149108887, "sampling/importance_sampling_ratio/mean": 0.9999903440475464, "sampling/importance_sampling_ratio/min": 0.7977421879768372, "sampling/sampling_logp_difference/max": 0.22596979141235352, "sampling/sampling_logp_difference/mean": 0.007151725236326456, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 225.0, "completions/mean_terminated_length": 225.0, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.3369969381019473, "epoch": 0.05033467202141901, "frac_reward_zero_std": 0.0, "grad_norm": 1.985388994216919, "learning_rate": 1.878e-06, "loss": -0.1078, "num_tokens": 429883.0, "reward": 0.15625, "reward_std": 0.7920478582382202, "rewards/reward_fn/mean": 0.15625, "rewards/reward_fn/std": 0.7432734370231628, "sampling/importance_sampling_ratio/max": 1.3274774551391602, "sampling/importance_sampling_ratio/mean": 0.9989834427833557, "sampling/importance_sampling_ratio/min": 0.5080180764198303, "sampling/sampling_logp_difference/max": 0.6772382259368896, "sampling/sampling_logp_difference/mean": 0.009781278669834137, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 162.125, "completions/mean_terminated_length": 162.125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.22908513620495796, "epoch": 0.05060240963855422, "frac_reward_zero_std": 0.5, "grad_norm": 1.0999587774276733, "learning_rate": 1.872e-06, "loss": -0.005, "num_tokens": 432516.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 0.7071067690849304, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006202459335327, "sampling/importance_sampling_ratio/min": 0.7790079712867737, "sampling/sampling_logp_difference/max": 1.2620112895965576, "sampling/sampling_logp_difference/mean": 0.008702249266207218, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 143.125, "completions/mean_terminated_length": 143.125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.15932972449809313, "epoch": 0.050870147255689425, "frac_reward_zero_std": 0.5, "grad_norm": 2.233430862426758, "learning_rate": 1.866e-06, "loss": 0.1401, "num_tokens": 434817.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 0.7071067690849304, "sampling/importance_sampling_ratio/max": 1.1661478281021118, "sampling/importance_sampling_ratio/mean": 0.9993828535079956, "sampling/importance_sampling_ratio/min": 0.6685177087783813, "sampling/sampling_logp_difference/max": 0.4026923179626465, "sampling/sampling_logp_difference/mean": 0.006852747406810522, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 171.125, "completions/mean_terminated_length": 171.125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.18045215867459774, "epoch": 0.051137884872824634, "frac_reward_zero_std": 0.0, "grad_norm": 2.198357582092285, "learning_rate": 1.86e-06, "loss": 0.2737, "num_tokens": 437426.0, "reward": 0.5625, "reward_std": 0.875, "rewards/reward_fn/mean": 0.5625, "rewards/reward_fn/std": 0.8210403323173523, "sampling/importance_sampling_ratio/max": 1.3387577533721924, "sampling/importance_sampling_ratio/mean": 0.9994440674781799, "sampling/importance_sampling_ratio/min": 0.7586665153503418, "sampling/sampling_logp_difference/max": 0.29174208641052246, "sampling/sampling_logp_difference/mean": 0.00650465814396739, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 173.75, "completions/mean_terminated_length": 173.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.14669587183743715, "epoch": 0.05140562248995984, "frac_reward_zero_std": 0.5, "grad_norm": 1.128441572189331, "learning_rate": 1.854e-06, "loss": 0.1731, "num_tokens": 439936.0, "reward": 0.5625, "reward_std": 0.5153881907463074, "rewards/reward_fn/mean": 0.5625, "rewards/reward_fn/std": 0.8210403323173523, "sampling/importance_sampling_ratio/max": 1.203328013420105, "sampling/importance_sampling_ratio/mean": 0.9998972415924072, "sampling/importance_sampling_ratio/min": 0.8285154104232788, "sampling/sampling_logp_difference/max": 0.18811988830566406, "sampling/sampling_logp_difference/mean": 0.005137268453836441, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 113.0, "completions/max_terminated_length": 113.0, "completions/mean_length": 100.25, "completions/mean_terminated_length": 100.25, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.09449647273868322, "epoch": 0.05167336010709505, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.848e-06, "loss": 0.0, "num_tokens": 441842.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.0898371934890747, "sampling/importance_sampling_ratio/mean": 1.0000016689300537, "sampling/importance_sampling_ratio/min": 0.8981171250343323, "sampling/sampling_logp_difference/max": 0.10745477676391602, "sampling/sampling_logp_difference/mean": 0.002759030321612954, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 111.125, "completions/mean_terminated_length": 111.125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.13560949638485909, "epoch": 0.05194109772423026, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8420000000000001e-06, "loss": 0.0, "num_tokens": 443855.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1419775485992432, "sampling/importance_sampling_ratio/mean": 1.0005353689193726, "sampling/importance_sampling_ratio/min": 0.888958215713501, "sampling/sampling_logp_difference/max": 0.13276147842407227, "sampling/sampling_logp_difference/mean": 0.005092698149383068, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 129.75, "completions/mean_terminated_length": 129.75, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.11229478102177382, "epoch": 0.05220883534136546, "frac_reward_zero_std": 0.5, "grad_norm": 1.7758033275604248, "learning_rate": 1.836e-06, "loss": 0.1158, "num_tokens": 446033.0, "reward": 0.5625, "reward_std": 0.5153881907463074, "rewards/reward_fn/mean": 0.5625, "rewards/reward_fn/std": 0.8210403323173523, "sampling/importance_sampling_ratio/max": 1.4780118465423584, "sampling/importance_sampling_ratio/mean": 1.0000649690628052, "sampling/importance_sampling_ratio/min": 0.7794687747955322, "sampling/sampling_logp_difference/max": 0.3906978368759155, "sampling/sampling_logp_difference/mean": 0.004935659002512693, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 166.75, "completions/mean_terminated_length": 166.75, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.16254311380907893, "epoch": 0.05247657295850067, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.83e-06, "loss": 0.0, "num_tokens": 448527.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1245315074920654, "sampling/importance_sampling_ratio/mean": 0.9989565014839172, "sampling/importance_sampling_ratio/min": 0.8537182807922363, "sampling/sampling_logp_difference/max": 0.15815401077270508, "sampling/sampling_logp_difference/mean": 0.004565043840557337, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 133.125, "completions/mean_terminated_length": 133.125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.22793197073042393, "epoch": 0.052744310575635875, "frac_reward_zero_std": 0.5, "grad_norm": 1.4934121370315552, "learning_rate": 1.824e-06, "loss": -0.0953, "num_tokens": 450672.0, "reward": 0.75, "reward_std": 0.3535533845424652, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 0.5345224738121033, "sampling/importance_sampling_ratio/max": 1.2362028360366821, "sampling/importance_sampling_ratio/mean": 0.9991835951805115, "sampling/importance_sampling_ratio/min": 0.7866268754005432, "sampling/sampling_logp_difference/max": 0.24000120162963867, "sampling/sampling_logp_difference/mean": 0.008309881202876568, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 134.125, "completions/mean_terminated_length": 134.125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.18533807806670666, "epoch": 0.05301204819277108, "frac_reward_zero_std": 0.0, "grad_norm": 2.3772056102752686, "learning_rate": 1.818e-06, "loss": -0.1018, "num_tokens": 453005.0, "reward": 0.71875, "reward_std": 0.5625, "rewards/reward_fn/mean": 0.71875, "rewards/reward_fn/std": 0.5580178499221802, "sampling/importance_sampling_ratio/max": 1.199474573135376, "sampling/importance_sampling_ratio/mean": 0.9997886419296265, "sampling/importance_sampling_ratio/min": 0.8903831839561462, "sampling/sampling_logp_difference/max": 0.1818835735321045, "sampling/sampling_logp_difference/mean": 0.006151249166578054, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 149.125, "completions/mean_terminated_length": 149.125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.16554655320942402, "epoch": 0.05327978580990629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.812e-06, "loss": 0.0, "num_tokens": 455350.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1237701177597046, "sampling/importance_sampling_ratio/mean": 1.0002704858779907, "sampling/importance_sampling_ratio/min": 0.836478590965271, "sampling/sampling_logp_difference/max": 0.17855429649353027, "sampling/sampling_logp_difference/mean": 0.005652608349919319, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 201.0, "completions/mean_terminated_length": 201.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.20059594605118036, "epoch": 0.0535475234270415, "frac_reward_zero_std": 0.5, "grad_norm": 0.9740797281265259, "learning_rate": 1.806e-06, "loss": 0.3176, "num_tokens": 458090.0, "reward": 0.5, "reward_std": 0.5773502588272095, "rewards/reward_fn/mean": 0.5, "rewards/reward_fn/std": 0.9258201122283936, "sampling/importance_sampling_ratio/max": 1.1676054000854492, "sampling/importance_sampling_ratio/mean": 0.9997439980506897, "sampling/importance_sampling_ratio/min": 0.8131464719772339, "sampling/sampling_logp_difference/max": 0.20684409141540527, "sampling/sampling_logp_difference/mean": 0.006033932790160179, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 190.0, "completions/mean_terminated_length": 190.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.19494741596281528, "epoch": 0.05381526104417671, "frac_reward_zero_std": 0.5, "grad_norm": 2.217597007751465, "learning_rate": 1.8e-06, "loss": 0.0339, "num_tokens": 460798.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.1937893629074097, "sampling/importance_sampling_ratio/mean": 1.0003128051757812, "sampling/importance_sampling_ratio/min": 0.7874367833137512, "sampling/sampling_logp_difference/max": 0.23897218704223633, "sampling/sampling_logp_difference/mean": 0.0060877627693116665, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 709.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 286.25, "completions/mean_terminated_length": 286.25, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.13882583286613226, "epoch": 0.054082998661311915, "frac_reward_zero_std": 0.5, "grad_norm": 1.0446916818618774, "learning_rate": 1.7939999999999999e-06, "loss": 0.3959, "num_tokens": 464428.0, "reward": 0.125, "reward_std": 0.14433756470680237, "rewards/reward_fn/mean": 0.125, "rewards/reward_fn/std": 0.9543135166168213, "sampling/importance_sampling_ratio/max": 1.1894177198410034, "sampling/importance_sampling_ratio/mean": 0.9996605515480042, "sampling/importance_sampling_ratio/min": 0.7030169367790222, "sampling/sampling_logp_difference/max": 0.3523743152618408, "sampling/sampling_logp_difference/mean": 0.004058383405208588, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 271.25, "completions/mean_terminated_length": 271.25, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.42140844091773033, "epoch": 0.05435073627844712, "frac_reward_zero_std": 0.0, "grad_norm": 1.9301598072052002, "learning_rate": 1.7879999999999999e-06, "loss": 0.2022, "num_tokens": 467838.0, "reward": 0.375, "reward_std": 0.625, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.7791936993598938, "sampling/importance_sampling_ratio/max": 1.3211241960525513, "sampling/importance_sampling_ratio/mean": 0.999210000038147, "sampling/importance_sampling_ratio/min": 0.7939528226852417, "sampling/sampling_logp_difference/max": 0.2784830331802368, "sampling/sampling_logp_difference/mean": 0.011777552776038647, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 145.5, "completions/mean_terminated_length": 145.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.11766148265451193, "epoch": 0.05461847389558233, "frac_reward_zero_std": 0.5, "grad_norm": 1.3052304983139038, "learning_rate": 1.782e-06, "loss": -0.0592, "num_tokens": 470114.0, "reward": 0.9375, "reward_std": 0.125, "rewards/reward_fn/mean": 0.9375, "rewards/reward_fn/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 1.1783447265625, "sampling/importance_sampling_ratio/mean": 0.9994886517524719, "sampling/importance_sampling_ratio/min": 0.8662769198417664, "sampling/sampling_logp_difference/max": 0.16411066055297852, "sampling/sampling_logp_difference/mean": 0.0037222786340862513, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 161.5, "completions/mean_terminated_length": 161.5, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.17434159200638533, "epoch": 0.05488621151271754, "frac_reward_zero_std": 0.5, "grad_norm": 1.2709463834762573, "learning_rate": 1.776e-06, "loss": 0.1511, "num_tokens": 472558.0, "reward": 0.5, "reward_std": 0.5773502588272095, "rewards/reward_fn/mean": 0.5, "rewards/reward_fn/std": 0.9258201122283936, "sampling/importance_sampling_ratio/max": 1.2267013788223267, "sampling/importance_sampling_ratio/mean": 0.9995797276496887, "sampling/importance_sampling_ratio/min": 0.7808828353881836, "sampling/sampling_logp_difference/max": 0.2473301887512207, "sampling/sampling_logp_difference/mean": 0.006734929047524929, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 173.5, "completions/mean_terminated_length": 173.5, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.14746389538049698, "epoch": 0.05515394912985275, "frac_reward_zero_std": 0.0, "grad_norm": 1.9019854068756104, "learning_rate": 1.77e-06, "loss": 0.0239, "num_tokens": 475210.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_fn/mean": 0.875, "rewards/reward_fn/std": 0.2314550280570984, "sampling/importance_sampling_ratio/max": 1.3390800952911377, "sampling/importance_sampling_ratio/mean": 1.0016798973083496, "sampling/importance_sampling_ratio/min": 0.8553012609481812, "sampling/sampling_logp_difference/max": 0.29198288917541504, "sampling/sampling_logp_difference/mean": 0.005372337065637112, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 185.375, "completions/mean_terminated_length": 185.375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.1620370652526617, "epoch": 0.05542168674698795, "frac_reward_zero_std": 0.5, "grad_norm": 0.975816547870636, "learning_rate": 1.764e-06, "loss": -0.0562, "num_tokens": 477901.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 0.7071067690849304, "sampling/importance_sampling_ratio/max": 1.2823892831802368, "sampling/importance_sampling_ratio/mean": 0.9998041987419128, "sampling/importance_sampling_ratio/min": 0.8479940295219421, "sampling/sampling_logp_difference/max": 0.24872493743896484, "sampling/sampling_logp_difference/mean": 0.005501179955899715, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 133.625, "completions/mean_terminated_length": 133.625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.11755446903407574, "epoch": 0.055689424364123156, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.758e-06, "loss": 0.0, "num_tokens": 480182.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1500084400177002, "sampling/importance_sampling_ratio/mean": 1.000614047050476, "sampling/importance_sampling_ratio/min": 0.8049580454826355, "sampling/sampling_logp_difference/max": 0.21696507930755615, "sampling/sampling_logp_difference/mean": 0.005056788679212332, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 111.375, "completions/mean_terminated_length": 111.375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.1608228087425232, "epoch": 0.055957161981258365, "frac_reward_zero_std": 0.5, "grad_norm": 4.325232028961182, "learning_rate": 1.752e-06, "loss": -0.2189, "num_tokens": 482157.0, "reward": 0.625, "reward_std": 0.4330126941204071, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 0.6943650841712952, "sampling/importance_sampling_ratio/max": 1.282123327255249, "sampling/importance_sampling_ratio/mean": 0.9995783567428589, "sampling/importance_sampling_ratio/min": 0.7026236057281494, "sampling/sampling_logp_difference/max": 0.35293400287628174, "sampling/sampling_logp_difference/mean": 0.009472329169511795, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 111.25, "completions/mean_terminated_length": 111.25, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.18304255045950413, "epoch": 0.05622489959839357, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7459999999999999e-06, "loss": 0.0, "num_tokens": 484211.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.212424397468567, "sampling/importance_sampling_ratio/mean": 1.0004751682281494, "sampling/importance_sampling_ratio/min": 0.8234138488769531, "sampling/sampling_logp_difference/max": 0.19429636001586914, "sampling/sampling_logp_difference/mean": 0.006673275958746672, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 213.0, "completions/mean_terminated_length": 213.0, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.22142543271183968, "epoch": 0.05649263721552878, "frac_reward_zero_std": 0.0, "grad_norm": 1.4450099468231201, "learning_rate": 1.7399999999999999e-06, "loss": 0.1243, "num_tokens": 487439.0, "reward": -0.5, "reward_std": 0.5773502588272095, "rewards/reward_fn/mean": -0.5, "rewards/reward_fn/std": 0.6546536684036255, "sampling/importance_sampling_ratio/max": 1.2356390953063965, "sampling/importance_sampling_ratio/mean": 0.9989750981330872, "sampling/importance_sampling_ratio/min": 0.7114080190658569, "sampling/sampling_logp_difference/max": 0.34050917625427246, "sampling/sampling_logp_difference/mean": 0.007468825671821833, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 94.375, "completions/mean_terminated_length": 94.375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.09350183140486479, "epoch": 0.05676037483266399, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7339999999999998e-06, "loss": 0.0, "num_tokens": 489286.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2534582614898682, "sampling/importance_sampling_ratio/mean": 1.0008443593978882, "sampling/importance_sampling_ratio/min": 0.8552350401878357, "sampling/sampling_logp_difference/max": 0.2259063720703125, "sampling/sampling_logp_difference/mean": 0.004478222690522671, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 128.125, "completions/mean_terminated_length": 128.125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.1525287050753832, "epoch": 0.0570281124497992, "frac_reward_zero_std": 0.5, "grad_norm": 1.5707823038101196, "learning_rate": 1.728e-06, "loss": 0.0822, "num_tokens": 491547.0, "reward": 0.7875000238418579, "reward_std": 0.36142081022262573, "rewards/reward_fn/mean": 0.7875000238418579, "rewards/reward_fn/std": 0.5249149799346924, "sampling/importance_sampling_ratio/max": 1.1554964780807495, "sampling/importance_sampling_ratio/mean": 0.9997482299804688, "sampling/importance_sampling_ratio/min": 0.8723042011260986, "sampling/sampling_logp_difference/max": 0.1445300579071045, "sampling/sampling_logp_difference/mean": 0.005027392413467169, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 97.0, "completions/max_terminated_length": 97.0, "completions/mean_length": 80.0, "completions/mean_terminated_length": 80.0, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.10039297165349126, "epoch": 0.057295850066934405, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.722e-06, "loss": 0.0, "num_tokens": 493239.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1255422830581665, "sampling/importance_sampling_ratio/mean": 0.9997009634971619, "sampling/importance_sampling_ratio/min": 0.7212033271789551, "sampling/sampling_logp_difference/max": 0.32683420181274414, "sampling/sampling_logp_difference/mean": 0.004127423744648695, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 390.5, "completions/mean_terminated_length": 153.71429443359375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.14686391642317176, "epoch": 0.05756358768406961, "frac_reward_zero_std": 0.0, "grad_norm": 1.2741726636886597, "learning_rate": 1.716e-06, "loss": 0.9027, "num_tokens": 497827.0, "reward": 0.375, "reward_std": 0.9330127239227295, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.876274585723877, "sampling/importance_sampling_ratio/max": 1.2853373289108276, "sampling/importance_sampling_ratio/mean": 0.9996297359466553, "sampling/importance_sampling_ratio/min": 0.6879914999008179, "sampling/sampling_logp_difference/max": 0.373978853225708, "sampling/sampling_logp_difference/mean": 0.005864859092980623, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 152.875, "completions/mean_terminated_length": 152.875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.12656874861568213, "epoch": 0.05783132530120482, "frac_reward_zero_std": 0.0, "grad_norm": 1.697898268699646, "learning_rate": 1.71e-06, "loss": -0.0425, "num_tokens": 500174.0, "reward": 0.7875000238418579, "reward_std": 0.42500001192092896, "rewards/reward_fn/mean": 0.7875000238418579, "rewards/reward_fn/std": 0.5249149799346924, "sampling/importance_sampling_ratio/max": 1.4526164531707764, "sampling/importance_sampling_ratio/mean": 0.9999053478240967, "sampling/importance_sampling_ratio/min": 0.8057124018669128, "sampling/sampling_logp_difference/max": 0.3733663558959961, "sampling/sampling_logp_difference/mean": 0.004169121850281954, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 211.875, "completions/mean_terminated_length": 211.875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.2233483735471964, "epoch": 0.05809906291834003, "frac_reward_zero_std": 0.5, "grad_norm": 1.062464952468872, "learning_rate": 1.704e-06, "loss": 0.1715, "num_tokens": 503161.0, "reward": 0.6875, "reward_std": 0.4732423424720764, "rewards/reward_fn/mean": 0.6875, "rewards/reward_fn/std": 0.7039430141448975, "sampling/importance_sampling_ratio/max": 1.2612603902816772, "sampling/importance_sampling_ratio/mean": 1.0002590417861938, "sampling/importance_sampling_ratio/min": 0.7937425971031189, "sampling/sampling_logp_difference/max": 0.23211145401000977, "sampling/sampling_logp_difference/mean": 0.007756231352686882, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 118.0, "completions/mean_terminated_length": 118.0, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.13996618799865246, "epoch": 0.05836680053547524, "frac_reward_zero_std": 0.5, "grad_norm": 2.35202693939209, "learning_rate": 1.6979999999999999e-06, "loss": 0.0438, "num_tokens": 505261.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 0.7071067690849304, "sampling/importance_sampling_ratio/max": 1.245059609413147, "sampling/importance_sampling_ratio/mean": 0.9999819397926331, "sampling/importance_sampling_ratio/min": 0.855168342590332, "sampling/sampling_logp_difference/max": 0.21918344497680664, "sampling/sampling_logp_difference/mean": 0.0061225006356835365, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 202.625, "completions/mean_terminated_length": 202.625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.14822505321353674, "epoch": 0.058634538152610445, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6919999999999999e-06, "loss": 0.0, "num_tokens": 508246.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.275173544883728, "sampling/importance_sampling_ratio/mean": 1.0003948211669922, "sampling/importance_sampling_ratio/min": 0.7462239265441895, "sampling/sampling_logp_difference/max": 0.29272961616516113, "sampling/sampling_logp_difference/mean": 0.006081519182771444, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 229.625, "completions/mean_terminated_length": 229.625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.1392422802746296, "epoch": 0.058902275769745646, "frac_reward_zero_std": 0.5, "grad_norm": 0.7881863117218018, "learning_rate": 1.6860000000000002e-06, "loss": 0.0444, "num_tokens": 511531.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.1860865354537964, "sampling/importance_sampling_ratio/mean": 0.9990138411521912, "sampling/importance_sampling_ratio/min": 0.8280349373817444, "sampling/sampling_logp_difference/max": 0.18869996070861816, "sampling/sampling_logp_difference/mean": 0.004599208943545818, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 191.0, "completions/mean_terminated_length": 191.0, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.19087077025324106, "epoch": 0.059170013386880854, "frac_reward_zero_std": 0.5, "grad_norm": 1.2096396684646606, "learning_rate": 1.6800000000000002e-06, "loss": 0.2441, "num_tokens": 514659.0, "reward": 0.375, "reward_std": 0.4330126941204071, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.876274585723877, "sampling/importance_sampling_ratio/max": 1.3123021125793457, "sampling/importance_sampling_ratio/mean": 1.0000944137573242, "sampling/importance_sampling_ratio/min": 0.7024246454238892, "sampling/sampling_logp_difference/max": 0.35321712493896484, "sampling/sampling_logp_difference/mean": 0.00735221104696393, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 216.375, "completions/mean_terminated_length": 216.375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.07204665942117572, "epoch": 0.05943775100401606, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6740000000000002e-06, "loss": 0.0, "num_tokens": 517578.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5102534294128418, "sampling/importance_sampling_ratio/mean": 1.00096595287323, "sampling/importance_sampling_ratio/min": 0.7972874045372009, "sampling/sampling_logp_difference/max": 0.4122774600982666, "sampling/sampling_logp_difference/mean": 0.0051913135685026646, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 175.875, "completions/mean_terminated_length": 175.875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.17150017898529768, "epoch": 0.05970548862115127, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6680000000000002e-06, "loss": 0.0, "num_tokens": 520117.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2133201360702515, "sampling/importance_sampling_ratio/mean": 0.9999736547470093, "sampling/importance_sampling_ratio/min": 0.6892745494842529, "sampling/sampling_logp_difference/max": 0.3721156120300293, "sampling/sampling_logp_difference/mean": 0.006038771476596594, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 109.25, "completions/mean_terminated_length": 109.25, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.13905080500990152, "epoch": 0.05997322623828648, "frac_reward_zero_std": 0.5, "grad_norm": 1.7118873596191406, "learning_rate": 1.6620000000000001e-06, "loss": -0.0522, "num_tokens": 522355.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.2761794328689575, "sampling/importance_sampling_ratio/mean": 1.0011992454528809, "sampling/importance_sampling_ratio/min": 0.8115411996841431, "sampling/sampling_logp_difference/max": 0.24387073516845703, "sampling/sampling_logp_difference/mean": 0.006338214036077261, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 151.375, "completions/mean_terminated_length": 151.375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.13835991267114878, "epoch": 0.060240963855421686, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6560000000000001e-06, "loss": 0.0, "num_tokens": 524930.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2582415342330933, "sampling/importance_sampling_ratio/mean": 0.99951171875, "sampling/importance_sampling_ratio/min": 0.8177075386047363, "sampling/sampling_logp_difference/max": 0.22971510887145996, "sampling/sampling_logp_difference/mean": 0.005438277497887611, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 102.625, "completions/mean_terminated_length": 102.625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.127193758264184, "epoch": 0.060508701472556894, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.65e-06, "loss": 0.0, "num_tokens": 526979.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2419331073760986, "sampling/importance_sampling_ratio/mean": 1.0013092756271362, "sampling/importance_sampling_ratio/min": 0.800597608089447, "sampling/sampling_logp_difference/max": 0.2223968505859375, "sampling/sampling_logp_difference/mean": 0.006357919424772263, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 135.625, "completions/mean_terminated_length": 135.625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.09478156408295035, "epoch": 0.0607764390896921, "frac_reward_zero_std": 0.5, "grad_norm": 1.6837663650512695, "learning_rate": 1.6440000000000003e-06, "loss": 0.2949, "num_tokens": 529168.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 0.7071067690849304, "sampling/importance_sampling_ratio/max": 1.2647678852081299, "sampling/importance_sampling_ratio/mean": 0.9994001388549805, "sampling/importance_sampling_ratio/min": 0.6969671845436096, "sampling/sampling_logp_difference/max": 0.36101698875427246, "sampling/sampling_logp_difference/mean": 0.004585161339491606, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 157.125, "completions/mean_terminated_length": 157.125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.1887792944908142, "epoch": 0.06104417670682731, "frac_reward_zero_std": 0.5, "grad_norm": 2.3043482303619385, "learning_rate": 1.6380000000000002e-06, "loss": -0.2618, "num_tokens": 531797.0, "reward": 0.5625, "reward_std": 0.375, "rewards/reward_fn/mean": 0.5625, "rewards/reward_fn/std": 0.6781013607978821, "sampling/importance_sampling_ratio/max": 1.27113938331604, "sampling/importance_sampling_ratio/mean": 1.0000073909759521, "sampling/importance_sampling_ratio/min": 0.7664273381233215, "sampling/sampling_logp_difference/max": 0.2660154104232788, "sampling/sampling_logp_difference/mean": 0.007665729615837336, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 235.0, "completions/mean_terminated_length": 235.0, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.13285671593621373, "epoch": 0.06131191432396252, "frac_reward_zero_std": 0.0, "grad_norm": 1.199474811553955, "learning_rate": 1.6320000000000002e-06, "loss": -0.1923, "num_tokens": 534913.0, "reward": 0.5, "reward_std": 0.6403881907463074, "rewards/reward_fn/mean": 0.5, "rewards/reward_fn/std": 0.8017837405204773, "sampling/importance_sampling_ratio/max": 1.2840756177902222, "sampling/importance_sampling_ratio/mean": 1.0007753372192383, "sampling/importance_sampling_ratio/min": 0.8309637904167175, "sampling/sampling_logp_difference/max": 0.25003910064697266, "sampling/sampling_logp_difference/mean": 0.004654538352042437, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 236.625, "completions/mean_terminated_length": 236.625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.10630225297063589, "epoch": 0.06157965194109773, "frac_reward_zero_std": 0.5, "grad_norm": 0.886779248714447, "learning_rate": 1.6260000000000002e-06, "loss": 0.0662, "num_tokens": 538090.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 0.7071067690849304, "sampling/importance_sampling_ratio/max": 1.1418272256851196, "sampling/importance_sampling_ratio/mean": 0.9993942379951477, "sampling/importance_sampling_ratio/min": 0.6065549850463867, "sampling/sampling_logp_difference/max": 0.49995994567871094, "sampling/sampling_logp_difference/mean": 0.0049164132215082645, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 141.5, "completions/mean_terminated_length": 141.5, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.11815956141799688, "epoch": 0.061847389558232935, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6200000000000002e-06, "loss": 0.0, "num_tokens": 540422.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1245213747024536, "sampling/importance_sampling_ratio/mean": 0.9997545480728149, "sampling/importance_sampling_ratio/min": 0.7355735301971436, "sampling/sampling_logp_difference/max": 0.30710482597351074, "sampling/sampling_logp_difference/mean": 0.004456756170839071, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 136.625, "completions/mean_terminated_length": 136.625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.14277100004255772, "epoch": 0.062115127175368136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6140000000000001e-06, "loss": 0.0, "num_tokens": 542663.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1938289403915405, "sampling/importance_sampling_ratio/mean": 0.9986977577209473, "sampling/importance_sampling_ratio/min": 0.7788282632827759, "sampling/sampling_logp_difference/max": 0.24996471405029297, "sampling/sampling_logp_difference/mean": 0.007078282535076141, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 97.0, "completions/mean_terminated_length": 97.0, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.08931392524391413, "epoch": 0.062382864792503344, "frac_reward_zero_std": 0.5, "grad_norm": 2.3963754177093506, "learning_rate": 1.608e-06, "loss": -0.1035, "num_tokens": 544627.0, "reward": 0.90625, "reward_std": 0.1875, "rewards/reward_fn/mean": 0.90625, "rewards/reward_fn/std": 0.2651650309562683, "sampling/importance_sampling_ratio/max": 1.4431791305541992, "sampling/importance_sampling_ratio/mean": 1.0010331869125366, "sampling/importance_sampling_ratio/min": 0.7189193964004517, "sampling/sampling_logp_difference/max": 0.3668484687805176, "sampling/sampling_logp_difference/mean": 0.005404102150350809, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 107.375, "completions/mean_terminated_length": 107.375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.14731685351580381, "epoch": 0.06265060240963856, "frac_reward_zero_std": 0.5, "grad_norm": 2.0598363876342773, "learning_rate": 1.602e-06, "loss": -0.0038, "num_tokens": 546634.0, "reward": 0.9375, "reward_std": 0.125, "rewards/reward_fn/mean": 0.9375, "rewards/reward_fn/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 1.2264916896820068, "sampling/importance_sampling_ratio/mean": 1.000298261642456, "sampling/importance_sampling_ratio/min": 0.8152910470962524, "sampling/sampling_logp_difference/max": 0.2042100429534912, "sampling/sampling_logp_difference/mean": 0.006380253471434116, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 109.5, "completions/mean_terminated_length": 109.5, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.1140655754134059, "epoch": 0.06291834002677377, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.596e-06, "loss": 0.0, "num_tokens": 548754.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2659380435943604, "sampling/importance_sampling_ratio/mean": 1.0000096559524536, "sampling/importance_sampling_ratio/min": 0.815159261226654, "sampling/sampling_logp_difference/max": 0.23581337928771973, "sampling/sampling_logp_difference/mean": 0.005406687967479229, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 103.5, "completions/mean_terminated_length": 103.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.13682924956083298, "epoch": 0.06318607764390897, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.59e-06, "loss": 0.0, "num_tokens": 550690.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3944960832595825, "sampling/importance_sampling_ratio/mean": 0.9999586939811707, "sampling/importance_sampling_ratio/min": 0.7681770920753479, "sampling/sampling_logp_difference/max": 0.3325331211090088, "sampling/sampling_logp_difference/mean": 0.00858171284198761, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 121.625, "completions/mean_terminated_length": 121.625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.0518215443007648, "epoch": 0.06345381526104418, "frac_reward_zero_std": 0.5, "grad_norm": 1.2869410514831543, "learning_rate": 1.5840000000000002e-06, "loss": -0.0569, "num_tokens": 552907.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 0.7071067690849304, "sampling/importance_sampling_ratio/max": 1.2667895555496216, "sampling/importance_sampling_ratio/mean": 1.0007007122039795, "sampling/importance_sampling_ratio/min": 0.8756467700004578, "sampling/sampling_logp_difference/max": 0.23648583889007568, "sampling/sampling_logp_difference/mean": 0.0032285640481859446, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 148.375, "completions/mean_terminated_length": 148.375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.11561266612261534, "epoch": 0.06372155287817939, "frac_reward_zero_std": 0.5, "grad_norm": 2.1914520263671875, "learning_rate": 1.5780000000000002e-06, "loss": 0.087, "num_tokens": 555314.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.2731958627700806, "sampling/importance_sampling_ratio/mean": 1.0005236864089966, "sampling/importance_sampling_ratio/min": 0.7963707447052002, "sampling/sampling_logp_difference/max": 0.241530179977417, "sampling/sampling_logp_difference/mean": 0.006102397572249174, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 147.375, "completions/mean_terminated_length": 147.375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.13926818408071995, "epoch": 0.06398929049531459, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5720000000000002e-06, "loss": 0.0, "num_tokens": 557789.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.377425193786621, "sampling/importance_sampling_ratio/mean": 1.0003737211227417, "sampling/importance_sampling_ratio/min": 0.8148226141929626, "sampling/sampling_logp_difference/max": 0.32021594047546387, "sampling/sampling_logp_difference/mean": 0.006641722284257412, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 171.25, "completions/mean_terminated_length": 171.25, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.14518126752227545, "epoch": 0.0642570281124498, "frac_reward_zero_std": 0.5, "grad_norm": 1.2161062955856323, "learning_rate": 1.5660000000000001e-06, "loss": -0.0083, "num_tokens": 560463.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.1273144483566284, "sampling/importance_sampling_ratio/mean": 0.9999881982803345, "sampling/importance_sampling_ratio/min": 0.8365118503570557, "sampling/sampling_logp_difference/max": 0.17851459980010986, "sampling/sampling_logp_difference/mean": 0.00490223616361618, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 179.5, "completions/mean_terminated_length": 179.5, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.19748284574598074, "epoch": 0.064524765729585, "frac_reward_zero_std": 0.0, "grad_norm": 2.309004068374634, "learning_rate": 1.56e-06, "loss": -0.1305, "num_tokens": 563259.0, "reward": 0.5, "reward_std": 0.6403881907463074, "rewards/reward_fn/mean": 0.5, "rewards/reward_fn/std": 0.8017837405204773, "sampling/importance_sampling_ratio/max": 1.225052833557129, "sampling/importance_sampling_ratio/mean": 0.9994854927062988, "sampling/importance_sampling_ratio/min": 0.5106779932975769, "sampling/sampling_logp_difference/max": 0.6720160245895386, "sampling/sampling_logp_difference/mean": 0.007922729477286339, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 143.75, "completions/mean_terminated_length": 143.75, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.1118714464828372, "epoch": 0.06479250334672021, "frac_reward_zero_std": 0.0, "grad_norm": 2.9337830543518066, "learning_rate": 1.554e-06, "loss": 0.1065, "num_tokens": 565593.0, "reward": 0.5, "reward_std": 1.0, "rewards/reward_fn/mean": 0.5, "rewards/reward_fn/std": 0.9258201122283936, "sampling/importance_sampling_ratio/max": 1.5026218891143799, "sampling/importance_sampling_ratio/mean": 1.0009779930114746, "sampling/importance_sampling_ratio/min": 0.8129540681838989, "sampling/sampling_logp_difference/max": 0.4072115421295166, "sampling/sampling_logp_difference/mean": 0.006929077673703432, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 167.625, "completions/mean_terminated_length": 167.625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.153987274505198, "epoch": 0.06506024096385542, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.548e-06, "loss": 0.0, "num_tokens": 568190.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2550581693649292, "sampling/importance_sampling_ratio/mean": 0.9992998838424683, "sampling/importance_sampling_ratio/min": 0.6212185621261597, "sampling/sampling_logp_difference/max": 0.4760723114013672, "sampling/sampling_logp_difference/mean": 0.008801111951470375, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 133.5, "completions/mean_terminated_length": 133.5, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.08470105449669063, "epoch": 0.06532797858099063, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.542e-06, "loss": 0.0, "num_tokens": 570422.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1683592796325684, "sampling/importance_sampling_ratio/mean": 1.0001248121261597, "sampling/importance_sampling_ratio/min": 0.7920904755592346, "sampling/sampling_logp_difference/max": 0.2330796718597412, "sampling/sampling_logp_difference/mean": 0.0038931467570364475, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 145.25, "completions/mean_terminated_length": 145.25, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.1432082299143076, "epoch": 0.06559571619812583, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.536e-06, "loss": 0.0, "num_tokens": 572748.0, "reward": 0.25, "reward_std": 0.0, "rewards/reward_fn/mean": 0.25, "rewards/reward_fn/std": 0.8017837405204773, "sampling/importance_sampling_ratio/max": 1.427609920501709, "sampling/importance_sampling_ratio/mean": 1.0008013248443604, "sampling/importance_sampling_ratio/min": 0.7920993566513062, "sampling/sampling_logp_difference/max": 0.356001615524292, "sampling/sampling_logp_difference/mean": 0.00648135831579566, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 127.25, "completions/mean_terminated_length": 127.25, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.11192059726454318, "epoch": 0.06586345381526104, "frac_reward_zero_std": 0.0, "grad_norm": 2.57303786277771, "learning_rate": 1.53e-06, "loss": -0.0363, "num_tokens": 574998.0, "reward": 0.71875, "reward_std": 0.5625, "rewards/reward_fn/mean": 0.71875, "rewards/reward_fn/std": 0.5580178499221802, "sampling/importance_sampling_ratio/max": 1.2205551862716675, "sampling/importance_sampling_ratio/mean": 0.9991847276687622, "sampling/importance_sampling_ratio/min": 0.785285234451294, "sampling/sampling_logp_difference/max": 0.24170827865600586, "sampling/sampling_logp_difference/mean": 0.005943233612924814, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 223.25, "completions/mean_terminated_length": 223.25, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.12646856578066945, "epoch": 0.06613119143239625, "frac_reward_zero_std": 0.0, "grad_norm": 1.6968319416046143, "learning_rate": 1.5240000000000001e-06, "loss": 0.0232, "num_tokens": 578200.0, "reward": 0.4124999940395355, "reward_std": 0.2665063440799713, "rewards/reward_fn/mean": 0.4124999940395355, "rewards/reward_fn/std": 0.644066333770752, "sampling/importance_sampling_ratio/max": 1.80094313621521, "sampling/importance_sampling_ratio/mean": 1.0003424882888794, "sampling/importance_sampling_ratio/min": 0.6808249950408936, "sampling/sampling_logp_difference/max": 0.5883104801177979, "sampling/sampling_logp_difference/mean": 0.006286430172622204, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 147.125, "completions/mean_terminated_length": 147.125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.09329359047114849, "epoch": 0.06639892904953146, "frac_reward_zero_std": 0.5, "grad_norm": 1.1935607194900513, "learning_rate": 1.5180000000000001e-06, "loss": 0.1061, "num_tokens": 580569.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 0.7071067690849304, "sampling/importance_sampling_ratio/max": 1.1531236171722412, "sampling/importance_sampling_ratio/mean": 1.0003776550292969, "sampling/importance_sampling_ratio/min": 0.8883009552955627, "sampling/sampling_logp_difference/max": 0.14247441291809082, "sampling/sampling_logp_difference/mean": 0.0037300633266568184, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 121.5, "completions/mean_terminated_length": 121.5, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.09581447020173073, "epoch": 0.06666666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.512e-06, "loss": 0.0, "num_tokens": 582625.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2834054231643677, "sampling/importance_sampling_ratio/mean": 1.0002756118774414, "sampling/importance_sampling_ratio/min": 0.8082122802734375, "sampling/sampling_logp_difference/max": 0.24951696395874023, "sampling/sampling_logp_difference/mean": 0.004763828124850988, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 119.5, "completions/mean_terminated_length": 119.5, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.0923376688733697, "epoch": 0.06693440428380187, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.506e-06, "loss": 0.0, "num_tokens": 584793.0, "reward": 0.25, "reward_std": 0.0, "rewards/reward_fn/mean": 0.25, "rewards/reward_fn/std": 0.8017837405204773, "sampling/importance_sampling_ratio/max": 1.2300126552581787, "sampling/importance_sampling_ratio/mean": 0.9993497729301453, "sampling/importance_sampling_ratio/min": 0.7876134514808655, "sampling/sampling_logp_difference/max": 0.23874789476394653, "sampling/sampling_logp_difference/mean": 0.005852228961884975, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 155.75, "completions/mean_terminated_length": 155.75, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.05342700076289475, "epoch": 0.06720214190093708, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5e-06, "loss": 0.0, "num_tokens": 587303.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.283157467842102, "sampling/importance_sampling_ratio/mean": 1.0002782344818115, "sampling/importance_sampling_ratio/min": 0.7613213062286377, "sampling/sampling_logp_difference/max": 0.27269983291625977, "sampling/sampling_logp_difference/mean": 0.004628194496035576, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 98.5, "completions/mean_terminated_length": 98.5, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.05418249010108411, "epoch": 0.06746987951807229, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.494e-06, "loss": 0.0, "num_tokens": 589263.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2416356801986694, "sampling/importance_sampling_ratio/mean": 1.0005378723144531, "sampling/importance_sampling_ratio/min": 0.7852837443351746, "sampling/sampling_logp_difference/max": 0.24171018600463867, "sampling/sampling_logp_difference/mean": 0.004614707548171282, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 151.625, "completions/mean_terminated_length": 151.625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.08494792645797133, "epoch": 0.0677376171352075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.488e-06, "loss": 0.0, "num_tokens": 591708.0, "reward": 0.25, "reward_std": 0.0, "rewards/reward_fn/mean": 0.25, "rewards/reward_fn/std": 0.8017837405204773, "sampling/importance_sampling_ratio/max": 1.2442560195922852, "sampling/importance_sampling_ratio/mean": 1.0000336170196533, "sampling/importance_sampling_ratio/min": 0.8115481734275818, "sampling/sampling_logp_difference/max": 0.2185378074645996, "sampling/sampling_logp_difference/mean": 0.003974593244493008, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 190.25, "completions/mean_terminated_length": 190.25, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.10195138910785317, "epoch": 0.0680053547523427, "frac_reward_zero_std": 0.0, "grad_norm": 2.0339579582214355, "learning_rate": 1.482e-06, "loss": 0.0338, "num_tokens": 594446.0, "reward": 0.32499998807907104, "reward_std": 0.7412333488464355, "rewards/reward_fn/mean": 0.32499998807907104, "rewards/reward_fn/std": 0.7005100250244141, "sampling/importance_sampling_ratio/max": 1.3078724145889282, "sampling/importance_sampling_ratio/mean": 1.0002086162567139, "sampling/importance_sampling_ratio/min": 0.7997437119483948, "sampling/sampling_logp_difference/max": 0.26840174198150635, "sampling/sampling_logp_difference/mean": 0.00568844610825181, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 167.25, "completions/mean_terminated_length": 167.25, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.054271628614515066, "epoch": 0.06827309236947791, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.476e-06, "loss": 0.0, "num_tokens": 597016.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.384028434753418, "sampling/importance_sampling_ratio/mean": 1.0000503063201904, "sampling/importance_sampling_ratio/min": 0.7876134514808655, "sampling/sampling_logp_difference/max": 0.3249983787536621, "sampling/sampling_logp_difference/mean": 0.002837902633473277, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 115.625, "completions/mean_terminated_length": 115.625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.03221141954418272, "epoch": 0.06854082998661312, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4700000000000001e-06, "loss": 0.0, "num_tokens": 599261.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1047781705856323, "sampling/importance_sampling_ratio/mean": 0.9990895986557007, "sampling/importance_sampling_ratio/min": 0.7832984328269958, "sampling/sampling_logp_difference/max": 0.24424147605895996, "sampling/sampling_logp_difference/mean": 0.0021623659413307905, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 141.5, "completions/mean_terminated_length": 141.5, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.07242600503377616, "epoch": 0.06880856760374833, "frac_reward_zero_std": 0.5, "grad_norm": 1.5254065990447998, "learning_rate": 1.464e-06, "loss": 0.0288, "num_tokens": 601785.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.4543546438217163, "sampling/importance_sampling_ratio/mean": 1.0010586977005005, "sampling/importance_sampling_ratio/min": 0.8126509785652161, "sampling/sampling_logp_difference/max": 0.37456226348876953, "sampling/sampling_logp_difference/mean": 0.004589410964399576, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 128.75, "completions/mean_terminated_length": 128.75, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.07172813382931054, "epoch": 0.06907630522088354, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.458e-06, "loss": 0.0, "num_tokens": 604047.0, "reward": 0.25, "reward_std": 0.0, "rewards/reward_fn/mean": 0.25, "rewards/reward_fn/std": 0.8017837405204773, "sampling/importance_sampling_ratio/max": 1.2744884490966797, "sampling/importance_sampling_ratio/mean": 0.9995313882827759, "sampling/importance_sampling_ratio/min": 0.7335963845252991, "sampling/sampling_logp_difference/max": 0.3097963333129883, "sampling/sampling_logp_difference/mean": 0.004771638195961714, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 102.0, "completions/max_terminated_length": 102.0, "completions/mean_length": 82.125, "completions/mean_terminated_length": 82.125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.0554193330463022, "epoch": 0.06934404283801875, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.452e-06, "loss": 0.0, "num_tokens": 605820.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.159672737121582, "sampling/importance_sampling_ratio/mean": 1.000391960144043, "sampling/importance_sampling_ratio/min": 0.8330153226852417, "sampling/sampling_logp_difference/max": 0.18270325660705566, "sampling/sampling_logp_difference/mean": 0.004388757981359959, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 189.0, "completions/mean_terminated_length": 189.0, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.07995394710451365, "epoch": 0.06961178045515395, "frac_reward_zero_std": 0.5, "grad_norm": 1.1340000629425049, "learning_rate": 1.446e-06, "loss": 0.0203, "num_tokens": 608820.0, "reward": 0.625, "reward_std": 0.4330126941204071, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 0.6943650841712952, "sampling/importance_sampling_ratio/max": 1.1765358448028564, "sampling/importance_sampling_ratio/mean": 0.9997998476028442, "sampling/importance_sampling_ratio/min": 0.6985894441604614, "sampling/sampling_logp_difference/max": 0.3586920499801636, "sampling/sampling_logp_difference/mean": 0.005118843633681536, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 117.0, "completions/mean_terminated_length": 117.0, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.07617259910330176, "epoch": 0.06987951807228916, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.44e-06, "loss": 0.0, "num_tokens": 610928.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1509473323822021, "sampling/importance_sampling_ratio/mean": 0.9985508322715759, "sampling/importance_sampling_ratio/min": 0.7906150221824646, "sampling/sampling_logp_difference/max": 0.23494410514831543, "sampling/sampling_logp_difference/mean": 0.0046264249831438065, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 115.0, "completions/mean_terminated_length": 115.0, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.09582668170332909, "epoch": 0.07014725568942437, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.434e-06, "loss": 0.0, "num_tokens": 612972.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5018633604049683, "sampling/importance_sampling_ratio/mean": 1.0007418394088745, "sampling/importance_sampling_ratio/min": 0.6038216948509216, "sampling/sampling_logp_difference/max": 0.5044763088226318, "sampling/sampling_logp_difference/mean": 0.00511948810890317, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 107.5, "completions/mean_terminated_length": 107.5, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.08721153903752565, "epoch": 0.07041499330655958, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.428e-06, "loss": 0.0, "num_tokens": 614936.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4062658548355103, "sampling/importance_sampling_ratio/mean": 1.0013206005096436, "sampling/importance_sampling_ratio/min": 0.7998916506767273, "sampling/sampling_logp_difference/max": 0.34093785285949707, "sampling/sampling_logp_difference/mean": 0.006704363506287336, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 154.125, "completions/mean_terminated_length": 154.125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.07511234283447266, "epoch": 0.07068273092369477, "frac_reward_zero_std": 0.5, "grad_norm": 1.9572402238845825, "learning_rate": 1.422e-06, "loss": 0.0194, "num_tokens": 617341.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.438183069229126, "sampling/importance_sampling_ratio/mean": 1.000599980354309, "sampling/importance_sampling_ratio/min": 0.6954065561294556, "sampling/sampling_logp_difference/max": 0.36338043212890625, "sampling/sampling_logp_difference/mean": 0.005109885707497597, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.0, "completions/max_terminated_length": 112.0, "completions/mean_length": 103.0, "completions/mean_terminated_length": 103.0, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.03653552522882819, "epoch": 0.07095046854082998, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4159999999999999e-06, "loss": 0.0, "num_tokens": 619513.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2501708269119263, "sampling/importance_sampling_ratio/mean": 1.000441551208496, "sampling/importance_sampling_ratio/min": 0.6632506251335144, "sampling/sampling_logp_difference/max": 0.410602331161499, "sampling/sampling_logp_difference/mean": 0.00331076025031507, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 115.75, "completions/mean_terminated_length": 115.75, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.05293184798210859, "epoch": 0.07121820615796519, "frac_reward_zero_std": 0.5, "grad_norm": 5.08146858215332, "learning_rate": 1.41e-06, "loss": 0.2769, "num_tokens": 621603.0, "reward": 0.3125, "reward_std": 0.3145764470100403, "rewards/reward_fn/mean": 0.3125, "rewards/reward_fn/std": 0.8425090312957764, "sampling/importance_sampling_ratio/max": 1.4956040382385254, "sampling/importance_sampling_ratio/mean": 1.0001014471054077, "sampling/importance_sampling_ratio/min": 0.8115516304969788, "sampling/sampling_logp_difference/max": 0.4025301933288574, "sampling/sampling_logp_difference/mean": 0.003140098648145795, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 125.0, "completions/mean_terminated_length": 125.0, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.08010188117623329, "epoch": 0.0714859437751004, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.404e-06, "loss": 0.0, "num_tokens": 623707.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.284680724143982, "sampling/importance_sampling_ratio/mean": 1.0004098415374756, "sampling/importance_sampling_ratio/min": 0.5475502014160156, "sampling/sampling_logp_difference/max": 0.6023011207580566, "sampling/sampling_logp_difference/mean": 0.006400415673851967, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 103.125, "completions/mean_terminated_length": 103.125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.069522506557405, "epoch": 0.0717536813922356, "frac_reward_zero_std": 0.5, "grad_norm": 3.621129035949707, "learning_rate": 1.3980000000000002e-06, "loss": -0.025, "num_tokens": 626116.0, "reward": 0.90625, "reward_std": 0.1875, "rewards/reward_fn/mean": 0.90625, "rewards/reward_fn/std": 0.2651650309562683, "sampling/importance_sampling_ratio/max": 1.2796767950057983, "sampling/importance_sampling_ratio/mean": 0.999781608581543, "sampling/importance_sampling_ratio/min": 0.6715880632400513, "sampling/sampling_logp_difference/max": 0.39811015129089355, "sampling/sampling_logp_difference/mean": 0.005026625469326973, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 111.5, "completions/mean_terminated_length": 111.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.05577753530815244, "epoch": 0.07202141900937081, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3920000000000002e-06, "loss": 0.0, "num_tokens": 628088.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.214403748512268, "sampling/importance_sampling_ratio/mean": 0.9995174407958984, "sampling/importance_sampling_ratio/min": 0.8904551267623901, "sampling/sampling_logp_difference/max": 0.19425320625305176, "sampling/sampling_logp_difference/mean": 0.0026309450622648, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 120.375, "completions/mean_terminated_length": 120.375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.05003736959770322, "epoch": 0.07228915662650602, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3860000000000002e-06, "loss": 0.0, "num_tokens": 630203.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5962978601455688, "sampling/importance_sampling_ratio/mean": 1.0002179145812988, "sampling/importance_sampling_ratio/min": 0.7550804615020752, "sampling/sampling_logp_difference/max": 0.46768712997436523, "sampling/sampling_logp_difference/mean": 0.003973248414695263, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 127.5, "completions/mean_terminated_length": 127.5, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.05865175137296319, "epoch": 0.07255689424364123, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3800000000000001e-06, "loss": 0.0, "num_tokens": 632499.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1689109802246094, "sampling/importance_sampling_ratio/mean": 0.9993337988853455, "sampling/importance_sampling_ratio/min": 0.5614790320396423, "sampling/sampling_logp_difference/max": 0.5771808624267578, "sampling/sampling_logp_difference/mean": 0.004001270048320293, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 221.75, "completions/mean_terminated_length": 221.75, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.07139572128653526, "epoch": 0.07282463186077644, "frac_reward_zero_std": 0.0, "grad_norm": 1.8267873525619507, "learning_rate": 1.374e-06, "loss": -0.0786, "num_tokens": 635561.0, "reward": 0.375, "reward_std": 0.8903881907463074, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.876274585723877, "sampling/importance_sampling_ratio/max": 1.394998550415039, "sampling/importance_sampling_ratio/mean": 0.9998965859413147, "sampling/importance_sampling_ratio/min": 0.6989693641662598, "sampling/sampling_logp_difference/max": 0.35814833641052246, "sampling/sampling_logp_difference/mean": 0.004979416262358427, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 188.0, "completions/mean_terminated_length": 188.0, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.052812155336141586, "epoch": 0.07309236947791165, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.368e-06, "loss": 0.0, "num_tokens": 638305.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2840259075164795, "sampling/importance_sampling_ratio/mean": 0.9998717308044434, "sampling/importance_sampling_ratio/min": 0.7999129891395569, "sampling/sampling_logp_difference/max": 0.25000035762786865, "sampling/sampling_logp_difference/mean": 0.0030952640809118748, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 153.25, "completions/mean_terminated_length": 153.25, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.14102348126471043, "epoch": 0.07336010709504685, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.362e-06, "loss": 0.0, "num_tokens": 640907.0, "reward": 0.25, "reward_std": 0.0, "rewards/reward_fn/mean": 0.25, "rewards/reward_fn/std": 0.8017837405204773, "sampling/importance_sampling_ratio/max": 1.3252089023590088, "sampling/importance_sampling_ratio/mean": 0.9998700618743896, "sampling/importance_sampling_ratio/min": 0.7858969569206238, "sampling/sampling_logp_difference/max": 0.2815701961517334, "sampling/sampling_logp_difference/mean": 0.007397210691124201, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 119.875, "completions/mean_terminated_length": 119.875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.05599166848696768, "epoch": 0.07362784471218206, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.356e-06, "loss": 0.0, "num_tokens": 643214.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1943178176879883, "sampling/importance_sampling_ratio/mean": 0.9994301199913025, "sampling/importance_sampling_ratio/min": 0.8088793754577637, "sampling/sampling_logp_difference/max": 0.21210551261901855, "sampling/sampling_logp_difference/mean": 0.0036983436439186335, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 98.0, "completions/mean_terminated_length": 98.0, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.06810007500462234, "epoch": 0.07389558232931727, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.35e-06, "loss": 0.0, "num_tokens": 645082.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3086566925048828, "sampling/importance_sampling_ratio/mean": 1.0002855062484741, "sampling/importance_sampling_ratio/min": 0.7419939041137695, "sampling/sampling_logp_difference/max": 0.2984142303466797, "sampling/sampling_logp_difference/mean": 0.004501513205468655, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 137.0, "completions/mean_terminated_length": 137.0, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.06128932535648346, "epoch": 0.07416331994645248, "frac_reward_zero_std": 0.5, "grad_norm": 2.653231620788574, "learning_rate": 1.344e-06, "loss": 0.1665, "num_tokens": 647498.0, "reward": 0.9375, "reward_std": 0.125, "rewards/reward_fn/mean": 0.9375, "rewards/reward_fn/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 1.492400050163269, "sampling/importance_sampling_ratio/mean": 0.9992762804031372, "sampling/importance_sampling_ratio/min": 0.6877583861351013, "sampling/sampling_logp_difference/max": 0.40038561820983887, "sampling/sampling_logp_difference/mean": 0.006772695574909449, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 196.25, "completions/mean_terminated_length": 196.25, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.07421794510446489, "epoch": 0.07443105756358769, "frac_reward_zero_std": 0.5, "grad_norm": 2.137287139892578, "learning_rate": 1.3380000000000001e-06, "loss": 0.0997, "num_tokens": 650236.0, "reward": 0.5625, "reward_std": 0.5153881907463074, "rewards/reward_fn/mean": 0.5625, "rewards/reward_fn/std": 0.8210403323173523, "sampling/importance_sampling_ratio/max": 1.275705337524414, "sampling/importance_sampling_ratio/mean": 1.0004850625991821, "sampling/importance_sampling_ratio/min": 0.8076605200767517, "sampling/sampling_logp_difference/max": 0.2434992790222168, "sampling/sampling_logp_difference/mean": 0.004975650925189257, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 107.25, "completions/mean_terminated_length": 107.25, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.10950127895921469, "epoch": 0.0746987951807229, "frac_reward_zero_std": 0.5, "grad_norm": 3.675204277038574, "learning_rate": 1.3320000000000001e-06, "loss": -0.0694, "num_tokens": 652330.0, "reward": 0.4375, "reward_std": 0.375, "rewards/reward_fn/mean": 0.4375, "rewards/reward_fn/std": 0.7763237953186035, "sampling/importance_sampling_ratio/max": 1.2963581085205078, "sampling/importance_sampling_ratio/mean": 0.9997933506965637, "sampling/importance_sampling_ratio/min": 0.7566525936126709, "sampling/sampling_logp_difference/max": 0.27885106205940247, "sampling/sampling_logp_difference/mean": 0.009443460963666439, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 103.875, "completions/mean_terminated_length": 103.875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.08775105746462941, "epoch": 0.0749665327978581, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.326e-06, "loss": 0.0, "num_tokens": 654301.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3066405057907104, "sampling/importance_sampling_ratio/mean": 1.0004574060440063, "sampling/importance_sampling_ratio/min": 0.723045825958252, "sampling/sampling_logp_difference/max": 0.3242826461791992, "sampling/sampling_logp_difference/mean": 0.007838370278477669, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 156.125, "completions/mean_terminated_length": 156.125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.08591798739507794, "epoch": 0.07523427041499331, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.32e-06, "loss": 0.0, "num_tokens": 657066.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1420122385025024, "sampling/importance_sampling_ratio/mean": 0.9986868500709534, "sampling/importance_sampling_ratio/min": 0.805192232131958, "sampling/sampling_logp_difference/max": 0.21667420864105225, "sampling/sampling_logp_difference/mean": 0.004789266269654036, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/max_terminated_length": 118.0, "completions/mean_length": 68.625, "completions/mean_terminated_length": 68.625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.09261566121131182, "epoch": 0.07550200803212852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.314e-06, "loss": 0.0, "num_tokens": 658955.0, "reward": 0.25, "reward_std": 0.0, "rewards/reward_fn/mean": 0.25, "rewards/reward_fn/std": 0.8017837405204773, "sampling/importance_sampling_ratio/max": 1.141309380531311, "sampling/importance_sampling_ratio/mean": 0.9975247979164124, "sampling/importance_sampling_ratio/min": 0.6473673582077026, "sampling/sampling_logp_difference/max": 0.4348413944244385, "sampling/sampling_logp_difference/mean": 0.007542146369814873, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 116.25, "completions/mean_terminated_length": 116.25, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.07097541214898229, "epoch": 0.07576974564926373, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.308e-06, "loss": 0.0, "num_tokens": 661045.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2510074377059937, "sampling/importance_sampling_ratio/mean": 0.998486340045929, "sampling/importance_sampling_ratio/min": 0.7834607362747192, "sampling/sampling_logp_difference/max": 0.24403434991836548, "sampling/sampling_logp_difference/mean": 0.004922127351164818, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 166.625, "completions/mean_terminated_length": 166.625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.03048290335573256, "epoch": 0.07603748326639893, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.302e-06, "loss": 0.0, "num_tokens": 663682.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0008925199508667, "sampling/importance_sampling_ratio/min": 0.3222828209400177, "sampling/sampling_logp_difference/max": 1.1323257684707642, "sampling/sampling_logp_difference/mean": 0.005625824443995953, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 185.0, "completions/mean_terminated_length": 185.0, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.19470311235636473, "epoch": 0.07630522088353414, "frac_reward_zero_std": 0.0, "grad_norm": 2.7165589332580566, "learning_rate": 1.296e-06, "loss": -0.0634, "num_tokens": 666450.0, "reward": 0.53125, "reward_std": 0.6205127239227295, "rewards/reward_fn/mean": 0.53125, "rewards/reward_fn/std": 0.6870940327644348, "sampling/importance_sampling_ratio/max": 1.7614554166793823, "sampling/importance_sampling_ratio/mean": 1.0005552768707275, "sampling/importance_sampling_ratio/min": 0.6989746689796448, "sampling/sampling_logp_difference/max": 0.5661404132843018, "sampling/sampling_logp_difference/mean": 0.006937386933714151, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 111.25, "completions/mean_terminated_length": 111.25, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.059943039901554585, "epoch": 0.07657295850066935, "frac_reward_zero_std": 0.5, "grad_norm": 2.4447569847106934, "learning_rate": 1.29e-06, "loss": -0.0355, "num_tokens": 668460.0, "reward": 0.625, "reward_std": 0.3061862289905548, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 0.5669467449188232, "sampling/importance_sampling_ratio/max": 1.124353051185608, "sampling/importance_sampling_ratio/mean": 0.9988304376602173, "sampling/importance_sampling_ratio/min": 0.7004509568214417, "sampling/sampling_logp_difference/max": 0.3560309410095215, "sampling/sampling_logp_difference/mean": 0.0034653637558221817, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 112.75, "completions/mean_terminated_length": 112.75, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.09391266480088234, "epoch": 0.07684069611780456, "frac_reward_zero_std": 0.5, "grad_norm": 2.528184652328491, "learning_rate": 1.284e-06, "loss": 0.0899, "num_tokens": 670554.0, "reward": 0.90625, "reward_std": 0.1875, "rewards/reward_fn/mean": 0.90625, "rewards/reward_fn/std": 0.2651650309562683, "sampling/importance_sampling_ratio/max": 1.2729849815368652, "sampling/importance_sampling_ratio/mean": 0.999646008014679, "sampling/importance_sampling_ratio/min": 0.7847145795822144, "sampling/sampling_logp_difference/max": 0.24243515729904175, "sampling/sampling_logp_difference/mean": 0.004388025496155024, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 112.0, "completions/mean_terminated_length": 112.0, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.062198235653340816, "epoch": 0.07710843373493977, "frac_reward_zero_std": 0.5, "grad_norm": 1.2979862689971924, "learning_rate": 1.278e-06, "loss": -0.0085, "num_tokens": 672566.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 0.7071067690849304, "sampling/importance_sampling_ratio/max": 1.2529423236846924, "sampling/importance_sampling_ratio/mean": 1.0005813837051392, "sampling/importance_sampling_ratio/min": 0.8379479646682739, "sampling/sampling_logp_difference/max": 0.2254946231842041, "sampling/sampling_logp_difference/mean": 0.00393708748742938, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 161.625, "completions/mean_terminated_length": 161.625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.11542502138763666, "epoch": 0.07737617135207496, "frac_reward_zero_std": 0.0, "grad_norm": 3.509361505508423, "learning_rate": 1.272e-06, "loss": 0.1057, "num_tokens": 675171.0, "reward": 0.4375, "reward_std": 0.8080127239227295, "rewards/reward_fn/mean": 0.4375, "rewards/reward_fn/std": 0.7763237953186035, "sampling/importance_sampling_ratio/max": 1.7647712230682373, "sampling/importance_sampling_ratio/mean": 1.0002522468566895, "sampling/importance_sampling_ratio/min": 0.46567410230636597, "sampling/sampling_logp_difference/max": 0.764269232749939, "sampling/sampling_logp_difference/mean": 0.010059405118227005, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 94.0, "completions/max_terminated_length": 94.0, "completions/mean_length": 83.75, "completions/mean_terminated_length": 83.75, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.057427250081673265, "epoch": 0.07764390896921017, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.266e-06, "loss": 0.0, "num_tokens": 677017.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2800213098526, "sampling/importance_sampling_ratio/mean": 1.0014666318893433, "sampling/importance_sampling_ratio/min": 0.9281532764434814, "sampling/sampling_logp_difference/max": 0.24687671661376953, "sampling/sampling_logp_difference/mean": 0.0026622351724654436, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 85.875, "completions/mean_terminated_length": 85.875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.08595438906922936, "epoch": 0.07791164658634538, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.26e-06, "loss": 0.0, "num_tokens": 679000.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2207716703414917, "sampling/importance_sampling_ratio/mean": 0.9985074400901794, "sampling/importance_sampling_ratio/min": 0.7831106781959534, "sampling/sampling_logp_difference/max": 0.24448126554489136, "sampling/sampling_logp_difference/mean": 0.0054033780470490456, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 90.75, "completions/mean_terminated_length": 90.75, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.06387204211205244, "epoch": 0.07817938420348058, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.254e-06, "loss": 0.0, "num_tokens": 680922.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2210793495178223, "sampling/importance_sampling_ratio/mean": 0.9993816614151001, "sampling/importance_sampling_ratio/min": 0.8093388676643372, "sampling/sampling_logp_difference/max": 0.21153759956359863, "sampling/sampling_logp_difference/mean": 0.00424946378916502, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 148.875, "completions/mean_terminated_length": 148.875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.06144785135984421, "epoch": 0.07844712182061579, "frac_reward_zero_std": 0.5, "grad_norm": 2.4755187034606934, "learning_rate": 1.248e-06, "loss": -0.0501, "num_tokens": 683501.0, "reward": 0.9375, "reward_std": 0.125, "rewards/reward_fn/mean": 0.9375, "rewards/reward_fn/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 1.2721506357192993, "sampling/importance_sampling_ratio/mean": 1.0007685422897339, "sampling/importance_sampling_ratio/min": 0.5951430201530457, "sampling/sampling_logp_difference/max": 0.5189535617828369, "sampling/sampling_logp_difference/mean": 0.004717384930700064, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 122.125, "completions/mean_terminated_length": 122.125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.03430660883896053, "epoch": 0.078714859437751, "frac_reward_zero_std": 0.5, "grad_norm": 1.4481868743896484, "learning_rate": 1.242e-06, "loss": -0.0034, "num_tokens": 685646.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 0.7071067690849304, "sampling/importance_sampling_ratio/max": 1.3326246738433838, "sampling/importance_sampling_ratio/mean": 0.9989423155784607, "sampling/importance_sampling_ratio/min": 0.6251910924911499, "sampling/sampling_logp_difference/max": 0.4696979522705078, "sampling/sampling_logp_difference/mean": 0.0037921154871582985, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 213.25, "completions/mean_terminated_length": 213.25, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.08255507610738277, "epoch": 0.07898259705488621, "frac_reward_zero_std": 0.5, "grad_norm": 0.9227774739265442, "learning_rate": 1.236e-06, "loss": 0.0223, "num_tokens": 688956.0, "reward": 0.4375, "reward_std": 0.375, "rewards/reward_fn/mean": 0.4375, "rewards/reward_fn/std": 0.7763237953186035, "sampling/importance_sampling_ratio/max": 1.2148504257202148, "sampling/importance_sampling_ratio/mean": 1.0006242990493774, "sampling/importance_sampling_ratio/min": 0.8276379704475403, "sampling/sampling_logp_difference/max": 0.19462096691131592, "sampling/sampling_logp_difference/mean": 0.004637486767023802, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 84.875, "completions/mean_terminated_length": 84.875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.03503145254217088, "epoch": 0.07925033467202142, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2299999999999999e-06, "loss": 0.0, "num_tokens": 690699.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2831391096115112, "sampling/importance_sampling_ratio/mean": 1.0002120733261108, "sampling/importance_sampling_ratio/min": 0.8295605182647705, "sampling/sampling_logp_difference/max": 0.24930953979492188, "sampling/sampling_logp_difference/mean": 0.0023403421510010958, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 123.5, "completions/mean_terminated_length": 123.5, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.08865122054703534, "epoch": 0.07951807228915662, "frac_reward_zero_std": 0.5, "grad_norm": 3.604426383972168, "learning_rate": 1.224e-06, "loss": 0.0446, "num_tokens": 692895.0, "reward": 0.90625, "reward_std": 0.1875, "rewards/reward_fn/mean": 0.90625, "rewards/reward_fn/std": 0.2651650309562683, "sampling/importance_sampling_ratio/max": 1.8765958547592163, "sampling/importance_sampling_ratio/mean": 1.0020368099212646, "sampling/importance_sampling_ratio/min": 0.7569504976272583, "sampling/sampling_logp_difference/max": 0.6294593811035156, "sampling/sampling_logp_difference/mean": 0.006022048182785511, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 148.75, "completions/mean_terminated_length": 148.75, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.06080527463927865, "epoch": 0.07978580990629183, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.218e-06, "loss": 0.0, "num_tokens": 695333.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1768158674240112, "sampling/importance_sampling_ratio/mean": 0.9986566305160522, "sampling/importance_sampling_ratio/min": 0.7883612513542175, "sampling/sampling_logp_difference/max": 0.23779892921447754, "sampling/sampling_logp_difference/mean": 0.0037001457531005144, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 118.125, "completions/mean_terminated_length": 118.125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.036896280478686094, "epoch": 0.08005354752342704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2120000000000002e-06, "loss": 0.0, "num_tokens": 697406.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2829862833023071, "sampling/importance_sampling_ratio/mean": 1.0008474588394165, "sampling/importance_sampling_ratio/min": 0.8051016926765442, "sampling/sampling_logp_difference/max": 0.2491903305053711, "sampling/sampling_logp_difference/mean": 0.002871220698580146, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 157.875, "completions/mean_terminated_length": 157.875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.09345291694626212, "epoch": 0.08032128514056225, "frac_reward_zero_std": 0.0, "grad_norm": 2.4048407077789307, "learning_rate": 1.2060000000000002e-06, "loss": 0.1847, "num_tokens": 699933.0, "reward": 0.625, "reward_std": 0.75, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 0.6943650841712952, "sampling/importance_sampling_ratio/max": 1.266870141029358, "sampling/importance_sampling_ratio/mean": 0.9995386600494385, "sampling/importance_sampling_ratio/min": 0.8205752968788147, "sampling/sampling_logp_difference/max": 0.23654937744140625, "sampling/sampling_logp_difference/mean": 0.005342492833733559, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 117.5, "completions/mean_terminated_length": 117.5, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.067539366427809, "epoch": 0.08058902275769746, "frac_reward_zero_std": 0.5, "grad_norm": 2.305908441543579, "learning_rate": 1.2000000000000002e-06, "loss": 0.0766, "num_tokens": 701993.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.2076475620269775, "sampling/importance_sampling_ratio/mean": 1.0005135536193848, "sampling/importance_sampling_ratio/min": 0.8076342940330505, "sampling/sampling_logp_difference/max": 0.21364593505859375, "sampling/sampling_logp_difference/mean": 0.003998192027211189, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 112.875, "completions/mean_terminated_length": 112.875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.11904980335384607, "epoch": 0.08085676037483266, "frac_reward_zero_std": 0.5, "grad_norm": 1.8405557870864868, "learning_rate": 1.1940000000000001e-06, "loss": -0.0442, "num_tokens": 704124.0, "reward": 0.90625, "reward_std": 0.1875, "rewards/reward_fn/mean": 0.90625, "rewards/reward_fn/std": 0.2651650309562683, "sampling/importance_sampling_ratio/max": 1.146812915802002, "sampling/importance_sampling_ratio/mean": 0.9998396635055542, "sampling/importance_sampling_ratio/min": 0.6976358294487, "sampling/sampling_logp_difference/max": 0.360058069229126, "sampling/sampling_logp_difference/mean": 0.006269815377891064, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 119.875, "completions/mean_terminated_length": 119.875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.06523250229656696, "epoch": 0.08112449799196787, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.188e-06, "loss": 0.0, "num_tokens": 706351.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2957061529159546, "sampling/importance_sampling_ratio/mean": 1.0004132986068726, "sampling/importance_sampling_ratio/min": 0.8497671484947205, "sampling/sampling_logp_difference/max": 0.25905585289001465, "sampling/sampling_logp_difference/mean": 0.0032374951988458633, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 119.0, "completions/max_terminated_length": 119.0, "completions/mean_length": 104.25, "completions/mean_terminated_length": 104.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.09066258743405342, "epoch": 0.08139223560910308, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.182e-06, "loss": 0.0, "num_tokens": 708605.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.185592532157898, "sampling/importance_sampling_ratio/mean": 1.0002049207687378, "sampling/importance_sampling_ratio/min": 0.8214309215545654, "sampling/sampling_logp_difference/max": 0.19670748710632324, "sampling/sampling_logp_difference/mean": 0.0044455984607338905, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 149.125, "completions/mean_terminated_length": 149.125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.08574727643281221, "epoch": 0.08165997322623829, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.176e-06, "loss": 0.0, "num_tokens": 711002.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4280275106430054, "sampling/importance_sampling_ratio/mean": 1.000168800354004, "sampling/importance_sampling_ratio/min": 0.7570231556892395, "sampling/sampling_logp_difference/max": 0.35629403591156006, "sampling/sampling_logp_difference/mean": 0.006902152672410011, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.75, "completions/mean_terminated_length": 230.75, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.08512867847457528, "epoch": 0.0819277108433735, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.17e-06, "loss": 0.0, "num_tokens": 714288.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4405975341796875, "sampling/importance_sampling_ratio/mean": 1.00014066696167, "sampling/importance_sampling_ratio/min": 0.7911865711212158, "sampling/sampling_logp_difference/max": 0.36505794525146484, "sampling/sampling_logp_difference/mean": 0.003983593545854092, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 189.75, "completions/mean_terminated_length": 189.75, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.06676621688529849, "epoch": 0.0821954484605087, "frac_reward_zero_std": 0.5, "grad_norm": 2.3853604793548584, "learning_rate": 1.164e-06, "loss": 0.1206, "num_tokens": 717094.0, "reward": 0.46875, "reward_std": 0.2576940953731537, "rewards/reward_fn/mean": 0.46875, "rewards/reward_fn/std": 0.6605936288833618, "sampling/importance_sampling_ratio/max": 1.387581467628479, "sampling/importance_sampling_ratio/mean": 0.9997885227203369, "sampling/importance_sampling_ratio/min": 0.7029744982719421, "sampling/sampling_logp_difference/max": 0.3524346351623535, "sampling/sampling_logp_difference/mean": 0.00539702782407403, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 121.625, "completions/mean_terminated_length": 121.625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.042899271473288536, "epoch": 0.08246318607764391, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.158e-06, "loss": 0.0, "num_tokens": 719099.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3243584632873535, "sampling/importance_sampling_ratio/mean": 0.9999261498451233, "sampling/importance_sampling_ratio/min": 0.6120638847351074, "sampling/sampling_logp_difference/max": 0.4909186363220215, "sampling/sampling_logp_difference/mean": 0.004771738313138485, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 117.0, "completions/mean_terminated_length": 117.0, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.10202642809599638, "epoch": 0.08273092369477912, "frac_reward_zero_std": 0.5, "grad_norm": 3.0948758125305176, "learning_rate": 1.1520000000000002e-06, "loss": 0.0106, "num_tokens": 721167.0, "reward": 0.625, "reward_std": 0.25, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 0.5175492167472839, "sampling/importance_sampling_ratio/max": 1.400730848312378, "sampling/importance_sampling_ratio/mean": 0.9998996257781982, "sampling/importance_sampling_ratio/min": 0.6932842135429382, "sampling/sampling_logp_difference/max": 0.36631524562835693, "sampling/sampling_logp_difference/mean": 0.008777028881013393, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 195.875, "completions/mean_terminated_length": 195.875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.08563368488103151, "epoch": 0.08299866131191433, "frac_reward_zero_std": 0.5, "grad_norm": 2.626880168914795, "learning_rate": 1.1460000000000001e-06, "loss": 0.1165, "num_tokens": 724006.0, "reward": 0.4375, "reward_std": 0.375, "rewards/reward_fn/mean": 0.4375, "rewards/reward_fn/std": 0.7763237953186035, "sampling/importance_sampling_ratio/max": 1.4406044483184814, "sampling/importance_sampling_ratio/mean": 1.0008835792541504, "sampling/importance_sampling_ratio/min": 0.6976735591888428, "sampling/sampling_logp_difference/max": 0.3650627136230469, "sampling/sampling_logp_difference/mean": 0.006444377824664116, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 122.5, "completions/mean_terminated_length": 122.5, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.05886700376868248, "epoch": 0.08326639892904954, "frac_reward_zero_std": 0.5, "grad_norm": 2.3448545932769775, "learning_rate": 1.14e-06, "loss": -0.0426, "num_tokens": 726242.0, "reward": 0.8125, "reward_std": 0.21650634706020355, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.3471825420856476, "sampling/importance_sampling_ratio/max": 1.8706989288330078, "sampling/importance_sampling_ratio/mean": 0.9996541738510132, "sampling/importance_sampling_ratio/min": 0.6411384344100952, "sampling/sampling_logp_difference/max": 0.6263121366500854, "sampling/sampling_logp_difference/mean": 0.00480793509632349, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 143.5, "completions/mean_terminated_length": 143.5, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.04736422072164714, "epoch": 0.08353413654618475, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.134e-06, "loss": 0.0, "num_tokens": 728578.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2144099473953247, "sampling/importance_sampling_ratio/mean": 0.9995613098144531, "sampling/importance_sampling_ratio/min": 0.804253876209259, "sampling/sampling_logp_difference/max": 0.217840313911438, "sampling/sampling_logp_difference/mean": 0.003223955165594816, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 98.0, "completions/max_terminated_length": 98.0, "completions/mean_length": 82.125, "completions/mean_terminated_length": 82.125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.04943274310790002, "epoch": 0.08380187416331995, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.128e-06, "loss": 0.0, "num_tokens": 730327.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5113264322280884, "sampling/importance_sampling_ratio/mean": 1.0015605688095093, "sampling/importance_sampling_ratio/min": 0.733035147190094, "sampling/sampling_logp_difference/max": 0.41298770904541016, "sampling/sampling_logp_difference/mean": 0.005375869572162628, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 68.75, "completions/mean_terminated_length": 68.75, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.057202502619475126, "epoch": 0.08406961178045515, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.122e-06, "loss": 0.0, "num_tokens": 732041.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3801374435424805, "sampling/importance_sampling_ratio/mean": 1.0026148557662964, "sampling/importance_sampling_ratio/min": 0.7322600483894348, "sampling/sampling_logp_difference/max": 0.32218313217163086, "sampling/sampling_logp_difference/mean": 0.005152307450771332, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 183.625, "completions/mean_terminated_length": 183.625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.09210221003741026, "epoch": 0.08433734939759036, "frac_reward_zero_std": 0.5, "grad_norm": 3.541691541671753, "learning_rate": 1.116e-06, "loss": 0.0102, "num_tokens": 735054.0, "reward": 0.84375, "reward_std": 0.1875, "rewards/reward_fn/mean": 0.84375, "rewards/reward_fn/std": 0.29693374037742615, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000246524810791, "sampling/importance_sampling_ratio/min": 0.7288697361946106, "sampling/sampling_logp_difference/max": 0.877377986907959, "sampling/sampling_logp_difference/mean": 0.005722996313124895, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 131.875, "completions/mean_terminated_length": 131.875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.0737892184406519, "epoch": 0.08460508701472556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.11e-06, "loss": 0.0, "num_tokens": 737289.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4354221820831299, "sampling/importance_sampling_ratio/mean": 1.0012996196746826, "sampling/importance_sampling_ratio/min": 0.6895196437835693, "sampling/sampling_logp_difference/max": 0.37176012992858887, "sampling/sampling_logp_difference/mean": 0.006080441642552614, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 93.375, "completions/mean_terminated_length": 93.375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.07663291320204735, "epoch": 0.08487282463186077, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.104e-06, "loss": 0.0, "num_tokens": 739052.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1364166736602783, "sampling/importance_sampling_ratio/mean": 1.0002644062042236, "sampling/importance_sampling_ratio/min": 0.7702326774597168, "sampling/sampling_logp_difference/max": 0.2610626220703125, "sampling/sampling_logp_difference/mean": 0.003993673250079155, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 127.625, "completions/mean_terminated_length": 127.625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.05528829200193286, "epoch": 0.08514056224899598, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.098e-06, "loss": 0.0, "num_tokens": 741237.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.396429419517517, "sampling/importance_sampling_ratio/mean": 1.0012474060058594, "sampling/importance_sampling_ratio/min": 0.7550811767578125, "sampling/sampling_logp_difference/max": 0.33391857147216797, "sampling/sampling_logp_difference/mean": 0.004840245470404625, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 134.75, "completions/mean_terminated_length": 134.75, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.0665902434848249, "epoch": 0.08540829986613119, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.092e-06, "loss": 0.0, "num_tokens": 743531.0, "reward": 0.25, "reward_std": 0.0, "rewards/reward_fn/mean": 0.25, "rewards/reward_fn/std": 0.8017837405204773, "sampling/importance_sampling_ratio/max": 1.380139708518982, "sampling/importance_sampling_ratio/mean": 0.999398946762085, "sampling/importance_sampling_ratio/min": 0.6967176198959351, "sampling/sampling_logp_difference/max": 0.361375093460083, "sampling/sampling_logp_difference/mean": 0.005692849867045879, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 167.625, "completions/mean_terminated_length": 167.625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.06742543866857886, "epoch": 0.0856760374832664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.086e-06, "loss": 0.0, "num_tokens": 746216.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5987622737884521, "sampling/importance_sampling_ratio/mean": 1.0005491971969604, "sampling/importance_sampling_ratio/min": 0.8089372515678406, "sampling/sampling_logp_difference/max": 0.46922969818115234, "sampling/sampling_logp_difference/mean": 0.004980063997209072, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 114.0, "completions/max_terminated_length": 114.0, "completions/mean_length": 94.5, "completions/mean_terminated_length": 94.5, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.07301402417942882, "epoch": 0.0859437751004016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.08e-06, "loss": 0.0, "num_tokens": 748368.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2313910722732544, "sampling/importance_sampling_ratio/mean": 0.9991440176963806, "sampling/importance_sampling_ratio/min": 0.713898777961731, "sampling/sampling_logp_difference/max": 0.33701419830322266, "sampling/sampling_logp_difference/mean": 0.006454926915466785, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 171.5, "completions/mean_terminated_length": 171.5, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.11774657759815454, "epoch": 0.08621151271753681, "frac_reward_zero_std": 0.5, "grad_norm": 1.6479531526565552, "learning_rate": 1.074e-06, "loss": 0.0192, "num_tokens": 751248.0, "reward": 0.3125, "reward_std": 0.4732423424720764, "rewards/reward_fn/mean": 0.3125, "rewards/reward_fn/std": 0.9613049626350403, "sampling/importance_sampling_ratio/max": 1.2948668003082275, "sampling/importance_sampling_ratio/mean": 0.9985540509223938, "sampling/importance_sampling_ratio/min": 0.7870590090751648, "sampling/sampling_logp_difference/max": 0.2584078311920166, "sampling/sampling_logp_difference/mean": 0.007045522797852755, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 173.75, "completions/mean_terminated_length": 173.75, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.10921285394579172, "epoch": 0.08647925033467202, "frac_reward_zero_std": 0.5, "grad_norm": 1.7331308126449585, "learning_rate": 1.068e-06, "loss": -0.0061, "num_tokens": 754130.0, "reward": 0.71875, "reward_std": 0.3590351641178131, "rewards/reward_fn/mean": 0.71875, "rewards/reward_fn/std": 0.5580178499221802, "sampling/importance_sampling_ratio/max": 1.3170795440673828, "sampling/importance_sampling_ratio/mean": 1.0000064373016357, "sampling/importance_sampling_ratio/min": 0.8603836297988892, "sampling/sampling_logp_difference/max": 0.27541685104370117, "sampling/sampling_logp_difference/mean": 0.00450369156897068, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 149.25, "completions/mean_terminated_length": 149.25, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.06921208673156798, "epoch": 0.08674698795180723, "frac_reward_zero_std": 0.5, "grad_norm": 1.4060966968536377, "learning_rate": 1.062e-06, "loss": -0.0042, "num_tokens": 756548.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 0.7071067690849304, "sampling/importance_sampling_ratio/max": 1.2825428247451782, "sampling/importance_sampling_ratio/mean": 0.9999324083328247, "sampling/importance_sampling_ratio/min": 0.7030447721481323, "sampling/sampling_logp_difference/max": 0.35233473777770996, "sampling/sampling_logp_difference/mean": 0.0036369487643241882, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 97.75, "completions/mean_terminated_length": 97.75, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.04925892222672701, "epoch": 0.08701472556894244, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.056e-06, "loss": 0.0, "num_tokens": 758462.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4203462600708008, "sampling/importance_sampling_ratio/mean": 0.9997683167457581, "sampling/importance_sampling_ratio/min": 0.6077790260314941, "sampling/sampling_logp_difference/max": 0.4979438781738281, "sampling/sampling_logp_difference/mean": 0.004481722135096788, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 167.25, "completions/mean_terminated_length": 167.25, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.08303294470533729, "epoch": 0.08728246318607764, "frac_reward_zero_std": 0.0, "grad_norm": 2.171299934387207, "learning_rate": 1.05e-06, "loss": -0.0808, "num_tokens": 760948.0, "reward": 0.625, "reward_std": 0.75, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 0.6943650841712952, "sampling/importance_sampling_ratio/max": 1.4409058094024658, "sampling/importance_sampling_ratio/mean": 1.0009050369262695, "sampling/importance_sampling_ratio/min": 0.7671014070510864, "sampling/sampling_logp_difference/max": 0.3652719259262085, "sampling/sampling_logp_difference/mean": 0.0050959959626197815, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 137.5, "completions/mean_terminated_length": 137.5, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.06733882427215576, "epoch": 0.08755020080321285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0439999999999999e-06, "loss": 0.0, "num_tokens": 763184.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1929048299789429, "sampling/importance_sampling_ratio/mean": 0.9997973442077637, "sampling/importance_sampling_ratio/min": 0.7917224168777466, "sampling/sampling_logp_difference/max": 0.2335444688796997, "sampling/sampling_logp_difference/mean": 0.003922729287296534, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 101.0, "completions/max_terminated_length": 101.0, "completions/mean_length": 89.125, "completions/mean_terminated_length": 89.125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.043483864050358534, "epoch": 0.08781793842034806, "frac_reward_zero_std": 0.5, "grad_norm": 2.2139832973480225, "learning_rate": 1.0379999999999998e-06, "loss": 0.0114, "num_tokens": 764989.0, "reward": 0.9375, "reward_std": 0.125, "rewards/reward_fn/mean": 0.9375, "rewards/reward_fn/std": 0.1767766922712326, "sampling/importance_sampling_ratio/max": 1.1243540048599243, "sampling/importance_sampling_ratio/mean": 0.9989975690841675, "sampling/importance_sampling_ratio/min": 0.7998932003974915, "sampling/sampling_logp_difference/max": 0.22327709197998047, "sampling/sampling_logp_difference/mean": 0.002327987225726247, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 112.75, "completions/mean_terminated_length": 112.75, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.0782771185040474, "epoch": 0.08808567603748327, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.032e-06, "loss": 0.0, "num_tokens": 767187.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2732771635055542, "sampling/importance_sampling_ratio/mean": 0.9998339414596558, "sampling/importance_sampling_ratio/min": 0.8115481734275818, "sampling/sampling_logp_difference/max": 0.2415940761566162, "sampling/sampling_logp_difference/mean": 0.005125013180077076, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 137.625, "completions/mean_terminated_length": 137.625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.05333467619493604, "epoch": 0.08835341365461848, "frac_reward_zero_std": 0.5, "grad_norm": 2.5471761226654053, "learning_rate": 1.026e-06, "loss": 0.1313, "num_tokens": 769476.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 0.7071067690849304, "sampling/importance_sampling_ratio/max": 1.2231223583221436, "sampling/importance_sampling_ratio/mean": 1.0004645586013794, "sampling/importance_sampling_ratio/min": 0.7279509902000427, "sampling/sampling_logp_difference/max": 0.3175215721130371, "sampling/sampling_logp_difference/mean": 0.004164563491940498, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 96.5, "completions/mean_terminated_length": 96.5, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.06674651661887765, "epoch": 0.08862115127175368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0200000000000002e-06, "loss": 0.0, "num_tokens": 771460.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.22139573097229, "sampling/importance_sampling_ratio/mean": 1.0002553462982178, "sampling/importance_sampling_ratio/min": 0.7820942401885986, "sampling/sampling_logp_difference/max": 0.24577999114990234, "sampling/sampling_logp_difference/mean": 0.004201126284897327, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 124.125, "completions/mean_terminated_length": 124.125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.09929323056712747, "epoch": 0.08888888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0140000000000002e-06, "loss": 0.0, "num_tokens": 773729.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4041218757629395, "sampling/importance_sampling_ratio/mean": 0.9992116093635559, "sampling/importance_sampling_ratio/min": 0.7027314901351929, "sampling/sampling_logp_difference/max": 0.35278046131134033, "sampling/sampling_logp_difference/mean": 0.005993953440338373, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 153.875, "completions/mean_terminated_length": 153.875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.04688257095403969, "epoch": 0.0891566265060241, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0080000000000001e-06, "loss": 0.0, "num_tokens": 776208.0, "reward": 0.25, "reward_std": 0.0, "rewards/reward_fn/mean": 0.25, "rewards/reward_fn/std": 0.8017837405204773, "sampling/importance_sampling_ratio/max": 1.272094488143921, "sampling/importance_sampling_ratio/mean": 1.0001040697097778, "sampling/importance_sampling_ratio/min": 0.8023777604103088, "sampling/sampling_logp_difference/max": 0.24066472053527832, "sampling/sampling_logp_difference/mean": 0.003385780844837427, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 78.5, "completions/mean_terminated_length": 78.5, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.0652413354255259, "epoch": 0.08942436412315931, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.002e-06, "loss": 0.0, "num_tokens": 777864.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.571394443511963, "sampling/importance_sampling_ratio/mean": 0.9998838901519775, "sampling/importance_sampling_ratio/min": 0.6534345149993896, "sampling/sampling_logp_difference/max": 0.4519634246826172, "sampling/sampling_logp_difference/mean": 0.0072417352348566055, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 95.5, "completions/mean_terminated_length": 95.5, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.0785472821444273, "epoch": 0.08969210174029452, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.96e-07, "loss": 0.0, "num_tokens": 779832.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2129456996917725, "sampling/importance_sampling_ratio/mean": 1.0001981258392334, "sampling/importance_sampling_ratio/min": 0.8013851642608643, "sampling/sampling_logp_difference/max": 0.22141361236572266, "sampling/sampling_logp_difference/mean": 0.004222359973937273, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 158.375, "completions/mean_terminated_length": 158.375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.04445435362868011, "epoch": 0.08995983935742972, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.9e-07, "loss": 0.0, "num_tokens": 782311.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2597631216049194, "sampling/importance_sampling_ratio/mean": 0.9996749758720398, "sampling/importance_sampling_ratio/min": 0.764583170413971, "sampling/sampling_logp_difference/max": 0.26842451095581055, "sampling/sampling_logp_difference/mean": 0.003246289910748601, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 169.875, "completions/mean_terminated_length": 169.875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.039451243821531534, "epoch": 0.09022757697456493, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.84e-07, "loss": 0.0, "num_tokens": 784826.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3712879419326782, "sampling/importance_sampling_ratio/mean": 0.9995579719543457, "sampling/importance_sampling_ratio/min": 0.6982067227363586, "sampling/sampling_logp_difference/max": 0.3592400550842285, "sampling/sampling_logp_difference/mean": 0.004576263949275017, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 401.875, "completions/mean_terminated_length": 166.71429443359375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.06463369249831885, "epoch": 0.09049531459170014, "frac_reward_zero_std": 0.5, "grad_norm": 0.9886462688446045, "learning_rate": 9.78e-07, "loss": 0.865, "num_tokens": 789481.0, "reward": 0.7250000238418579, "reward_std": 0.4856267273426056, "rewards/reward_fn/mean": 0.7250000238418579, "rewards/reward_fn/std": 0.7005100250244141, "sampling/importance_sampling_ratio/max": 1.732648491859436, "sampling/importance_sampling_ratio/mean": 1.000361442565918, "sampling/importance_sampling_ratio/min": 0.7222753167152405, "sampling/sampling_logp_difference/max": 0.5496511459350586, "sampling/sampling_logp_difference/mean": 0.002522141905501485, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 116.5, "completions/mean_terminated_length": 116.5, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.06959035014733672, "epoch": 0.09076305220883533, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.72e-07, "loss": 0.0, "num_tokens": 791697.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2392290830612183, "sampling/importance_sampling_ratio/mean": 1.0003633499145508, "sampling/importance_sampling_ratio/min": 0.7963237166404724, "sampling/sampling_logp_difference/max": 0.22774946689605713, "sampling/sampling_logp_difference/mean": 0.004342770669609308, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 114.25, "completions/mean_terminated_length": 114.25, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.08051117742434144, "epoch": 0.09103078982597054, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.660000000000002e-07, "loss": 0.0, "num_tokens": 794143.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3406111001968384, "sampling/importance_sampling_ratio/mean": 0.9984790682792664, "sampling/importance_sampling_ratio/min": 0.6588512063026428, "sampling/sampling_logp_difference/max": 0.41725754737854004, "sampling/sampling_logp_difference/mean": 0.007061757147312164, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 97.0, "completions/max_terminated_length": 97.0, "completions/mean_length": 77.0, "completions/mean_terminated_length": 77.0, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.10392457339912653, "epoch": 0.09129852744310575, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.600000000000001e-07, "loss": 0.0, "num_tokens": 795943.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.502851128578186, "sampling/importance_sampling_ratio/mean": 1.0009105205535889, "sampling/importance_sampling_ratio/min": 0.8141688108444214, "sampling/sampling_logp_difference/max": 0.40736401081085205, "sampling/sampling_logp_difference/mean": 0.005704786162823439, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 115.0, "completions/mean_terminated_length": 115.0, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.08692356571555138, "epoch": 0.09156626506024096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.54e-07, "loss": 0.0, "num_tokens": 798011.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3717001676559448, "sampling/importance_sampling_ratio/mean": 1.0002130270004272, "sampling/importance_sampling_ratio/min": 0.7998926043510437, "sampling/sampling_logp_difference/max": 0.31605100631713867, "sampling/sampling_logp_difference/mean": 0.0046792239882051945, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 87.25, "completions/mean_terminated_length": 87.25, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.05874200304970145, "epoch": 0.09183400267737617, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.480000000000001e-07, "loss": 0.0, "num_tokens": 799889.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7231991291046143, "sampling/importance_sampling_ratio/mean": 1.0028952360153198, "sampling/importance_sampling_ratio/min": 0.7819111943244934, "sampling/sampling_logp_difference/max": 0.544182538986206, "sampling/sampling_logp_difference/mean": 0.006667483132332563, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 105.25, "completions/mean_terminated_length": 105.25, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.10166341299191117, "epoch": 0.09210174029451138, "frac_reward_zero_std": 0.5, "grad_norm": 2.5722274780273438, "learning_rate": 9.42e-07, "loss": 0.0101, "num_tokens": 801963.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.2832173109054565, "sampling/importance_sampling_ratio/mean": 0.9995464086532593, "sampling/importance_sampling_ratio/min": 0.7269793748855591, "sampling/sampling_logp_difference/max": 0.31885725259780884, "sampling/sampling_logp_difference/mean": 0.006948776543140411, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 139.375, "completions/mean_terminated_length": 139.375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.06260534911416471, "epoch": 0.09236947791164658, "frac_reward_zero_std": 0.5, "grad_norm": 2.8062257766723633, "learning_rate": 9.36e-07, "loss": -0.0234, "num_tokens": 804278.0, "reward": 0.6875, "reward_std": 0.4732423424720764, "rewards/reward_fn/mean": 0.6875, "rewards/reward_fn/std": 0.7039430141448975, "sampling/importance_sampling_ratio/max": 1.9803489446640015, "sampling/importance_sampling_ratio/mean": 1.0006921291351318, "sampling/importance_sampling_ratio/min": 0.7668157815933228, "sampling/sampling_logp_difference/max": 0.6832730770111084, "sampling/sampling_logp_difference/mean": 0.005668348632752895, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 129.75, "completions/mean_terminated_length": 129.75, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.10539969243109226, "epoch": 0.09263721552878179, "frac_reward_zero_std": 0.5, "grad_norm": 1.8714728355407715, "learning_rate": 9.3e-07, "loss": -0.0412, "num_tokens": 806564.0, "reward": 0.4375, "reward_std": 0.375, "rewards/reward_fn/mean": 0.4375, "rewards/reward_fn/std": 0.7763237953186035, "sampling/importance_sampling_ratio/max": 1.4164905548095703, "sampling/importance_sampling_ratio/mean": 1.000461459159851, "sampling/importance_sampling_ratio/min": 0.6960167288780212, "sampling/sampling_logp_difference/max": 0.36238157749176025, "sampling/sampling_logp_difference/mean": 0.008314263075590134, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 108.0, "completions/max_terminated_length": 108.0, "completions/mean_length": 89.875, "completions/mean_terminated_length": 89.875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.02283781470032409, "epoch": 0.092904953145917, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.24e-07, "loss": 0.0, "num_tokens": 808735.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1230496168136597, "sampling/importance_sampling_ratio/mean": 0.9992437362670898, "sampling/importance_sampling_ratio/min": 0.7900989055633545, "sampling/sampling_logp_difference/max": 0.2355971336364746, "sampling/sampling_logp_difference/mean": 0.0017315247096121311, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 157.875, "completions/mean_terminated_length": 157.875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.08709438750520349, "epoch": 0.09317269076305221, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.18e-07, "loss": 0.0, "num_tokens": 811262.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1846990585327148, "sampling/importance_sampling_ratio/mean": 1.0005362033843994, "sampling/importance_sampling_ratio/min": 0.8072311878204346, "sampling/sampling_logp_difference/max": 0.21414518356323242, "sampling/sampling_logp_difference/mean": 0.003370346501469612, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 140.0, "completions/mean_terminated_length": 140.0, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.08366846665740013, "epoch": 0.09344042838018742, "frac_reward_zero_std": 0.5, "grad_norm": 1.2272402048110962, "learning_rate": 9.12e-07, "loss": -0.0175, "num_tokens": 813594.0, "reward": 0.8125, "reward_std": 0.21650634706020355, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.3471825420856476, "sampling/importance_sampling_ratio/max": 1.2774200439453125, "sampling/importance_sampling_ratio/mean": 1.0000452995300293, "sampling/importance_sampling_ratio/min": 0.7017320394515991, "sampling/sampling_logp_difference/max": 0.3542037010192871, "sampling/sampling_logp_difference/mean": 0.005098911467939615, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 199.375, "completions/mean_terminated_length": 199.375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.07056958321481943, "epoch": 0.09370816599732262, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.06e-07, "loss": 0.0, "num_tokens": 816765.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4387598037719727, "sampling/importance_sampling_ratio/mean": 0.999501645565033, "sampling/importance_sampling_ratio/min": 0.5164121985435486, "sampling/sampling_logp_difference/max": 0.6608500480651855, "sampling/sampling_logp_difference/mean": 0.006599396001547575, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/max_terminated_length": 121.0, "completions/mean_length": 97.875, "completions/mean_terminated_length": 97.875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.03770438116043806, "epoch": 0.09397590361445783, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9e-07, "loss": 0.0, "num_tokens": 818644.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3778806924819946, "sampling/importance_sampling_ratio/mean": 1.0001534223556519, "sampling/importance_sampling_ratio/min": 0.8054704070091248, "sampling/sampling_logp_difference/max": 0.32054662704467773, "sampling/sampling_logp_difference/mean": 0.0029743092600256205, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 128.125, "completions/mean_terminated_length": 128.125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.12112649716436863, "epoch": 0.09424364123159304, "frac_reward_zero_std": 0.0, "grad_norm": 2.541131019592285, "learning_rate": 8.939999999999999e-07, "loss": 0.0272, "num_tokens": 821137.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 0.5345224738121033, "sampling/importance_sampling_ratio/max": 1.6671329736709595, "sampling/importance_sampling_ratio/mean": 1.000504493713379, "sampling/importance_sampling_ratio/min": 0.6028450727462769, "sampling/sampling_logp_difference/max": 0.5111054182052612, "sampling/sampling_logp_difference/mean": 0.006023418623954058, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 90.5, "completions/mean_terminated_length": 90.5, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.02649637800641358, "epoch": 0.09451137884872825, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.88e-07, "loss": 0.0, "num_tokens": 822953.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.158608317375183, "sampling/importance_sampling_ratio/mean": 0.9995938539505005, "sampling/importance_sampling_ratio/min": 0.9022785425186157, "sampling/sampling_logp_difference/max": 0.14721953868865967, "sampling/sampling_logp_difference/mean": 0.0013654439244419336, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 139.125, "completions/mean_terminated_length": 139.125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.10144210886210203, "epoch": 0.09477911646586346, "frac_reward_zero_std": 0.5, "grad_norm": 1.8350906372070312, "learning_rate": 8.82e-07, "loss": -0.0482, "num_tokens": 825310.0, "reward": 0.625, "reward_std": 0.4330126941204071, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 0.6943650841712952, "sampling/importance_sampling_ratio/max": 1.2144140005111694, "sampling/importance_sampling_ratio/mean": 0.9995567202568054, "sampling/importance_sampling_ratio/min": 0.8155248165130615, "sampling/sampling_logp_difference/max": 0.20392346382141113, "sampling/sampling_logp_difference/mean": 0.005423142574727535, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 175.75, "completions/mean_terminated_length": 175.75, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.07431343756616116, "epoch": 0.09504685408299866, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.76e-07, "loss": 0.0, "num_tokens": 828076.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.367244005203247, "sampling/importance_sampling_ratio/mean": 1.0012143850326538, "sampling/importance_sampling_ratio/min": 0.7933870553970337, "sampling/sampling_logp_difference/max": 0.31279706954956055, "sampling/sampling_logp_difference/mean": 0.004398541059345007, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 128.375, "completions/mean_terminated_length": 128.375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.0436905468814075, "epoch": 0.09531459170013387, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.699999999999999e-07, "loss": 0.0, "num_tokens": 830359.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1706008911132812, "sampling/importance_sampling_ratio/mean": 0.9991294741630554, "sampling/importance_sampling_ratio/min": 0.7100054025650024, "sampling/sampling_logp_difference/max": 0.3424827456474304, "sampling/sampling_logp_difference/mean": 0.0030755207408219576, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 125.375, "completions/mean_terminated_length": 125.375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.10346514312550426, "epoch": 0.09558232931726908, "frac_reward_zero_std": 0.5, "grad_norm": 3.9346635341644287, "learning_rate": 8.64e-07, "loss": -0.1441, "num_tokens": 832846.0, "reward": 0.53125, "reward_std": 0.3590351641178131, "rewards/reward_fn/mean": 0.53125, "rewards/reward_fn/std": 0.6870940327644348, "sampling/importance_sampling_ratio/max": 1.459579348564148, "sampling/importance_sampling_ratio/mean": 1.002278447151184, "sampling/importance_sampling_ratio/min": 0.8153514266014099, "sampling/sampling_logp_difference/max": 0.37814831733703613, "sampling/sampling_logp_difference/mean": 0.0060095819644629955, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 111.75, "completions/mean_terminated_length": 111.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.09187029581516981, "epoch": 0.09585006693440429, "frac_reward_zero_std": 0.0, "grad_norm": 3.1284472942352295, "learning_rate": 8.58e-07, "loss": -0.1223, "num_tokens": 834848.0, "reward": 0.5625, "reward_std": 0.7285534143447876, "rewards/reward_fn/mean": 0.5625, "rewards/reward_fn/std": 0.6781013607978821, "sampling/importance_sampling_ratio/max": 1.3032253980636597, "sampling/importance_sampling_ratio/mean": 1.0002996921539307, "sampling/importance_sampling_ratio/min": 0.754493236541748, "sampling/sampling_logp_difference/max": 0.2817089557647705, "sampling/sampling_logp_difference/mean": 0.0062119727954268456, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 100.0, "completions/max_terminated_length": 100.0, "completions/mean_length": 85.625, "completions/mean_terminated_length": 85.625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.11660524643957615, "epoch": 0.0961178045515395, "frac_reward_zero_std": 0.5, "grad_norm": 3.0788726806640625, "learning_rate": 8.52e-07, "loss": 0.0443, "num_tokens": 836657.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.6571362018585205, "sampling/importance_sampling_ratio/mean": 1.0032631158828735, "sampling/importance_sampling_ratio/min": 0.6181145906448364, "sampling/sampling_logp_difference/max": 0.5050909519195557, "sampling/sampling_logp_difference/mean": 0.010026800446212292, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 123.75, "completions/mean_terminated_length": 123.75, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.07889661309309304, "epoch": 0.0963855421686747, "frac_reward_zero_std": 0.5, "grad_norm": 2.5320746898651123, "learning_rate": 8.459999999999999e-07, "loss": -0.0641, "num_tokens": 838795.0, "reward": 0.625, "reward_std": 0.4330126941204071, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 0.6943650841712952, "sampling/importance_sampling_ratio/max": 1.4358640909194946, "sampling/importance_sampling_ratio/mean": 1.0002895593643188, "sampling/importance_sampling_ratio/min": 0.42892172932624817, "sampling/sampling_logp_difference/max": 0.8464808464050293, "sampling/sampling_logp_difference/mean": 0.0070754606276750565, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 146.25, "completions/mean_terminated_length": 146.25, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.1168463071808219, "epoch": 0.09665327978580991, "frac_reward_zero_std": 0.0, "grad_norm": 2.4085419178009033, "learning_rate": 8.400000000000001e-07, "loss": -0.0234, "num_tokens": 841305.0, "reward": 0.25, "reward_std": 0.8660253882408142, "rewards/reward_fn/mean": 0.25, "rewards/reward_fn/std": 0.8017837405204773, "sampling/importance_sampling_ratio/max": 1.2076363563537598, "sampling/importance_sampling_ratio/mean": 0.9994372129440308, "sampling/importance_sampling_ratio/min": 0.7979040741920471, "sampling/sampling_logp_difference/max": 0.22576689720153809, "sampling/sampling_logp_difference/mean": 0.006309518124908209, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 99.125, "completions/mean_terminated_length": 99.125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.04571910900995135, "epoch": 0.09692101740294512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.340000000000001e-07, "loss": 0.0, "num_tokens": 843226.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3326261043548584, "sampling/importance_sampling_ratio/mean": 1.000421166419983, "sampling/importance_sampling_ratio/min": 0.8252893686294556, "sampling/sampling_logp_difference/max": 0.287151575088501, "sampling/sampling_logp_difference/mean": 0.004419665317982435, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 125.875, "completions/mean_terminated_length": 125.875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.04728045535739511, "epoch": 0.09718875502008033, "frac_reward_zero_std": 0.5, "grad_norm": 2.4475045204162598, "learning_rate": 8.280000000000001e-07, "loss": -0.0318, "num_tokens": 845433.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.2832132577896118, "sampling/importance_sampling_ratio/mean": 0.9997134208679199, "sampling/importance_sampling_ratio/min": 0.7958905100822449, "sampling/sampling_logp_difference/max": 0.24936723709106445, "sampling/sampling_logp_difference/mean": 0.0031259842216968536, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 126.5, "completions/mean_terminated_length": 126.5, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.0548446848988533, "epoch": 0.09745649263721552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.220000000000001e-07, "loss": 0.0, "num_tokens": 847665.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4974737167358398, "sampling/importance_sampling_ratio/mean": 1.000665545463562, "sampling/importance_sampling_ratio/min": 0.7073565125465393, "sampling/sampling_logp_difference/max": 0.4037795066833496, "sampling/sampling_logp_difference/mean": 0.004893678240478039, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 156.125, "completions/mean_terminated_length": 156.125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.04928994597867131, "epoch": 0.09772423025435073, "frac_reward_zero_std": 0.0, "grad_norm": 2.4354028701782227, "learning_rate": 8.160000000000001e-07, "loss": 0.096, "num_tokens": 850054.0, "reward": 0.71875, "reward_std": 0.5625, "rewards/reward_fn/mean": 0.71875, "rewards/reward_fn/std": 0.5580178499221802, "sampling/importance_sampling_ratio/max": 1.439418911933899, "sampling/importance_sampling_ratio/mean": 1.0006517171859741, "sampling/importance_sampling_ratio/min": 0.7074717283248901, "sampling/sampling_logp_difference/max": 0.3642394542694092, "sampling/sampling_logp_difference/mean": 0.0041305068880319595, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 128.25, "completions/mean_terminated_length": 128.25, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.0736327855847776, "epoch": 0.09799196787148594, "frac_reward_zero_std": 0.5, "grad_norm": 2.0879108905792236, "learning_rate": 8.100000000000001e-07, "loss": -0.0326, "num_tokens": 852272.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.179817795753479, "sampling/importance_sampling_ratio/mean": 0.9987912178039551, "sampling/importance_sampling_ratio/min": 0.7915790677070618, "sampling/sampling_logp_difference/max": 0.23372554779052734, "sampling/sampling_logp_difference/mean": 0.0049341521225869656, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 123.5, "completions/mean_terminated_length": 123.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.0803635767661035, "epoch": 0.09825970548862115, "frac_reward_zero_std": 0.5, "grad_norm": 3.1625707149505615, "learning_rate": 8.04e-07, "loss": -0.1309, "num_tokens": 854580.0, "reward": 0.3125, "reward_std": 0.4732423424720764, "rewards/reward_fn/mean": 0.3125, "rewards/reward_fn/std": 0.9613049626350403, "sampling/importance_sampling_ratio/max": 1.5303796529769897, "sampling/importance_sampling_ratio/mean": 0.9999043941497803, "sampling/importance_sampling_ratio/min": 0.7817946672439575, "sampling/sampling_logp_difference/max": 0.42551589012145996, "sampling/sampling_logp_difference/mean": 0.005497123580425978, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 177.875, "completions/mean_terminated_length": 177.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.05020028306171298, "epoch": 0.09852744310575635, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.98e-07, "loss": 0.0, "num_tokens": 857163.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4300366640090942, "sampling/importance_sampling_ratio/mean": 0.9995969533920288, "sampling/importance_sampling_ratio/min": 0.604065477848053, "sampling/sampling_logp_difference/max": 0.5040726661682129, "sampling/sampling_logp_difference/mean": 0.0028436745051294565, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 86.0, "completions/mean_terminated_length": 86.0, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.14963187463581562, "epoch": 0.09879518072289156, "frac_reward_zero_std": 0.5, "grad_norm": 2.2361743450164795, "learning_rate": 7.920000000000001e-07, "loss": -0.1169, "num_tokens": 858899.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.2466490268707275, "sampling/importance_sampling_ratio/mean": 1.0007280111312866, "sampling/importance_sampling_ratio/min": 0.784203290939331, "sampling/sampling_logp_difference/max": 0.24308693408966064, "sampling/sampling_logp_difference/mean": 0.009444947354495525, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 149.125, "completions/mean_terminated_length": 149.125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.058058994356542826, "epoch": 0.09906291834002677, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.860000000000001e-07, "loss": 0.0, "num_tokens": 861284.0, "reward": 0.25, "reward_std": 0.0, "rewards/reward_fn/mean": 0.25, "rewards/reward_fn/std": 0.8017837405204773, "sampling/importance_sampling_ratio/max": 1.3599092960357666, "sampling/importance_sampling_ratio/mean": 1.0009280443191528, "sampling/importance_sampling_ratio/min": 0.642375648021698, "sampling/sampling_logp_difference/max": 0.44258201122283936, "sampling/sampling_logp_difference/mean": 0.005142963491380215, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 111.25, "completions/mean_terminated_length": 111.25, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.10582809150218964, "epoch": 0.09933065595716198, "frac_reward_zero_std": 0.5, "grad_norm": 3.2061948776245117, "learning_rate": 7.8e-07, "loss": -0.1517, "num_tokens": 863314.0, "reward": 0.4375, "reward_std": 0.375, "rewards/reward_fn/mean": 0.4375, "rewards/reward_fn/std": 0.7763237953186035, "sampling/importance_sampling_ratio/max": 1.4223660230636597, "sampling/importance_sampling_ratio/mean": 1.000265121459961, "sampling/importance_sampling_ratio/min": 0.8024592995643616, "sampling/sampling_logp_difference/max": 0.3523216247558594, "sampling/sampling_logp_difference/mean": 0.006412203889340162, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 89.0, "completions/mean_terminated_length": 89.0, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.07425260217860341, "epoch": 0.09959839357429719, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.74e-07, "loss": 0.0, "num_tokens": 865174.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2569704055786133, "sampling/importance_sampling_ratio/mean": 0.9983083605766296, "sampling/importance_sampling_ratio/min": 0.7404568195343018, "sampling/sampling_logp_difference/max": 0.3004879951477051, "sampling/sampling_logp_difference/mean": 0.006936808116734028, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 97.75, "completions/mean_terminated_length": 97.75, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.08514757687225938, "epoch": 0.0998661311914324, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.68e-07, "loss": 0.0, "num_tokens": 867240.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2462620735168457, "sampling/importance_sampling_ratio/mean": 1.0002506971359253, "sampling/importance_sampling_ratio/min": 0.843531608581543, "sampling/sampling_logp_difference/max": 0.22014880180358887, "sampling/sampling_logp_difference/mean": 0.0049998085014522076, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 191.0, "completions/mean_terminated_length": 191.0, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.07090399553999305, "epoch": 0.1001338688085676, "frac_reward_zero_std": 0.5, "grad_norm": 1.8470078706741333, "learning_rate": 7.620000000000001e-07, "loss": 0.0063, "num_tokens": 870048.0, "reward": 0.0625, "reward_std": 0.375, "rewards/reward_fn/mean": 0.0625, "rewards/reward_fn/std": 0.7763237953186035, "sampling/importance_sampling_ratio/max": 1.3719909191131592, "sampling/importance_sampling_ratio/mean": 1.0004600286483765, "sampling/importance_sampling_ratio/min": 0.8338613510131836, "sampling/sampling_logp_difference/max": 0.31626296043395996, "sampling/sampling_logp_difference/mean": 0.005865985993295908, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 120.375, "completions/mean_terminated_length": 120.375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.10819089971482754, "epoch": 0.10040160642570281, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.56e-07, "loss": 0.0, "num_tokens": 872175.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2235392332077026, "sampling/importance_sampling_ratio/mean": 0.9983866214752197, "sampling/importance_sampling_ratio/min": 0.6388687491416931, "sampling/sampling_logp_difference/max": 0.4480562210083008, "sampling/sampling_logp_difference/mean": 0.007528863847255707, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 103.125, "completions/mean_terminated_length": 103.125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.07077036891132593, "epoch": 0.10066934404283802, "frac_reward_zero_std": 0.5, "grad_norm": 3.9627468585968018, "learning_rate": 7.5e-07, "loss": 0.0996, "num_tokens": 874264.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.5526821613311768, "sampling/importance_sampling_ratio/mean": 1.0004500150680542, "sampling/importance_sampling_ratio/min": 0.6114530563354492, "sampling/sampling_logp_difference/max": 0.49191713333129883, "sampling/sampling_logp_difference/mean": 0.008239112794399261, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 113.375, "completions/mean_terminated_length": 113.375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.05927344644442201, "epoch": 0.10093708165997323, "frac_reward_zero_std": 0.5, "grad_norm": 1.6596786975860596, "learning_rate": 7.44e-07, "loss": -0.1586, "num_tokens": 876315.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.267147421836853, "sampling/importance_sampling_ratio/mean": 0.9989814758300781, "sampling/importance_sampling_ratio/min": 0.6186072826385498, "sampling/sampling_logp_difference/max": 0.4802846908569336, "sampling/sampling_logp_difference/mean": 0.003967831376940012, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 134.5, "completions/mean_terminated_length": 134.5, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.07954400638118386, "epoch": 0.10120481927710843, "frac_reward_zero_std": 0.5, "grad_norm": 1.6820160150527954, "learning_rate": 7.38e-07, "loss": 0.0298, "num_tokens": 878543.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.2327934503555298, "sampling/importance_sampling_ratio/mean": 0.9988861680030823, "sampling/importance_sampling_ratio/min": 0.6222361326217651, "sampling/sampling_logp_difference/max": 0.47443556785583496, "sampling/sampling_logp_difference/mean": 0.005573929287493229, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 225.5, "completions/mean_terminated_length": 225.5, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.09125758334994316, "epoch": 0.10147255689424364, "frac_reward_zero_std": 0.5, "grad_norm": 1.9166181087493896, "learning_rate": 7.32e-07, "loss": 0.0799, "num_tokens": 881487.0, "reward": 0.375, "reward_std": 0.25, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.744023859500885, "sampling/importance_sampling_ratio/max": 1.5918940305709839, "sampling/importance_sampling_ratio/mean": 1.0013386011123657, "sampling/importance_sampling_ratio/min": 0.7339412569999695, "sampling/sampling_logp_difference/max": 0.46492457389831543, "sampling/sampling_logp_difference/mean": 0.007726229261606932, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 128.75, "completions/mean_terminated_length": 128.75, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.10029460024088621, "epoch": 0.10174029451137885, "frac_reward_zero_std": 0.5, "grad_norm": 2.3961241245269775, "learning_rate": 7.26e-07, "loss": 0.039, "num_tokens": 883889.0, "reward": 0.84375, "reward_std": 0.1875, "rewards/reward_fn/mean": 0.84375, "rewards/reward_fn/std": 0.29693374037742615, "sampling/importance_sampling_ratio/max": 1.176796317100525, "sampling/importance_sampling_ratio/mean": 1.0002285242080688, "sampling/importance_sampling_ratio/min": 0.7707146406173706, "sampling/sampling_logp_difference/max": 0.26043713092803955, "sampling/sampling_logp_difference/mean": 0.006159475073218346, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 144.875, "completions/mean_terminated_length": 144.875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.07305065169930458, "epoch": 0.10200803212851406, "frac_reward_zero_std": 0.5, "grad_norm": 3.2194554805755615, "learning_rate": 7.2e-07, "loss": 0.0722, "num_tokens": 886316.0, "reward": 0.71875, "reward_std": 0.3590351641178131, "rewards/reward_fn/mean": 0.71875, "rewards/reward_fn/std": 0.5580178499221802, "sampling/importance_sampling_ratio/max": 1.5400786399841309, "sampling/importance_sampling_ratio/mean": 1.0024458169937134, "sampling/importance_sampling_ratio/min": 0.6976394653320312, "sampling/sampling_logp_difference/max": 0.43183350563049316, "sampling/sampling_logp_difference/mean": 0.009489068761467934, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 142.375, "completions/mean_terminated_length": 142.375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.07052872842177749, "epoch": 0.10227576974564927, "frac_reward_zero_std": 0.5, "grad_norm": 1.9321306943893433, "learning_rate": 7.14e-07, "loss": -0.0875, "num_tokens": 888655.0, "reward": 0.625, "reward_std": 0.4330126941204071, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 0.6943650841712952, "sampling/importance_sampling_ratio/max": 1.343408465385437, "sampling/importance_sampling_ratio/mean": 1.0004748106002808, "sampling/importance_sampling_ratio/min": 0.7570950984954834, "sampling/sampling_logp_difference/max": 0.29521000385284424, "sampling/sampling_logp_difference/mean": 0.005760665517300367, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 151.375, "completions/mean_terminated_length": 151.375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.0786778018809855, "epoch": 0.10254350736278448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.079999999999999e-07, "loss": 0.0, "num_tokens": 891170.0, "reward": 0.25, "reward_std": 0.0, "rewards/reward_fn/mean": 0.25, "rewards/reward_fn/std": 0.8017837405204773, "sampling/importance_sampling_ratio/max": 1.3592694997787476, "sampling/importance_sampling_ratio/mean": 1.0001035928726196, "sampling/importance_sampling_ratio/min": 0.7778481841087341, "sampling/sampling_logp_difference/max": 0.3069474697113037, "sampling/sampling_logp_difference/mean": 0.005185719579458237, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 128.875, "completions/mean_terminated_length": 128.875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.0749786626547575, "epoch": 0.10281124497991968, "frac_reward_zero_std": 0.5, "grad_norm": 2.9602253437042236, "learning_rate": 7.02e-07, "loss": 0.0534, "num_tokens": 893449.0, "reward": 0.4375, "reward_std": 0.375, "rewards/reward_fn/mean": 0.4375, "rewards/reward_fn/std": 0.7763237953186035, "sampling/importance_sampling_ratio/max": 1.7269530296325684, "sampling/importance_sampling_ratio/mean": 1.001651406288147, "sampling/importance_sampling_ratio/min": 0.6353104114532471, "sampling/sampling_logp_difference/max": 0.546358585357666, "sampling/sampling_logp_difference/mean": 0.006172421853989363, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 188.125, "completions/mean_terminated_length": 188.125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.07689076103270054, "epoch": 0.10307898259705489, "frac_reward_zero_std": 0.5, "grad_norm": 1.5033085346221924, "learning_rate": 6.960000000000001e-07, "loss": -0.0459, "num_tokens": 896214.0, "reward": 0.625, "reward_std": 0.4330126941204071, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 0.6943650841712952, "sampling/importance_sampling_ratio/max": 1.2669365406036377, "sampling/importance_sampling_ratio/mean": 0.9998987317085266, "sampling/importance_sampling_ratio/min": 0.8623171448707581, "sampling/sampling_logp_difference/max": 0.2366018295288086, "sampling/sampling_logp_difference/mean": 0.003983908798545599, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 121.75, "completions/mean_terminated_length": 121.75, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.09890526533126831, "epoch": 0.1033467202141901, "frac_reward_zero_std": 0.5, "grad_norm": 3.493856191635132, "learning_rate": 6.900000000000001e-07, "loss": -0.0622, "num_tokens": 898332.0, "reward": 0.625, "reward_std": 0.4330126941204071, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 0.6943650841712952, "sampling/importance_sampling_ratio/max": 1.2547365427017212, "sampling/importance_sampling_ratio/mean": 1.0002261400222778, "sampling/importance_sampling_ratio/min": 0.6930984854698181, "sampling/sampling_logp_difference/max": 0.36658310890197754, "sampling/sampling_logp_difference/mean": 0.007008379325270653, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 168.0, "completions/mean_terminated_length": 168.0, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.06586588872596622, "epoch": 0.10361445783132531, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.84e-07, "loss": 0.0, "num_tokens": 901000.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4007571935653687, "sampling/importance_sampling_ratio/mean": 0.9995333552360535, "sampling/importance_sampling_ratio/min": 0.7906538844108582, "sampling/sampling_logp_difference/max": 0.33701300621032715, "sampling/sampling_logp_difference/mean": 0.004960671998560429, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 205.875, "completions/mean_terminated_length": 205.875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.05452088359743357, "epoch": 0.10388219544846052, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.78e-07, "loss": 0.0, "num_tokens": 904067.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.281874418258667, "sampling/importance_sampling_ratio/mean": 1.0002394914627075, "sampling/importance_sampling_ratio/min": 0.7806978225708008, "sampling/sampling_logp_difference/max": 0.2483234405517578, "sampling/sampling_logp_difference/mean": 0.0033783114049583673, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 108.5, "completions/mean_terminated_length": 108.5, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.0462900044512935, "epoch": 0.10414993306559571, "frac_reward_zero_std": 0.5, "grad_norm": 3.6790757179260254, "learning_rate": 6.72e-07, "loss": 0.1324, "num_tokens": 906147.0, "reward": 0.4375, "reward_std": 0.375, "rewards/reward_fn/mean": 0.4375, "rewards/reward_fn/std": 0.7763237953186035, "sampling/importance_sampling_ratio/max": 1.3951913118362427, "sampling/importance_sampling_ratio/mean": 0.9990285634994507, "sampling/importance_sampling_ratio/min": 0.7412779331207275, "sampling/sampling_logp_difference/max": 0.3330315351486206, "sampling/sampling_logp_difference/mean": 0.005884090438485146, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 122.875, "completions/mean_terminated_length": 122.875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.03876467002555728, "epoch": 0.10441767068273092, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.660000000000001e-07, "loss": 0.0, "num_tokens": 908430.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3434768915176392, "sampling/importance_sampling_ratio/mean": 1.0017447471618652, "sampling/importance_sampling_ratio/min": 0.8632645606994629, "sampling/sampling_logp_difference/max": 0.2952609062194824, "sampling/sampling_logp_difference/mean": 0.0035630224738270044, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 121.125, "completions/mean_terminated_length": 121.125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.058091006241738796, "epoch": 0.10468540829986613, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.6e-07, "loss": 0.0, "num_tokens": 910623.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4172409772872925, "sampling/importance_sampling_ratio/mean": 1.000789999961853, "sampling/importance_sampling_ratio/min": 0.669457197189331, "sampling/sampling_logp_difference/max": 0.4012880325317383, "sampling/sampling_logp_difference/mean": 0.006565970368683338, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 123.375, "completions/mean_terminated_length": 123.375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.0992729514837265, "epoch": 0.10495314591700133, "frac_reward_zero_std": 0.5, "grad_norm": 3.6570773124694824, "learning_rate": 6.54e-07, "loss": -0.0159, "num_tokens": 912922.0, "reward": 0.625, "reward_std": 0.4330126941204071, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 0.6943650841712952, "sampling/importance_sampling_ratio/max": 1.3471999168395996, "sampling/importance_sampling_ratio/mean": 0.9999240040779114, "sampling/importance_sampling_ratio/min": 0.7436296343803406, "sampling/sampling_logp_difference/max": 0.29802823066711426, "sampling/sampling_logp_difference/mean": 0.008179552853107452, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 167.5, "completions/mean_terminated_length": 167.5, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.049369717482477427, "epoch": 0.10522088353413654, "frac_reward_zero_std": 0.5, "grad_norm": 1.999069094657898, "learning_rate": 6.48e-07, "loss": -0.0362, "num_tokens": 915698.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.6347160339355469, "sampling/importance_sampling_ratio/mean": 0.9998428821563721, "sampling/importance_sampling_ratio/min": 0.6938474774360657, "sampling/sampling_logp_difference/max": 0.491469144821167, "sampling/sampling_logp_difference/mean": 0.004718809388577938, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 144.0, "completions/mean_terminated_length": 144.0, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.07888981839641929, "epoch": 0.10548862115127175, "frac_reward_zero_std": 0.5, "grad_norm": 1.6689685583114624, "learning_rate": 6.42e-07, "loss": -0.02, "num_tokens": 918150.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.3646961450576782, "sampling/importance_sampling_ratio/mean": 0.9983444213867188, "sampling/importance_sampling_ratio/min": 0.5686691403388977, "sampling/sampling_logp_difference/max": 0.5644564628601074, "sampling/sampling_logp_difference/mean": 0.0067650992423295975, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 117.375, "completions/mean_terminated_length": 117.375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.09501227922737598, "epoch": 0.10575635876840696, "frac_reward_zero_std": 0.5, "grad_norm": 2.7259342670440674, "learning_rate": 6.36e-07, "loss": -0.1476, "num_tokens": 920165.0, "reward": 0.625, "reward_std": 0.3061862289905548, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 0.5669467449188232, "sampling/importance_sampling_ratio/max": 1.2076606750488281, "sampling/importance_sampling_ratio/mean": 0.9990693926811218, "sampling/importance_sampling_ratio/min": 0.609165370464325, "sampling/sampling_logp_difference/max": 0.4956655502319336, "sampling/sampling_logp_difference/mean": 0.007825275883078575, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 161.125, "completions/mean_terminated_length": 161.125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.07538006594404578, "epoch": 0.10602409638554217, "frac_reward_zero_std": 0.5, "grad_norm": 1.9610947370529175, "learning_rate": 6.3e-07, "loss": -0.0106, "num_tokens": 922826.0, "reward": 0.5625, "reward_std": 0.375, "rewards/reward_fn/mean": 0.5625, "rewards/reward_fn/std": 0.6781013607978821, "sampling/importance_sampling_ratio/max": 1.725523591041565, "sampling/importance_sampling_ratio/mean": 0.9996272921562195, "sampling/importance_sampling_ratio/min": 0.43628066778182983, "sampling/sampling_logp_difference/max": 0.8294695019721985, "sampling/sampling_logp_difference/mean": 0.006852356251329184, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 119.75, "completions/mean_terminated_length": 119.75, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.08384125307202339, "epoch": 0.10629183400267737, "frac_reward_zero_std": 0.5, "grad_norm": 2.567168951034546, "learning_rate": 6.24e-07, "loss": 0.0445, "num_tokens": 925104.0, "reward": 0.8125, "reward_std": 0.375, "rewards/reward_fn/mean": 0.8125, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.8134123086929321, "sampling/importance_sampling_ratio/mean": 1.0012052059173584, "sampling/importance_sampling_ratio/min": 0.4046269655227661, "sampling/sampling_logp_difference/max": 0.9047896862030029, "sampling/sampling_logp_difference/mean": 0.007590695284307003, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/max_terminated_length": 118.0, "completions/mean_length": 83.125, "completions/mean_terminated_length": 83.125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.07682485226541758, "epoch": 0.10655957161981258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.18e-07, "loss": 0.0, "num_tokens": 926829.0, "reward": 0.625, "reward_std": 0.0, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 0.40089187026023865, "sampling/importance_sampling_ratio/max": 1.2372941970825195, "sampling/importance_sampling_ratio/mean": 0.9988314509391785, "sampling/importance_sampling_ratio/min": 0.8122565150260925, "sampling/sampling_logp_difference/max": 0.21292686462402344, "sampling/sampling_logp_difference/mean": 0.005808745976537466, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 169.625, "completions/mean_terminated_length": 169.625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.07382126664742827, "epoch": 0.10682730923694779, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.12e-07, "loss": 0.0, "num_tokens": 929642.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6034724712371826, "sampling/importance_sampling_ratio/mean": 0.999808669090271, "sampling/importance_sampling_ratio/min": 0.679843544960022, "sampling/sampling_logp_difference/max": 0.4721715450286865, "sampling/sampling_logp_difference/mean": 0.007272281218320131, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 138.875, "completions/mean_terminated_length": 138.875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.09032107959501445, "epoch": 0.107095046854083, "frac_reward_zero_std": 0.5, "grad_norm": 2.3996057510375977, "learning_rate": 6.060000000000001e-07, "loss": -0.0588, "num_tokens": 932257.0, "reward": 0.6875, "reward_std": 0.3145764470100403, "rewards/reward_fn/mean": 0.6875, "rewards/reward_fn/std": 0.5303300619125366, "sampling/importance_sampling_ratio/max": 1.2555066347122192, "sampling/importance_sampling_ratio/mean": 1.0004559755325317, "sampling/importance_sampling_ratio/min": 0.6405255198478699, "sampling/sampling_logp_difference/max": 0.4454662799835205, "sampling/sampling_logp_difference/mean": 0.005942198913544416, "step": 400 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 932257, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }