{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0135, "eval_steps": 500, "global_step": 1350, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1430.0, "completions/max_terminated_length": 1430.0, "completions/mean_length": 390.71875, "completions/mean_terminated_length": 390.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.503647923469543, "epoch": 1e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.01053933147341013, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0003, "num_tokens": 49837.0, "reward": 0.21860405802726746, "reward_std": 1.510036826133728, "rewards/rollout_reward_func/mean": 0.21860405802726746, "rewards/rollout_reward_func/std": 1.5175718069076538, "sampling/importance_sampling_ratio/max": 0.04880070313811302, "sampling/importance_sampling_ratio/mean": 0.00836901180446148, "sampling/importance_sampling_ratio/min": 1.8566814481268262e-13, "sampling/sampling_logp_difference/max": 10.760838508605957, "sampling/sampling_logp_difference/mean": 1.7294938564300537, "step": 1, "step_time": 11.826248200999544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.503647923469543, "epoch": 2e-05, "grad_norm": 0.010674488730728626, "kl": 0.0, "learning_rate": 2.8571428571428575e-07, "loss": 0.0003, "step": 2, "step_time": 5.830465508999623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 773.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 103.71875, "completions/mean_terminated_length": 103.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.892502665519714, "epoch": 3e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.017932534217834473, "kl": 0.001027411864924943, "learning_rate": 5.714285714285715e-07, "loss": -0.0002, "num_tokens": 84742.0, "reward": 0.0745391920208931, "reward_std": 1.4993906021118164, "rewards/rollout_reward_func/mean": 0.0745391920208931, "rewards/rollout_reward_func/std": 1.5188325643539429, "sampling/importance_sampling_ratio/max": 0.0681685209274292, "sampling/importance_sampling_ratio/mean": 0.020558957010507584, "sampling/importance_sampling_ratio/min": 5.531217539100908e-06, "sampling/sampling_logp_difference/max": 4.820071220397949, "sampling/sampling_logp_difference/mean": 1.796181559562683, "step": 3, "step_time": 8.05213572899629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.890735983848572, "epoch": 4e-05, "grad_norm": 0.01817218028008938, "kl": 0.0008793309643806424, "learning_rate": 8.571428571428572e-07, "loss": -0.0002, "step": 4, "step_time": 4.397696408999764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1520.0, "completions/max_terminated_length": 1520.0, "completions/mean_length": 482.28125, "completions/mean_terminated_length": 482.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.531500458717346, "epoch": 5e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00725229037925601, "kl": 0.0009109678794629872, "learning_rate": 1.142857142857143e-06, "loss": 0.0, "num_tokens": 137269.0, "reward": 0.4154509902000427, "reward_std": 1.5819138288497925, "rewards/rollout_reward_func/mean": 0.4154509902000427, "rewards/rollout_reward_func/std": 1.572078824043274, "sampling/importance_sampling_ratio/max": 0.01950044184923172, "sampling/importance_sampling_ratio/mean": 0.004303420893847942, "sampling/importance_sampling_ratio/min": 5.057508665334831e-12, "sampling/sampling_logp_difference/max": 4.2797441482543945, "sampling/sampling_logp_difference/mean": 1.6404157876968384, "step": 5, "step_time": 11.525438371001655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.522217988967896, "epoch": 6e-05, "grad_norm": 0.007474060170352459, "kl": 0.0007408717792714015, "learning_rate": 1.4285714285714286e-06, "loss": 0.0, "step": 6, "step_time": 5.946805360001235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 227.6875, "completions/mean_terminated_length": 227.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.60615086555481, "epoch": 7e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.014114073477685452, "kl": 0.0005928883474553004, "learning_rate": 1.7142857142857145e-06, "loss": 0.0004, "num_tokens": 179097.0, "reward": 0.15339350700378418, "reward_std": 1.3799924850463867, "rewards/rollout_reward_func/mean": 0.15339350700378418, "rewards/rollout_reward_func/std": 1.512816309928894, "sampling/importance_sampling_ratio/max": 0.047606635838747025, "sampling/importance_sampling_ratio/mean": 0.01643146388232708, "sampling/importance_sampling_ratio/min": 7.434471332827541e-13, "sampling/sampling_logp_difference/max": 4.103754043579102, "sampling/sampling_logp_difference/mean": 1.569874882698059, "step": 7, "step_time": 8.650433748996875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.59926450252533, "epoch": 8e-05, "grad_norm": 0.013981531374156475, "kl": 0.0010546768244239502, "learning_rate": 2.0000000000000003e-06, "loss": 0.0004, "step": 8, "step_time": 4.870647691997874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 214.625, "completions/mean_terminated_length": 214.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.781949162483215, "epoch": 9e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.008456221781671047, "kl": 0.001456338526622858, "learning_rate": 2.285714285714286e-06, "loss": 0.0002, "num_tokens": 221504.0, "reward": -0.03815160691738129, "reward_std": 1.3043205738067627, "rewards/rollout_reward_func/mean": -0.03815160691738129, "rewards/rollout_reward_func/std": 1.3641914129257202, "sampling/importance_sampling_ratio/max": 0.056080833077430725, "sampling/importance_sampling_ratio/mean": 0.011154292151331902, "sampling/importance_sampling_ratio/min": 9.532552163683267e-20, "sampling/sampling_logp_difference/max": 14.188918113708496, "sampling/sampling_logp_difference/mean": 1.9105571508407593, "step": 9, "step_time": 8.345916734002458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.776870012283325, "epoch": 0.0001, "grad_norm": 0.008276408538222313, "kl": 0.0015252166340360418, "learning_rate": 2.571428571428571e-06, "loss": 0.0002, "step": 10, "step_time": 5.5083886419979535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1071.0, "completions/max_terminated_length": 1071.0, "completions/mean_length": 227.96875, "completions/mean_terminated_length": 227.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.853073716163635, "epoch": 0.00011, "frac_reward_zero_std": 0.0, "grad_norm": 0.012281826697289944, "kl": 0.0010711039940360934, "learning_rate": 2.8571428571428573e-06, "loss": 0.0003, "num_tokens": 264206.0, "reward": 0.30741074681282043, "reward_std": 1.6079063415527344, "rewards/rollout_reward_func/mean": 0.30741074681282043, "rewards/rollout_reward_func/std": 1.605423092842102, "sampling/importance_sampling_ratio/max": 0.05672132223844528, "sampling/importance_sampling_ratio/mean": 0.013314912095665932, "sampling/importance_sampling_ratio/min": 1.9117207816776727e-09, "sampling/sampling_logp_difference/max": 2.415975570678711, "sampling/sampling_logp_difference/mean": 1.7506613731384277, "step": 11, "step_time": 8.958855187998779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.862006187438965, "epoch": 0.00012, "grad_norm": 0.011902211233973503, "kl": 0.0015465187825611793, "learning_rate": 3.142857142857143e-06, "loss": 0.0003, "step": 12, "step_time": 5.0128902190026565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1345.0, "completions/max_terminated_length": 1345.0, "completions/mean_length": 163.875, "completions/mean_terminated_length": 163.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.826934695243835, "epoch": 0.00013, "frac_reward_zero_std": 0.0, "grad_norm": 0.03199106082320213, "kl": 0.003164995403494686, "learning_rate": 3.428571428571429e-06, "loss": -0.0009, "num_tokens": 301790.0, "reward": 0.5541991591453552, "reward_std": 1.363976001739502, "rewards/rollout_reward_func/mean": 0.5541991591453552, "rewards/rollout_reward_func/std": 1.5858471393585205, "sampling/importance_sampling_ratio/max": 0.08638323098421097, "sampling/importance_sampling_ratio/mean": 0.02800275757908821, "sampling/importance_sampling_ratio/min": 4.827537757329389e-16, "sampling/sampling_logp_difference/max": 4.43721866607666, "sampling/sampling_logp_difference/mean": 1.7803475856781006, "step": 13, "step_time": 9.892554498001118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.787010669708252, "epoch": 0.00014, "grad_norm": 0.03249078616499901, "kl": 0.004570890654576942, "learning_rate": 3.7142857142857146e-06, "loss": -0.001, "step": 14, "step_time": 5.601703357999213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1029.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 253.46875, "completions/mean_terminated_length": 261.1290283203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.75649344921112, "epoch": 0.00015, "frac_reward_zero_std": 0.0, "grad_norm": 0.012387493625283241, "kl": 0.0064363747369498014, "learning_rate": 4.000000000000001e-06, "loss": -0.0001, "num_tokens": 345689.0, "reward": 0.2678149938583374, "reward_std": 1.007262110710144, "rewards/rollout_reward_func/mean": 0.2678149938583374, "rewards/rollout_reward_func/std": 1.2396751642227173, "sampling/importance_sampling_ratio/max": 0.06502779573202133, "sampling/importance_sampling_ratio/mean": 0.016495101153850555, "sampling/importance_sampling_ratio/min": 4.91094998326079e-14, "sampling/sampling_logp_difference/max": 4.969776153564453, "sampling/sampling_logp_difference/mean": 1.7686097621917725, "step": 15, "step_time": 8.895431961002032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.760064482688904, "epoch": 0.00016, "grad_norm": 0.012612699531018734, "kl": 0.007741181296296418, "learning_rate": 4.2857142857142855e-06, "loss": -0.0001, "step": 16, "step_time": 5.832437281002058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.0, "completions/max_length": 1045.0, "completions/max_terminated_length": 1045.0, "completions/mean_length": 366.90625, "completions/mean_terminated_length": 366.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.540326118469238, "epoch": 0.00017, "frac_reward_zero_std": 0.0, "grad_norm": 0.007581131067126989, "kl": 0.009920262295054272, "learning_rate": 4.571428571428572e-06, "loss": -0.0001, "num_tokens": 394125.0, "reward": 0.6872893571853638, "reward_std": 1.4782172441482544, "rewards/rollout_reward_func/mean": 0.6872893571853638, "rewards/rollout_reward_func/std": 1.6554666757583618, "sampling/importance_sampling_ratio/max": 0.04768194630742073, "sampling/importance_sampling_ratio/mean": 0.00561548862606287, "sampling/importance_sampling_ratio/min": 8.230732539971086e-09, "sampling/sampling_logp_difference/max": 4.795921325683594, "sampling/sampling_logp_difference/mean": 1.7115674018859863, "step": 17, "step_time": 9.275694907002617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 8.520831406116486, "epoch": 0.00018, "grad_norm": 0.00865135621279478, "kl": 0.014904321054928005, "learning_rate": 4.857142857142858e-06, "loss": -0.0001, "step": 18, "step_time": 4.98086209599569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1234.0, "completions/max_terminated_length": 1234.0, "completions/mean_length": 225.5, "completions/mean_terminated_length": 231.9666748046875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.32957148551941, "epoch": 0.00019, "frac_reward_zero_std": 0.0, "grad_norm": 0.021184073761105537, "kl": 0.024745691451244056, "learning_rate": 5.142857142857142e-06, "loss": -0.001, "num_tokens": 435483.0, "reward": -0.0033794045448303223, "reward_std": 0.6692992448806763, "rewards/rollout_reward_func/mean": -0.0033794045448303223, "rewards/rollout_reward_func/std": 1.4659594297409058, "sampling/importance_sampling_ratio/max": 0.07749442756175995, "sampling/importance_sampling_ratio/mean": 0.021772563457489014, "sampling/importance_sampling_ratio/min": 3.699305618776183e-12, "sampling/sampling_logp_difference/max": 4.678776741027832, "sampling/sampling_logp_difference/mean": 1.577494502067566, "step": 19, "step_time": 9.652002476999769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.265057027339935, "epoch": 0.0002, "grad_norm": 0.02241630107164383, "kl": 0.031153250252828002, "learning_rate": 5.428571428571429e-06, "loss": -0.0011, "step": 20, "step_time": 5.367322839003464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1164.0, "completions/max_terminated_length": 1164.0, "completions/mean_length": 250.0625, "completions/mean_terminated_length": 250.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.517916679382324, "epoch": 0.00021, "frac_reward_zero_std": 0.0, "grad_norm": 0.015293768607079983, "kl": 0.04098286898806691, "learning_rate": 5.7142857142857145e-06, "loss": -0.0001, "num_tokens": 478402.0, "reward": 0.17407754063606262, "reward_std": 1.3331656455993652, "rewards/rollout_reward_func/mean": 0.17407754063606262, "rewards/rollout_reward_func/std": 1.5952818393707275, "sampling/importance_sampling_ratio/max": 0.10680845379829407, "sampling/importance_sampling_ratio/mean": 0.026019353419542313, "sampling/importance_sampling_ratio/min": 1.8563724843889857e-14, "sampling/sampling_logp_difference/max": 5.039712905883789, "sampling/sampling_logp_difference/mean": 1.7119972705841064, "step": 21, "step_time": 9.140643269001885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.431648015975952, "epoch": 0.00022, "grad_norm": 0.015540793538093567, "kl": 0.050505449529737234, "learning_rate": 6e-06, "loss": -0.0001, "step": 22, "step_time": 6.035511868998583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1404.0, "completions/max_terminated_length": 1404.0, "completions/mean_length": 229.96875, "completions/mean_terminated_length": 229.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.28187245130539, "epoch": 0.00023, "frac_reward_zero_std": 0.0, "grad_norm": 0.01112386491149664, "kl": 0.05019510630518198, "learning_rate": 6.285714285714286e-06, "loss": -0.0005, "num_tokens": 522044.0, "reward": 0.8441202044487, "reward_std": 1.1366348266601562, "rewards/rollout_reward_func/mean": 0.8441202044487, "rewards/rollout_reward_func/std": 1.4684224128723145, "sampling/importance_sampling_ratio/max": 0.11088211834430695, "sampling/importance_sampling_ratio/mean": 0.024211009964346886, "sampling/importance_sampling_ratio/min": 1.2648725089547952e-07, "sampling/sampling_logp_difference/max": 2.7317633628845215, "sampling/sampling_logp_difference/mean": 1.5488548278808594, "step": 23, "step_time": 9.90866866000033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.190486311912537, "epoch": 0.00024, "grad_norm": 0.010955761186778545, "kl": 0.060427254531532526, "learning_rate": 6.571428571428572e-06, "loss": -0.0005, "step": 24, "step_time": 5.680853004001619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 96.1875, "completions/mean_terminated_length": 96.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.237438321113586, "epoch": 0.00025, "frac_reward_zero_std": 0.0, "grad_norm": 0.021000884473323822, "kl": 0.07217494118958712, "learning_rate": 6.857142857142858e-06, "loss": -0.002, "num_tokens": 557864.0, "reward": 0.2321830540895462, "reward_std": 1.4286524057388306, "rewards/rollout_reward_func/mean": 0.2321830540895462, "rewards/rollout_reward_func/std": 1.6191457509994507, "sampling/importance_sampling_ratio/max": 0.1522192656993866, "sampling/importance_sampling_ratio/mean": 0.04030841961503029, "sampling/importance_sampling_ratio/min": 9.864000892131908e-13, "sampling/sampling_logp_difference/max": 4.821762561798096, "sampling/sampling_logp_difference/mean": 1.6221089363098145, "step": 25, "step_time": 6.645266196002922 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 8.119462728500366, "epoch": 0.00026, "grad_norm": 0.017815686762332916, "kl": 0.07895512518007308, "learning_rate": 7.1428571428571436e-06, "loss": -0.0021, "step": 26, "step_time": 3.580845648997638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 126.6875, "completions/mean_terminated_length": 126.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.889269471168518, "epoch": 0.00027, "frac_reward_zero_std": 0.0, "grad_norm": 0.01796940341591835, "kl": 0.1194403893314302, "learning_rate": 7.428571428571429e-06, "loss": -0.0028, "num_tokens": 595596.0, "reward": 0.45237910747528076, "reward_std": 1.3235435485839844, "rewards/rollout_reward_func/mean": 0.45237910747528076, "rewards/rollout_reward_func/std": 1.6986472606658936, "sampling/importance_sampling_ratio/max": 0.1697671264410019, "sampling/importance_sampling_ratio/mean": 0.0553402304649353, "sampling/importance_sampling_ratio/min": 2.4898798578476544e-09, "sampling/sampling_logp_difference/max": 4.452243328094482, "sampling/sampling_logp_difference/mean": 1.6267218589782715, "step": 27, "step_time": 7.124761773999126 }, { "clip_ratio/high_max": 0.09375, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 7.762769103050232, "epoch": 0.00028, "grad_norm": 0.015437250956892967, "kl": 0.13411633856594563, "learning_rate": 7.714285714285716e-06, "loss": -0.0029, "step": 28, "step_time": 4.926096689998303 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014062500093132257, "completions/clipped_ratio": 0.0, "completions/max_length": 1151.0, "completions/max_terminated_length": 1151.0, "completions/mean_length": 255.5625, "completions/mean_terminated_length": 255.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.629159092903137, "epoch": 0.00029, "frac_reward_zero_std": 0.0, "grad_norm": 0.017112798988819122, "kl": 0.10179391177371144, "learning_rate": 8.000000000000001e-06, "loss": -0.0016, "num_tokens": 637667.0, "reward": 0.2644121050834656, "reward_std": 1.5657849311828613, "rewards/rollout_reward_func/mean": 0.2644121050834656, "rewards/rollout_reward_func/std": 1.7078864574432373, "sampling/importance_sampling_ratio/max": 0.1699082851409912, "sampling/importance_sampling_ratio/mean": 0.02451401948928833, "sampling/importance_sampling_ratio/min": 4.781616098625818e-07, "sampling/sampling_logp_difference/max": 3.8583059310913086, "sampling/sampling_logp_difference/mean": 1.4698448181152344, "step": 29, "step_time": 9.071784344996559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 7.491530001163483, "epoch": 0.0003, "grad_norm": 0.01688656397163868, "kl": 0.1073358510620892, "learning_rate": 8.285714285714287e-06, "loss": -0.0016, "step": 30, "step_time": 5.187829844000589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 294.3125, "completions/mean_terminated_length": 294.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.219549238681793, "epoch": 0.00031, "frac_reward_zero_std": 0.0, "grad_norm": 0.012751653790473938, "kl": 0.1194508052431047, "learning_rate": 8.571428571428571e-06, "loss": -0.0025, "num_tokens": 682570.0, "reward": 0.6622599363327026, "reward_std": 1.2865434885025024, "rewards/rollout_reward_func/mean": 0.6622599363327026, "rewards/rollout_reward_func/std": 1.4868965148925781, "sampling/importance_sampling_ratio/max": 0.21310703456401825, "sampling/importance_sampling_ratio/mean": 0.05447055771946907, "sampling/importance_sampling_ratio/min": 1.3286771718412638e-05, "sampling/sampling_logp_difference/max": 3.808666467666626, "sampling/sampling_logp_difference/mean": 1.3567442893981934, "step": 31, "step_time": 8.432441721999567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.122569680213928, "epoch": 0.00032, "grad_norm": 0.012448047287762165, "kl": 0.129300226457417, "learning_rate": 8.857142857142858e-06, "loss": -0.0026, "step": 32, "step_time": 4.625785149000876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1602.0, "completions/max_terminated_length": 1602.0, "completions/mean_length": 385.125, "completions/mean_terminated_length": 386.9032287597656, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.8571107387542725, "epoch": 0.00033, "frac_reward_zero_std": 0.0, "grad_norm": 0.011674328707158566, "kl": 0.1611535008996725, "learning_rate": 9.142857142857144e-06, "loss": -0.0013, "num_tokens": 730388.0, "reward": 0.657553493976593, "reward_std": 1.437880277633667, "rewards/rollout_reward_func/mean": 0.657553493976593, "rewards/rollout_reward_func/std": 1.4631379842758179, "sampling/importance_sampling_ratio/max": 0.22315549850463867, "sampling/importance_sampling_ratio/mean": 0.061460018157958984, "sampling/importance_sampling_ratio/min": 3.4622338329626245e-09, "sampling/sampling_logp_difference/max": 4.656834602355957, "sampling/sampling_logp_difference/mean": 1.2767832279205322, "step": 33, "step_time": 10.797754528997757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.809687912464142, "epoch": 0.00034, "grad_norm": 0.01121480017900467, "kl": 0.16510186530649662, "learning_rate": 9.42857142857143e-06, "loss": -0.0013, "step": 34, "step_time": 6.442160835997129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 15.71875, "completions/mean_terminated_length": 15.709676742553711, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.408268332481384, "epoch": 0.00035, "frac_reward_zero_std": 0.0, "grad_norm": 0.027532009407877922, "kl": 0.3084005378186703, "learning_rate": 9.714285714285715e-06, "loss": -0.0077, "num_tokens": 761720.0, "reward": 0.4543757438659668, "reward_std": 1.2813209295272827, "rewards/rollout_reward_func/mean": 0.4543757438659668, "rewards/rollout_reward_func/std": 1.5850839614868164, "sampling/importance_sampling_ratio/max": 0.2473972737789154, "sampling/importance_sampling_ratio/mean": 0.10902312397956848, "sampling/importance_sampling_ratio/min": 2.4215635052726725e-10, "sampling/sampling_logp_difference/max": 4.783283710479736, "sampling/sampling_logp_difference/mean": 1.4385796785354614, "step": 35, "step_time": 5.794476761002443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.026041666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026041666977107525, "entropy": 7.312821626663208, "epoch": 0.00036, "grad_norm": 0.031126106157898903, "kl": 0.3366778027266264, "learning_rate": 1e-05, "loss": -0.0079, "step": 36, "step_time": 3.273662132998652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 170.78125, "completions/mean_terminated_length": 170.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.80996310710907, "epoch": 0.00037, "frac_reward_zero_std": 0.0, "grad_norm": 0.034487318247556686, "kl": 0.3705834187567234, "learning_rate": 9.9999999995372e-06, "loss": -0.0017, "num_tokens": 800406.0, "reward": 1.3762025833129883, "reward_std": 1.0732836723327637, "rewards/rollout_reward_func/mean": 1.3762025833129883, "rewards/rollout_reward_func/std": 1.2089124917984009, "sampling/importance_sampling_ratio/max": 0.270050048828125, "sampling/importance_sampling_ratio/mean": 0.0765732079744339, "sampling/importance_sampling_ratio/min": 7.707410986768082e-05, "sampling/sampling_logp_difference/max": 4.371009826660156, "sampling/sampling_logp_difference/mean": 1.1559932231903076, "step": 37, "step_time": 8.572978425996553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.026041666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026041666977107525, "entropy": 6.771854102611542, "epoch": 0.00038, "grad_norm": 0.023663703352212906, "kl": 0.36311932653188705, "learning_rate": 9.999999998148802e-06, "loss": -0.0018, "step": 38, "step_time": 4.840308881997771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1094.0, "completions/max_terminated_length": 1094.0, "completions/mean_length": 340.03125, "completions/mean_terminated_length": 340.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.101892292499542, "epoch": 0.00039, "frac_reward_zero_std": 0.0, "grad_norm": 0.026734596118330956, "kl": 0.23003484681248665, "learning_rate": 9.999999995834804e-06, "loss": 0.0006, "num_tokens": 846458.0, "reward": 0.18020382523536682, "reward_std": 1.3345727920532227, "rewards/rollout_reward_func/mean": 0.18020382523536682, "rewards/rollout_reward_func/std": 1.5456935167312622, "sampling/importance_sampling_ratio/max": 0.2798444926738739, "sampling/importance_sampling_ratio/mean": 0.04665118083357811, "sampling/importance_sampling_ratio/min": 1.3666953679880578e-10, "sampling/sampling_logp_difference/max": 11.348541259765625, "sampling/sampling_logp_difference/mean": 1.3109968900680542, "step": 39, "step_time": 9.981618785999672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.08965773973613977, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.08965773973613977, "entropy": 7.170056581497192, "epoch": 0.0004, "grad_norm": 0.01202967669814825, "kl": 0.20657090097665787, "learning_rate": 9.999999992595207e-06, "loss": 0.0006, "step": 40, "step_time": 5.1502111310001055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1327.0, "completions/max_terminated_length": 1327.0, "completions/mean_length": 350.125, "completions/mean_terminated_length": 350.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.999071955680847, "epoch": 0.00041, "frac_reward_zero_std": 0.0, "grad_norm": 0.08163890987634659, "kl": 0.6564499950036407, "learning_rate": 9.999999988430008e-06, "loss": -0.002, "num_tokens": 891120.0, "reward": 0.9664598703384399, "reward_std": 1.4639317989349365, "rewards/rollout_reward_func/mean": 0.9664598703384399, "rewards/rollout_reward_func/std": 1.570468544960022, "sampling/importance_sampling_ratio/max": 0.3111140727996826, "sampling/importance_sampling_ratio/mean": 0.09244965016841888, "sampling/importance_sampling_ratio/min": 7.74041311046858e-08, "sampling/sampling_logp_difference/max": 2.932063102722168, "sampling/sampling_logp_difference/mean": 1.0426199436187744, "step": 41, "step_time": 10.354115635000198 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.030505952890962362, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04092262079939246, "entropy": 6.03212833404541, "epoch": 0.00042, "grad_norm": 0.04527117311954498, "kl": 0.4457097928971052, "learning_rate": 9.999999983339212e-06, "loss": -0.0024, "step": 42, "step_time": 5.677591968998968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 173.78125, "completions/mean_terminated_length": 162.64515686035156, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.9677117466926575, "epoch": 0.00043, "frac_reward_zero_std": 0.0, "grad_norm": 0.046699922531843185, "kl": 0.2800980545580387, "learning_rate": 9.999999977322818e-06, "loss": 0.0008, "num_tokens": 931551.0, "reward": -0.07293945550918579, "reward_std": 1.3585679531097412, "rewards/rollout_reward_func/mean": -0.07293945550918579, "rewards/rollout_reward_func/std": 1.3309086561203003, "sampling/importance_sampling_ratio/max": 0.31925565004348755, "sampling/importance_sampling_ratio/mean": 0.0992564707994461, "sampling/importance_sampling_ratio/min": 3.88780015125878e-11, "sampling/sampling_logp_difference/max": 3.822598695755005, "sampling/sampling_logp_difference/mean": 1.3727352619171143, "step": 43, "step_time": 7.911135208996711 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.026041666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03645833395421505, "entropy": 7.107381463050842, "epoch": 0.00044, "grad_norm": 0.03732454404234886, "kl": 0.27290547639131546, "learning_rate": 9.999999970380822e-06, "loss": 0.0004, "step": 44, "step_time": 4.393481617000361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 50.53125, "completions/mean_terminated_length": 50.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.275720000267029, "epoch": 0.00045, "frac_reward_zero_std": 0.0, "grad_norm": 0.03643759712576866, "kl": 0.27540356293320656, "learning_rate": 9.999999962513228e-06, "loss": -0.0062, "num_tokens": 963175.0, "reward": 0.5798416137695312, "reward_std": 1.8553012609481812, "rewards/rollout_reward_func/mean": 0.5798416137695312, "rewards/rollout_reward_func/std": 1.8484326601028442, "sampling/importance_sampling_ratio/max": 0.32663026452064514, "sampling/importance_sampling_ratio/mean": 0.1316564679145813, "sampling/importance_sampling_ratio/min": 1.305834060606986e-15, "sampling/sampling_logp_difference/max": 6.3899688720703125, "sampling/sampling_logp_difference/mean": 1.5742114782333374, "step": 45, "step_time": 7.820503961003851 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.026041666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.041666666977107525, "entropy": 7.316620469093323, "epoch": 0.00046, "grad_norm": 0.0320768877863884, "kl": 0.2760621029883623, "learning_rate": 9.999999953720035e-06, "loss": -0.0062, "step": 46, "step_time": 3.748464399997829 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 1063.0, "completions/max_terminated_length": 1063.0, "completions/mean_length": 406.3125, "completions/mean_terminated_length": 406.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.247396767139435, "epoch": 0.00047, "frac_reward_zero_std": 0.0, "grad_norm": 0.047496166080236435, "kl": 0.22775631863623857, "learning_rate": 9.99999994400124e-06, "loss": -0.0024, "num_tokens": 1013139.0, "reward": 0.3617481589317322, "reward_std": 1.5428225994110107, "rewards/rollout_reward_func/mean": 0.3617481589317322, "rewards/rollout_reward_func/std": 1.663814663887024, "sampling/importance_sampling_ratio/max": 0.08151674270629883, "sampling/importance_sampling_ratio/mean": 0.01615138351917267, "sampling/importance_sampling_ratio/min": 3.156197792009152e-08, "sampling/sampling_logp_difference/max": 3.105797529220581, "sampling/sampling_logp_difference/mean": 1.285693645477295, "step": 47, "step_time": 9.300300393995713 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 7.2613853216171265, "epoch": 0.00048, "grad_norm": 0.03952091932296753, "kl": 0.19108295999467373, "learning_rate": 9.999999933356848e-06, "loss": -0.0024, "step": 48, "step_time": 5.077105316000598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 970.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 202.5625, "completions/mean_terminated_length": 208.5806427001953, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.268330633640289, "epoch": 0.00049, "frac_reward_zero_std": 0.0, "grad_norm": 0.04115511104464531, "kl": 0.1908857487142086, "learning_rate": 9.999999921786855e-06, "loss": -0.004, "num_tokens": 1053647.0, "reward": 0.36281874775886536, "reward_std": 1.3740019798278809, "rewards/rollout_reward_func/mean": 0.36281874775886536, "rewards/rollout_reward_func/std": 1.658832311630249, "sampling/importance_sampling_ratio/max": 0.3280561864376068, "sampling/importance_sampling_ratio/mean": 0.09627939760684967, "sampling/importance_sampling_ratio/min": 5.470304114199576e-13, "sampling/sampling_logp_difference/max": 4.308981895446777, "sampling/sampling_logp_difference/mean": 1.3999559879302979, "step": 49, "step_time": 8.803509736997512 }, { "clip_ratio/high_max": 0.061698718927800655, "clip_ratio/high_mean": 0.030849359463900328, "clip_ratio/low_mean": 0.03866185899823904, "clip_ratio/low_min": 0.009615384973585606, "clip_ratio/region_mean": 0.06951121799647808, "entropy": 7.303388833999634, "epoch": 0.0005, "grad_norm": 0.026844611391425133, "kl": 0.1651767659932375, "learning_rate": 9.999999909291265e-06, "loss": -0.0042, "step": 50, "step_time": 5.2479510689991 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 152.46875, "completions/mean_terminated_length": 152.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.6286501288414, "epoch": 0.00051, "frac_reward_zero_std": 0.25, "grad_norm": 0.02272552251815796, "kl": 0.2713352106511593, "learning_rate": 9.999999895870075e-06, "loss": -0.0034, "num_tokens": 1092298.0, "reward": 1.3554623126983643, "reward_std": 1.2201993465423584, "rewards/rollout_reward_func/mean": 1.3554623126983643, "rewards/rollout_reward_func/std": 1.4114317893981934, "sampling/importance_sampling_ratio/max": 0.3435327410697937, "sampling/importance_sampling_ratio/mean": 0.1417078971862793, "sampling/importance_sampling_ratio/min": 0.00014692850527353585, "sampling/sampling_logp_difference/max": 3.0351338386535645, "sampling/sampling_logp_difference/mean": 1.1783831119537354, "step": 51, "step_time": 8.89473703800104 }, { "clip_ratio/high_max": 0.1197916679084301, "clip_ratio/high_mean": 0.05989583395421505, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05989583395421505, "entropy": 6.605536103248596, "epoch": 0.00052, "grad_norm": 0.019110489636659622, "kl": 0.2619143519550562, "learning_rate": 9.999999881523285e-06, "loss": -0.0034, "step": 52, "step_time": 4.836728063997725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1237.0, "completions/max_terminated_length": 1237.0, "completions/mean_length": 546.65625, "completions/mean_terminated_length": 549.4193115234375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.710606813430786, "epoch": 0.00053, "frac_reward_zero_std": 0.0, "grad_norm": 0.01761268451809883, "kl": 0.12341306498274207, "learning_rate": 9.999999866250896e-06, "loss": -0.0015, "num_tokens": 1147221.0, "reward": 0.673620343208313, "reward_std": 1.6765152215957642, "rewards/rollout_reward_func/mean": 0.673620343208313, "rewards/rollout_reward_func/std": 1.6075048446655273, "sampling/importance_sampling_ratio/max": 0.07806026190519333, "sampling/importance_sampling_ratio/mean": 0.014422687701880932, "sampling/importance_sampling_ratio/min": 1.8906127748422041e-13, "sampling/sampling_logp_difference/max": 4.076859474182129, "sampling/sampling_logp_difference/mean": 1.3921265602111816, "step": 53, "step_time": 10.227058177995787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.712913393974304, "epoch": 0.00054, "grad_norm": 0.01709599979221821, "kl": 0.11993480985984206, "learning_rate": 9.999999850052909e-06, "loss": -0.0015, "step": 54, "step_time": 5.4916659079990495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 216.84375, "completions/mean_terminated_length": 216.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.985095500946045, "epoch": 0.00055, "frac_reward_zero_std": 0.0, "grad_norm": 0.03449093550443649, "kl": 0.25550913996994495, "learning_rate": 9.99999983292932e-06, "loss": -0.0055, "num_tokens": 1187468.0, "reward": 0.468566358089447, "reward_std": 1.2987395524978638, "rewards/rollout_reward_func/mean": 0.468566358089447, "rewards/rollout_reward_func/std": 1.5708911418914795, "sampling/importance_sampling_ratio/max": 0.35531288385391235, "sampling/importance_sampling_ratio/mean": 0.10985319316387177, "sampling/importance_sampling_ratio/min": 3.525473948684521e-05, "sampling/sampling_logp_difference/max": 2.8317575454711914, "sampling/sampling_logp_difference/mean": 1.197380781173706, "step": 55, "step_time": 8.530665742997371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.980204820632935, "epoch": 0.00056, "grad_norm": 0.03531354293227196, "kl": 0.25823790952563286, "learning_rate": 9.999999814880132e-06, "loss": -0.0056, "step": 56, "step_time": 5.323749295999733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1044.0, "completions/max_terminated_length": 1044.0, "completions/mean_length": 245.5, "completions/mean_terminated_length": 245.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.020567119121552, "epoch": 0.00057, "frac_reward_zero_std": 0.0, "grad_norm": 0.024744149297475815, "kl": 0.14257202204316854, "learning_rate": 9.999999795905347e-06, "loss": -0.0008, "num_tokens": 1230722.0, "reward": -0.39199885725975037, "reward_std": 0.8500816226005554, "rewards/rollout_reward_func/mean": -0.39199885725975037, "rewards/rollout_reward_func/std": 1.2477563619613647, "sampling/importance_sampling_ratio/max": 0.16220532357692719, "sampling/importance_sampling_ratio/mean": 0.024032242596149445, "sampling/importance_sampling_ratio/min": 4.631675830850327e-19, "sampling/sampling_logp_difference/max": 12.119287490844727, "sampling/sampling_logp_difference/mean": 1.5578806400299072, "step": 57, "step_time": 9.37269086500055 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 8.036377668380737, "epoch": 0.00058, "grad_norm": 0.02132246643304825, "kl": 0.1449498599395156, "learning_rate": 9.999999776004962e-06, "loss": -0.0008, "step": 58, "step_time": 4.935657740998067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.03125, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 193.03125, "completions/mean_terminated_length": 188.06451416015625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.000362813472748, "epoch": 0.00059, "frac_reward_zero_std": 0.25, "grad_norm": 0.032315246760845184, "kl": 0.2961342129856348, "learning_rate": 9.999999755178978e-06, "loss": -0.0041, "num_tokens": 1269078.0, "reward": 1.2101643085479736, "reward_std": 1.3150997161865234, "rewards/rollout_reward_func/mean": 1.2101643085479736, "rewards/rollout_reward_func/std": 1.5926079750061035, "sampling/importance_sampling_ratio/max": 0.37034448981285095, "sampling/importance_sampling_ratio/mean": 0.11909395456314087, "sampling/importance_sampling_ratio/min": 5.351588075157784e-18, "sampling/sampling_logp_difference/max": 4.563706874847412, "sampling/sampling_logp_difference/mean": 1.3363635540008545, "step": 59, "step_time": 8.095476389000396 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 6.96342921257019, "epoch": 0.0006, "grad_norm": 0.024006422609090805, "kl": 0.2776392586529255, "learning_rate": 9.999999733427394e-06, "loss": -0.0042, "step": 60, "step_time": 4.516711758002202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 195.46875, "completions/mean_terminated_length": 195.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.668850064277649, "epoch": 0.00061, "frac_reward_zero_std": 0.0, "grad_norm": 0.053888920694589615, "kl": 0.3705067876726389, "learning_rate": 9.99999971075021e-06, "loss": -0.0021, "num_tokens": 1308616.0, "reward": 0.9734269380569458, "reward_std": 1.2361981868743896, "rewards/rollout_reward_func/mean": 0.9734269380569458, "rewards/rollout_reward_func/std": 1.2544515132904053, "sampling/importance_sampling_ratio/max": 0.3786458671092987, "sampling/importance_sampling_ratio/mean": 0.10798259824514389, "sampling/importance_sampling_ratio/min": 0.0008864346309565008, "sampling/sampling_logp_difference/max": 3.1100988388061523, "sampling/sampling_logp_difference/mean": 1.0905110836029053, "step": 61, "step_time": 8.640960560998792 }, { "clip_ratio/high_max": 0.06770833395421505, "clip_ratio/high_mean": 0.033854166977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033854166977107525, "entropy": 6.610040128231049, "epoch": 0.00062, "grad_norm": 0.05148441717028618, "kl": 0.36472541466355324, "learning_rate": 9.999999687147426e-06, "loss": -0.0023, "step": 62, "step_time": 5.644690904000527 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018229166977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 1340.0, "completions/max_terminated_length": 1340.0, "completions/mean_length": 182.53125, "completions/mean_terminated_length": 182.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.198797285556793, "epoch": 0.00063, "frac_reward_zero_std": 0.0, "grad_norm": 0.0507873073220253, "kl": 0.2798380646854639, "learning_rate": 9.999999662619046e-06, "loss": -0.0046, "num_tokens": 1348558.0, "reward": 0.47374916076660156, "reward_std": 1.1890579462051392, "rewards/rollout_reward_func/mean": 0.47374916076660156, "rewards/rollout_reward_func/std": 1.5008788108825684, "sampling/importance_sampling_ratio/max": 0.4002918004989624, "sampling/importance_sampling_ratio/mean": 0.10331062227487564, "sampling/importance_sampling_ratio/min": 1.8332357853978465e-08, "sampling/sampling_logp_difference/max": 3.267086982727051, "sampling/sampling_logp_difference/mean": 1.3027695417404175, "step": 63, "step_time": 9.725618934997328 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.026041666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.041666666977107525, "entropy": 7.157816052436829, "epoch": 0.00064, "grad_norm": 0.036967456340789795, "kl": 0.2905570128932595, "learning_rate": 9.999999637165062e-06, "loss": -0.0048, "step": 64, "step_time": 5.583494353995775 }, { "clip_ratio/high_max": 0.06458333414047956, "clip_ratio/high_mean": 0.03229166707023978, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03229166707023978, "completions/clipped_ratio": 0.03125, "completions/max_length": 1216.0, "completions/max_terminated_length": 1216.0, "completions/mean_length": 229.28125, "completions/mean_terminated_length": 236.16128540039062, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.734614431858063, "epoch": 0.00065, "frac_reward_zero_std": 0.0, "grad_norm": 0.05255567654967308, "kl": 0.6680928859859705, "learning_rate": 9.999999610785483e-06, "loss": -0.0055, "num_tokens": 1390622.0, "reward": 0.8056018352508545, "reward_std": 1.6579127311706543, "rewards/rollout_reward_func/mean": 0.8056018352508545, "rewards/rollout_reward_func/std": 1.7227931022644043, "sampling/importance_sampling_ratio/max": 0.40168115496635437, "sampling/importance_sampling_ratio/mean": 0.11529393494129181, "sampling/importance_sampling_ratio/min": 2.411804514153628e-22, "sampling/sampling_logp_difference/max": 10.456141471862793, "sampling/sampling_logp_difference/mean": 1.469014048576355, "step": 65, "step_time": 9.687163934002456 }, { "clip_ratio/high_max": 0.06944444449618459, "clip_ratio/high_mean": 0.034722222248092294, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03819444449618459, "entropy": 6.686477065086365, "epoch": 0.00066, "grad_norm": 0.03401870280504227, "kl": 0.5125397183001041, "learning_rate": 9.999999583480304e-06, "loss": -0.0056, "step": 66, "step_time": 5.245394550000128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1216.0, "completions/max_terminated_length": 1216.0, "completions/mean_length": 297.625, "completions/mean_terminated_length": 297.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.941819667816162, "epoch": 0.00067, "frac_reward_zero_std": 0.0, "grad_norm": 0.05538209527730942, "kl": 0.23025558330118656, "learning_rate": 9.999999555249524e-06, "loss": -0.0065, "num_tokens": 1435618.0, "reward": 0.5712000131607056, "reward_std": 1.2800581455230713, "rewards/rollout_reward_func/mean": 0.5712000131607056, "rewards/rollout_reward_func/std": 1.3436827659606934, "sampling/importance_sampling_ratio/max": 0.41386640071868896, "sampling/importance_sampling_ratio/mean": 0.10727531462907791, "sampling/importance_sampling_ratio/min": 1.390353099850472e-05, "sampling/sampling_logp_difference/max": 2.821074962615967, "sampling/sampling_logp_difference/mean": 1.2511284351348877, "step": 67, "step_time": 9.45819388900054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 6.87233579158783, "epoch": 0.00068, "grad_norm": 0.028060290962457657, "kl": 0.2484555710107088, "learning_rate": 9.999999526093148e-06, "loss": -0.0067, "step": 68, "step_time": 6.171403989001192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1217.0, "completions/max_terminated_length": 1217.0, "completions/mean_length": 214.1875, "completions/mean_terminated_length": 182.3870849609375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.176604628562927, "epoch": 0.00069, "frac_reward_zero_std": 0.25, "grad_norm": 0.08219162374734879, "kl": 0.4720988757908344, "learning_rate": 9.999999496011169e-06, "loss": 0.002, "num_tokens": 1477880.0, "reward": -0.4379921853542328, "reward_std": 0.7547243237495422, "rewards/rollout_reward_func/mean": -0.4379921853542328, "rewards/rollout_reward_func/std": 0.914783239364624, "sampling/importance_sampling_ratio/max": 0.41385820508003235, "sampling/importance_sampling_ratio/mean": 0.08802695572376251, "sampling/importance_sampling_ratio/min": 3.8793166660093135e-16, "sampling/sampling_logp_difference/max": 4.104485034942627, "sampling/sampling_logp_difference/mean": 1.1844372749328613, "step": 69, "step_time": 9.514743249994353 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 7.266442775726318, "epoch": 0.0007, "grad_norm": 0.06714635342359543, "kl": 0.45297517254948616, "learning_rate": 9.999999465003593e-06, "loss": 0.0018, "step": 70, "step_time": 5.331208953997702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1073.0, "completions/max_terminated_length": 1073.0, "completions/mean_length": 281.125, "completions/mean_terminated_length": 284.13336181640625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.927394151687622, "epoch": 0.00071, "frac_reward_zero_std": 0.0, "grad_norm": 0.029620256274938583, "kl": 0.30694188456982374, "learning_rate": 9.999999433070417e-06, "loss": -0.0055, "num_tokens": 1520190.0, "reward": 0.54471755027771, "reward_std": 1.6279047727584839, "rewards/rollout_reward_func/mean": 0.54471755027771, "rewards/rollout_reward_func/std": 1.8125107288360596, "sampling/importance_sampling_ratio/max": 0.4260944128036499, "sampling/importance_sampling_ratio/mean": 0.09133689105510712, "sampling/importance_sampling_ratio/min": 4.1069814107723424e-13, "sampling/sampling_logp_difference/max": 3.888707160949707, "sampling/sampling_logp_difference/mean": 1.2803136110305786, "step": 71, "step_time": 9.235269822996997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 7.008886158466339, "epoch": 0.00072, "grad_norm": 0.025310905650258064, "kl": 0.3155640885233879, "learning_rate": 9.999999400211643e-06, "loss": -0.0056, "step": 72, "step_time": 5.054130434999024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 878.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 142.78125, "completions/mean_terminated_length": 142.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.236576974391937, "epoch": 0.00073, "frac_reward_zero_std": 0.0, "grad_norm": 0.03069308027625084, "kl": 0.4120718892663717, "learning_rate": 9.99999936642727e-06, "loss": -0.0022, "num_tokens": 1559564.0, "reward": -0.19795551896095276, "reward_std": 1.0526702404022217, "rewards/rollout_reward_func/mean": -0.19795551896095276, "rewards/rollout_reward_func/std": 1.278361439704895, "sampling/importance_sampling_ratio/max": 0.4394993185997009, "sampling/importance_sampling_ratio/mean": 0.11204324662685394, "sampling/importance_sampling_ratio/min": 1.4342249414767139e-05, "sampling/sampling_logp_difference/max": 4.996709823608398, "sampling/sampling_logp_difference/mean": 1.381803035736084, "step": 73, "step_time": 8.479502238003988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.015625, "clip_ratio/region_mean": 0.0234375, "entropy": 7.322831749916077, "epoch": 0.00074, "grad_norm": 0.020390469580888748, "kl": 0.40379553847014904, "learning_rate": 9.999999331717294e-06, "loss": -0.0024, "step": 74, "step_time": 4.995614593999562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 908.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 304.09375, "completions/mean_terminated_length": 284.6128845214844, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.292997181415558, "epoch": 0.00075, "frac_reward_zero_std": 0.0, "grad_norm": 0.05809430405497551, "kl": 0.3454598095268011, "learning_rate": 9.999999296081722e-06, "loss": -0.0038, "num_tokens": 1603629.0, "reward": 0.5183199048042297, "reward_std": 2.0365378856658936, "rewards/rollout_reward_func/mean": 0.5183199048042297, "rewards/rollout_reward_func/std": 2.017329692840576, "sampling/importance_sampling_ratio/max": 0.42764565348625183, "sampling/importance_sampling_ratio/mean": 0.07393398880958557, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.131920337677002, "sampling/sampling_logp_difference/mean": 1.312680721282959, "step": 75, "step_time": 8.955736293000882 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 7.296175241470337, "epoch": 0.00076, "grad_norm": 0.048297636210918427, "kl": 0.3507429603487253, "learning_rate": 9.999999259520549e-06, "loss": -0.0039, "step": 76, "step_time": 4.69836780100195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 310.65625, "completions/mean_terminated_length": 310.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.104390978813171, "epoch": 0.00077, "frac_reward_zero_std": 0.0, "grad_norm": 0.03229028359055519, "kl": 0.42647274397313595, "learning_rate": 9.99999922203378e-06, "loss": -0.0051, "num_tokens": 1648280.0, "reward": 1.5990478992462158, "reward_std": 1.4745867252349854, "rewards/rollout_reward_func/mean": 1.5990478992462158, "rewards/rollout_reward_func/std": 1.5096848011016846, "sampling/importance_sampling_ratio/max": 0.4455151855945587, "sampling/importance_sampling_ratio/mean": 0.08760487288236618, "sampling/importance_sampling_ratio/min": 3.254681814723881e-06, "sampling/sampling_logp_difference/max": 3.4753823280334473, "sampling/sampling_logp_difference/mean": 1.3567267656326294, "step": 77, "step_time": 8.833682922000662 }, { "clip_ratio/high_max": 0.054166668094694614, "clip_ratio/high_mean": 0.027083334047347307, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.027083334047347307, "entropy": 7.053032994270325, "epoch": 0.00078, "grad_norm": 0.021157264709472656, "kl": 0.4407466985285282, "learning_rate": 9.99999918362141e-06, "loss": -0.0052, "step": 78, "step_time": 4.849302880000323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1150.0, "completions/max_terminated_length": 1150.0, "completions/mean_length": 317.15625, "completions/mean_terminated_length": 296.3548278808594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.508662641048431, "epoch": 0.00079, "frac_reward_zero_std": 0.0, "grad_norm": 0.030624650418758392, "kl": 0.3790788818150759, "learning_rate": 9.99999914428344e-06, "loss": -0.0018, "num_tokens": 1692440.0, "reward": 0.49749451875686646, "reward_std": 1.4477945566177368, "rewards/rollout_reward_func/mean": 0.49749451875686646, "rewards/rollout_reward_func/std": 1.5732295513153076, "sampling/importance_sampling_ratio/max": 0.45300352573394775, "sampling/importance_sampling_ratio/mean": 0.08658625930547714, "sampling/importance_sampling_ratio/min": 9.188736496289798e-17, "sampling/sampling_logp_difference/max": 4.02817440032959, "sampling/sampling_logp_difference/mean": 1.4698045253753662, "step": 79, "step_time": 10.2997571900014 }, { "clip_ratio/high_max": 0.0807291679084301, "clip_ratio/high_mean": 0.04036458395421505, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04036458395421505, "entropy": 7.437947630882263, "epoch": 0.0008, "grad_norm": 0.024775438010692596, "kl": 0.3803098015487194, "learning_rate": 9.999999104019872e-06, "loss": -0.0019, "step": 80, "step_time": 5.140348835997429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 225.125, "completions/mean_terminated_length": 222.4666748046875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.365742385387421, "epoch": 0.00081, "frac_reward_zero_std": 0.0, "grad_norm": 0.033354587852954865, "kl": 0.36084438022226095, "learning_rate": 9.999999062830703e-06, "loss": -0.0103, "num_tokens": 1733635.0, "reward": 1.0823521614074707, "reward_std": 1.3323743343353271, "rewards/rollout_reward_func/mean": 1.0823521614074707, "rewards/rollout_reward_func/std": 1.478363037109375, "sampling/importance_sampling_ratio/max": 0.4553918242454529, "sampling/importance_sampling_ratio/mean": 0.1584359109401703, "sampling/importance_sampling_ratio/min": 7.846818355673746e-15, "sampling/sampling_logp_difference/max": 4.652369976043701, "sampling/sampling_logp_difference/mean": 1.2663116455078125, "step": 81, "step_time": 8.535321508003108 }, { "clip_ratio/high_max": 0.03645833395421505, "clip_ratio/high_mean": 0.018229166977107525, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02170138922519982, "entropy": 6.2588791847229, "epoch": 0.00082, "grad_norm": 0.028964269906282425, "kl": 0.36925441306084394, "learning_rate": 9.999999020715937e-06, "loss": -0.0105, "step": 82, "step_time": 4.848957051002799 }, { "clip_ratio/high_max": 0.037500000558793545, "clip_ratio/high_mean": 0.018750000279396772, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018750000279396772, "completions/clipped_ratio": 0.03125, "completions/max_length": 1378.0, "completions/max_terminated_length": 1378.0, "completions/mean_length": 340.15625, "completions/mean_terminated_length": 329.258056640625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.062274932861328, "epoch": 0.00083, "frac_reward_zero_std": 0.0, "grad_norm": 0.03480813279747963, "kl": 0.34026357531547546, "learning_rate": 9.999998977675572e-06, "loss": -0.005, "num_tokens": 1780360.0, "reward": 0.3363593816757202, "reward_std": 1.9151029586791992, "rewards/rollout_reward_func/mean": 0.3363593816757202, "rewards/rollout_reward_func/std": 1.9824378490447998, "sampling/importance_sampling_ratio/max": 0.17261536419391632, "sampling/importance_sampling_ratio/mean": 0.04178832098841667, "sampling/importance_sampling_ratio/min": 1.4837938256063413e-33, "sampling/sampling_logp_difference/max": 12.612214088439941, "sampling/sampling_logp_difference/mean": 1.3152199983596802, "step": 83, "step_time": 10.483766491999631 }, { "clip_ratio/high_max": 0.06668193091172725, "clip_ratio/high_mean": 0.033340965455863625, "clip_ratio/low_mean": 0.0036421912373043597, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.036983156693167984, "entropy": 6.9071003794670105, "epoch": 0.00084, "grad_norm": 0.02768600732088089, "kl": 0.34854941815137863, "learning_rate": 9.999998933709607e-06, "loss": -0.0051, "step": 84, "step_time": 5.740228850998392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1476.0, "completions/max_terminated_length": 1476.0, "completions/mean_length": 430.9375, "completions/mean_terminated_length": 430.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.989608705043793, "epoch": 0.00085, "frac_reward_zero_std": 0.0, "grad_norm": 0.034474339336156845, "kl": 0.39298102259635925, "learning_rate": 9.999998888818043e-06, "loss": -0.0071, "num_tokens": 1828649.0, "reward": 1.169428825378418, "reward_std": 1.6383922100067139, "rewards/rollout_reward_func/mean": 1.169428825378418, "rewards/rollout_reward_func/std": 1.631700873374939, "sampling/importance_sampling_ratio/max": 0.4656984806060791, "sampling/importance_sampling_ratio/mean": 0.13778579235076904, "sampling/importance_sampling_ratio/min": 1.8781375380697712e-13, "sampling/sampling_logp_difference/max": 4.335869789123535, "sampling/sampling_logp_difference/mean": 1.1001861095428467, "step": 85, "step_time": 11.517275280000831 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.008140756515786052, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015953256050124764, "entropy": 5.885313034057617, "epoch": 0.00086, "grad_norm": 0.03539878502488136, "kl": 0.3996395319700241, "learning_rate": 9.99999884300088e-06, "loss": -0.0072, "step": 86, "step_time": 5.903712761999486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1267.0, "completions/max_terminated_length": 1267.0, "completions/mean_length": 261.0625, "completions/mean_terminated_length": 261.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.7127275466918945, "epoch": 0.00087, "frac_reward_zero_std": 0.25, "grad_norm": 0.0624365471303463, "kl": 0.4296333026140928, "learning_rate": 9.999998796258118e-06, "loss": -0.0045, "num_tokens": 1871861.0, "reward": 0.9391739964485168, "reward_std": 1.1856367588043213, "rewards/rollout_reward_func/mean": 0.9391739964485168, "rewards/rollout_reward_func/std": 1.6325252056121826, "sampling/importance_sampling_ratio/max": 0.4733841121196747, "sampling/importance_sampling_ratio/mean": 0.1754748523235321, "sampling/importance_sampling_ratio/min": 1.0718729641931782e-13, "sampling/sampling_logp_difference/max": 4.748846054077148, "sampling/sampling_logp_difference/mean": 1.0371646881103516, "step": 87, "step_time": 9.388514973003112 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013257576152682304, "entropy": 5.654805421829224, "epoch": 0.00088, "grad_norm": 0.023943381384015083, "kl": 0.43243107572197914, "learning_rate": 9.999998748589757e-06, "loss": -0.0047, "step": 88, "step_time": 5.391363743998227 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.003289473708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009539473801851273, "completions/clipped_ratio": 0.0, "completions/max_length": 1099.0, "completions/max_terminated_length": 1099.0, "completions/mean_length": 406.21875, "completions/mean_terminated_length": 406.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.921933472156525, "epoch": 0.00089, "frac_reward_zero_std": 0.0, "grad_norm": 0.09088526666164398, "kl": 0.3913013506680727, "learning_rate": 9.999998699995797e-06, "loss": -0.0085, "num_tokens": 1921004.0, "reward": 0.7852585315704346, "reward_std": 1.558336615562439, "rewards/rollout_reward_func/mean": 0.7852585315704346, "rewards/rollout_reward_func/std": 1.601723313331604, "sampling/importance_sampling_ratio/max": 0.20759452879428864, "sampling/importance_sampling_ratio/mean": 0.07588660717010498, "sampling/importance_sampling_ratio/min": 4.0798853878643015e-11, "sampling/sampling_logp_difference/max": 4.066885948181152, "sampling/sampling_logp_difference/mean": 0.985002875328064, "step": 89, "step_time": 9.362732258001415 }, { "clip_ratio/high_max": 0.03645833395421505, "clip_ratio/high_mean": 0.018229166977107525, "clip_ratio/low_mean": 0.011101973708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02933114068582654, "entropy": 5.88704389333725, "epoch": 0.0009, "grad_norm": 0.03964877128601074, "kl": 0.4276412148028612, "learning_rate": 9.999998650476238e-06, "loss": -0.0088, "step": 90, "step_time": 5.550653534999583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1367.0, "completions/max_terminated_length": 1367.0, "completions/mean_length": 433.25, "completions/mean_terminated_length": 446.70965576171875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.888664484024048, "epoch": 0.00091, "frac_reward_zero_std": 0.0, "grad_norm": 0.04443107917904854, "kl": 0.32053298875689507, "learning_rate": 9.99999860003108e-06, "loss": -0.0032, "num_tokens": 1971254.0, "reward": 0.03580296039581299, "reward_std": 1.6046538352966309, "rewards/rollout_reward_func/mean": 0.03580296039581299, "rewards/rollout_reward_func/std": 1.6297247409820557, "sampling/importance_sampling_ratio/max": 0.20198531448841095, "sampling/importance_sampling_ratio/mean": 0.054810017347335815, "sampling/importance_sampling_ratio/min": 5.400731305483478e-16, "sampling/sampling_logp_difference/max": 4.209969997406006, "sampling/sampling_logp_difference/mean": 1.2095879316329956, "step": 91, "step_time": 10.489770087999204 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0182291679084301, "entropy": 6.935928106307983, "epoch": 0.00092, "grad_norm": 0.03827163949608803, "kl": 0.31526413932442665, "learning_rate": 9.999998548660322e-06, "loss": -0.0033, "step": 92, "step_time": 5.524019339000006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1272.0, "completions/max_terminated_length": 1272.0, "completions/mean_length": 223.65625, "completions/mean_terminated_length": 215.64515686035156, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.349973261356354, "epoch": 0.00093, "frac_reward_zero_std": 0.25, "grad_norm": 0.04511662945151329, "kl": 0.6379904299974442, "learning_rate": 9.999998496363967e-06, "loss": -0.0026, "num_tokens": 2010786.0, "reward": 1.1421012878417969, "reward_std": 0.7492595911026001, "rewards/rollout_reward_func/mean": 1.1421012878417969, "rewards/rollout_reward_func/std": 1.5183076858520508, "sampling/importance_sampling_ratio/max": 0.49004289507865906, "sampling/importance_sampling_ratio/mean": 0.1957949995994568, "sampling/importance_sampling_ratio/min": 2.0931099735388425e-12, "sampling/sampling_logp_difference/max": 4.646952152252197, "sampling/sampling_logp_difference/mean": 0.8641407489776611, "step": 93, "step_time": 9.109722069999407 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.016741071827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0271577388048172, "entropy": 5.397906720638275, "epoch": 0.00094, "grad_norm": 0.029647210612893105, "kl": 0.6275133043527603, "learning_rate": 9.999998443142012e-06, "loss": -0.0027, "step": 94, "step_time": 5.201335438001479 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 1535.0, "completions/max_terminated_length": 1535.0, "completions/mean_length": 532.0625, "completions/mean_terminated_length": 532.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.68555212020874, "epoch": 0.00095, "frac_reward_zero_std": 0.0, "grad_norm": 0.10636124759912491, "kl": 0.31725696474313736, "learning_rate": 9.999998388994457e-06, "loss": -0.0052, "num_tokens": 2063240.0, "reward": 0.7128371596336365, "reward_std": 1.578428030014038, "rewards/rollout_reward_func/mean": 0.7128371596336365, "rewards/rollout_reward_func/std": 1.5735907554626465, "sampling/importance_sampling_ratio/max": 0.5234487652778625, "sampling/importance_sampling_ratio/mean": 0.08138619363307953, "sampling/importance_sampling_ratio/min": 1.5089591908492854e-20, "sampling/sampling_logp_difference/max": 4.228877544403076, "sampling/sampling_logp_difference/mean": 1.2495909929275513, "step": 95, "step_time": 10.727750500000184 }, { "clip_ratio/high_max": 0.031723485328257084, "clip_ratio/high_mean": 0.022111742291599512, "clip_ratio/low_mean": 0.026278409641236067, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04839015193283558, "entropy": 6.727267324924469, "epoch": 0.00096, "grad_norm": 0.04814419522881508, "kl": 0.3084958763793111, "learning_rate": 9.999998333921305e-06, "loss": -0.0057, "step": 96, "step_time": 6.540860187004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1372.0, "completions/max_terminated_length": 1372.0, "completions/mean_length": 415.625, "completions/mean_terminated_length": 407.58062744140625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.041352331638336, "epoch": 0.00097, "frac_reward_zero_std": 0.25, "grad_norm": 0.02004031091928482, "kl": 0.3612249903380871, "learning_rate": 9.999998277922554e-06, "loss": -0.0068, "num_tokens": 2111447.0, "reward": 0.5839200019836426, "reward_std": 1.186773419380188, "rewards/rollout_reward_func/mean": 0.5839200019836426, "rewards/rollout_reward_func/std": 1.6305814981460571, "sampling/importance_sampling_ratio/max": 0.49396437406539917, "sampling/importance_sampling_ratio/mean": 0.13187283277511597, "sampling/importance_sampling_ratio/min": 2.0408528340019067e-22, "sampling/sampling_logp_difference/max": 11.906654357910156, "sampling/sampling_logp_difference/mean": 1.3742644786834717, "step": 97, "step_time": 10.671948198998507 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0016447368543595076, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009457236854359508, "entropy": 6.007218718528748, "epoch": 0.00098, "grad_norm": 0.019002355635166168, "kl": 0.3683354174718261, "learning_rate": 9.999998220998203e-06, "loss": -0.0069, "step": 98, "step_time": 5.6878760420040635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1323.0, "completions/max_terminated_length": 1323.0, "completions/mean_length": 379.21875, "completions/mean_terminated_length": 379.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.028254926204681, "epoch": 0.00099, "frac_reward_zero_std": 0.25, "grad_norm": 0.01955627091228962, "kl": 0.3738690111786127, "learning_rate": 9.999998163148253e-06, "loss": -0.008, "num_tokens": 2158048.0, "reward": 1.4591233730316162, "reward_std": 1.2845185995101929, "rewards/rollout_reward_func/mean": 1.4591233730316162, "rewards/rollout_reward_func/std": 1.4613511562347412, "sampling/importance_sampling_ratio/max": 0.49876323342323303, "sampling/importance_sampling_ratio/mean": 0.1496865302324295, "sampling/importance_sampling_ratio/min": 6.484492109937179e-11, "sampling/sampling_logp_difference/max": 4.339596748352051, "sampling/sampling_logp_difference/mean": 1.136824131011963, "step": 99, "step_time": 9.860940981998283 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.01065340917557478, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01690340880304575, "entropy": 6.014227211475372, "epoch": 0.001, "grad_norm": 0.012102395296096802, "kl": 0.37262267619371414, "learning_rate": 9.999998104372703e-06, "loss": -0.008, "step": 100, "step_time": 5.485911664003652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 985.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 135.625, "completions/mean_terminated_length": 135.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.8397017121315, "epoch": 0.00101, "frac_reward_zero_std": 0.5, "grad_norm": 0.01728295162320137, "kl": 0.6303941793739796, "learning_rate": 9.999998044671557e-06, "loss": -0.0023, "num_tokens": 2193187.0, "reward": 1.0575032234191895, "reward_std": 0.33878785371780396, "rewards/rollout_reward_func/mean": 1.0575032234191895, "rewards/rollout_reward_func/std": 1.1097846031188965, "sampling/importance_sampling_ratio/max": 0.5045081973075867, "sampling/importance_sampling_ratio/mean": 0.29938772320747375, "sampling/importance_sampling_ratio/min": 2.7675582311981195e-11, "sampling/sampling_logp_difference/max": 13.237771987915039, "sampling/sampling_logp_difference/mean": 0.9004347324371338, "step": 101, "step_time": 8.673296645998562 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 4.797123372554779, "epoch": 0.00102, "grad_norm": 0.016097890213131905, "kl": 0.6362945325672626, "learning_rate": 9.999997984044808e-06, "loss": -0.0023, "step": 102, "step_time": 5.261454469997261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 299.15625, "completions/mean_terminated_length": 299.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.124019116163254, "epoch": 0.00103, "frac_reward_zero_std": 0.0, "grad_norm": 0.034231338649988174, "kl": 0.405900564044714, "learning_rate": 9.999997922492466e-06, "loss": -0.0099, "num_tokens": 2239966.0, "reward": 0.6002016663551331, "reward_std": 2.041043996810913, "rewards/rollout_reward_func/mean": 0.6002016663551331, "rewards/rollout_reward_func/std": 2.0498502254486084, "sampling/importance_sampling_ratio/max": 0.245942622423172, "sampling/importance_sampling_ratio/mean": 0.08841930329799652, "sampling/importance_sampling_ratio/min": 8.245375953189193e-13, "sampling/sampling_logp_difference/max": 4.7913031578063965, "sampling/sampling_logp_difference/mean": 1.0250699520111084, "step": 103, "step_time": 9.260786904003908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.146236956119537, "epoch": 0.00104, "grad_norm": 0.034754928201436996, "kl": 0.4070160901173949, "learning_rate": 9.999997860014521e-06, "loss": -0.0099, "step": 104, "step_time": 4.883043375002671 }, { "clip_ratio/high_max": 0.02291666716337204, "clip_ratio/high_mean": 0.01145833358168602, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01145833358168602, "completions/clipped_ratio": 0.0, "completions/max_length": 1043.0, "completions/max_terminated_length": 1043.0, "completions/mean_length": 306.40625, "completions/mean_terminated_length": 306.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.641833186149597, "epoch": 0.00105, "frac_reward_zero_std": 0.0, "grad_norm": 0.1224449872970581, "kl": 0.3760248478502035, "learning_rate": 9.99999779661098e-06, "loss": -0.008, "num_tokens": 2286947.0, "reward": 0.09105631709098816, "reward_std": 1.0327988862991333, "rewards/rollout_reward_func/mean": 0.09105631709098816, "rewards/rollout_reward_func/std": 1.3747233152389526, "sampling/importance_sampling_ratio/max": 0.4335859417915344, "sampling/importance_sampling_ratio/mean": 0.1230776458978653, "sampling/importance_sampling_ratio/min": 8.717188961782085e-07, "sampling/sampling_logp_difference/max": 2.8526504039764404, "sampling/sampling_logp_difference/mean": 1.2973997592926025, "step": 105, "step_time": 9.11507095899833 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.05208333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05729166744276881, "entropy": 6.849429607391357, "epoch": 0.00106, "grad_norm": 0.05075541511178017, "kl": 0.3405358549207449, "learning_rate": 9.999997732281837e-06, "loss": -0.0083, "step": 106, "step_time": 5.0739364239998395 }, { "clip_ratio/high_max": 0.031250000931322575, "clip_ratio/high_mean": 0.015625000465661287, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 1069.0, "completions/max_terminated_length": 1069.0, "completions/mean_length": 307.875, "completions/mean_terminated_length": 307.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.41602486371994, "epoch": 0.00107, "frac_reward_zero_std": 0.0, "grad_norm": 0.07524735480546951, "kl": 0.44958692975342274, "learning_rate": 9.999997667027097e-06, "loss": -0.0092, "num_tokens": 2332663.0, "reward": 0.09737607091665268, "reward_std": 1.3682873249053955, "rewards/rollout_reward_func/mean": 0.09737607091665268, "rewards/rollout_reward_func/std": 1.5998203754425049, "sampling/importance_sampling_ratio/max": 0.5118520855903625, "sampling/importance_sampling_ratio/mean": 0.12941503524780273, "sampling/importance_sampling_ratio/min": 5.770057331794032e-22, "sampling/sampling_logp_difference/max": 3.307631015777588, "sampling/sampling_logp_difference/mean": 1.1542332172393799, "step": 107, "step_time": 9.307701382998857 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.054687500931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06250000093132257, "entropy": 6.457621455192566, "epoch": 0.00108, "grad_norm": 0.05864467844367027, "kl": 0.4591484311968088, "learning_rate": 9.999997600846756e-06, "loss": -0.0094, "step": 108, "step_time": 5.9490226350026205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.03125, "completions/max_length": 1480.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 315.03125, "completions/mean_terminated_length": 306.4838562011719, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.545182108879089, "epoch": 0.00109, "frac_reward_zero_std": 0.0, "grad_norm": 0.06407377868890762, "kl": 0.42142258398234844, "learning_rate": 9.99999753374082e-06, "loss": -0.0066, "num_tokens": 2378648.0, "reward": 0.41595637798309326, "reward_std": 1.7173011302947998, "rewards/rollout_reward_func/mean": 0.41595637798309326, "rewards/rollout_reward_func/std": 1.7713727951049805, "sampling/importance_sampling_ratio/max": 0.5028932690620422, "sampling/importance_sampling_ratio/mean": 0.12672913074493408, "sampling/importance_sampling_ratio/min": 7.714930559571562e-19, "sampling/sampling_logp_difference/max": 4.187413215637207, "sampling/sampling_logp_difference/mean": 1.382702350616455, "step": 109, "step_time": 10.438851299004455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.026041666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026041666977107525, "entropy": 6.557514727115631, "epoch": 0.0011, "grad_norm": 0.04565112292766571, "kl": 0.45403098687529564, "learning_rate": 9.999997465709281e-06, "loss": -0.0066, "step": 110, "step_time": 5.876818443000957 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 54.03125, "completions/mean_terminated_length": 54.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.514187157154083, "epoch": 0.00111, "frac_reward_zero_std": 0.25, "grad_norm": 0.1659078299999237, "kl": 0.7084426917135715, "learning_rate": 9.999997396752146e-06, "loss": -0.0084, "num_tokens": 2410455.0, "reward": 0.6727391481399536, "reward_std": 0.8498213291168213, "rewards/rollout_reward_func/mean": 0.6727391481399536, "rewards/rollout_reward_func/std": 1.5950909852981567, "sampling/importance_sampling_ratio/max": 0.51398104429245, "sampling/importance_sampling_ratio/mean": 0.23452544212341309, "sampling/importance_sampling_ratio/min": 2.032929114648141e-05, "sampling/sampling_logp_difference/max": 5.672000408172607, "sampling/sampling_logp_difference/mean": 1.006102442741394, "step": 111, "step_time": 6.494723582003644 }, { "clip_ratio/high_max": 0.08541666809469461, "clip_ratio/high_mean": 0.04270833404734731, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04270833404734731, "entropy": 5.444120854139328, "epoch": 0.00112, "grad_norm": 0.04222972318530083, "kl": 0.7323946058750153, "learning_rate": 9.999997326869412e-06, "loss": -0.0088, "step": 112, "step_time": 3.5762624479975784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1117.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 205.09375, "completions/mean_terminated_length": 178.06451416015625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.276473701000214, "epoch": 0.00113, "frac_reward_zero_std": 0.25, "grad_norm": 0.05590462684631348, "kl": 0.6274412609636784, "learning_rate": 9.99999725606108e-06, "loss": -0.0099, "num_tokens": 2448972.0, "reward": 0.9153677821159363, "reward_std": 1.1084911823272705, "rewards/rollout_reward_func/mean": 0.9153677821159363, "rewards/rollout_reward_func/std": 1.682924509048462, "sampling/importance_sampling_ratio/max": 0.5197271108627319, "sampling/importance_sampling_ratio/mean": 0.2833191156387329, "sampling/importance_sampling_ratio/min": 9.445837655859119e-39, "sampling/sampling_logp_difference/max": 2.842677593231201, "sampling/sampling_logp_difference/mean": 0.789119303226471, "step": 113, "step_time": 10.073687967000296 }, { "clip_ratio/high_max": 0.017149390187114477, "clip_ratio/high_mean": 0.008574695093557239, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018991362070664763, "entropy": 4.191626936197281, "epoch": 0.00114, "grad_norm": 0.04116111993789673, "kl": 0.6366442777216434, "learning_rate": 9.999997184327149e-06, "loss": -0.0101, "step": 114, "step_time": 5.423604208001052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1512.0, "completions/max_terminated_length": 1512.0, "completions/mean_length": 466.78125, "completions/mean_terminated_length": 466.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.054388791322708, "epoch": 0.00115, "frac_reward_zero_std": 0.0, "grad_norm": 0.12677058577537537, "kl": 0.42756376415491104, "learning_rate": 9.999997111667619e-06, "loss": -0.0135, "num_tokens": 2501665.0, "reward": 1.3751912117004395, "reward_std": 1.6187975406646729, "rewards/rollout_reward_func/mean": 1.3751912117004395, "rewards/rollout_reward_func/std": 1.671011209487915, "sampling/importance_sampling_ratio/max": 0.31230801343917847, "sampling/importance_sampling_ratio/mean": 0.14722087979316711, "sampling/importance_sampling_ratio/min": 2.5841462758698086e-13, "sampling/sampling_logp_difference/max": 13.848844528198242, "sampling/sampling_logp_difference/mean": 0.947532057762146, "step": 115, "step_time": 10.6174918910001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.014322916977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014322916977107525, "entropy": 5.006837844848633, "epoch": 0.00116, "grad_norm": 0.0490141399204731, "kl": 0.44543037563562393, "learning_rate": 9.999997038082489e-06, "loss": -0.0137, "step": 116, "step_time": 6.010180175999267 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.03125, "completions/max_length": 1041.0, "completions/max_terminated_length": 1041.0, "completions/mean_length": 497.09375, "completions/mean_terminated_length": 497.6773986816406, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.226880729198456, "epoch": 0.00117, "frac_reward_zero_std": 0.0, "grad_norm": 0.09204043447971344, "kl": 0.27793290093541145, "learning_rate": 9.999996963571762e-06, "loss": -0.0083, "num_tokens": 2554918.0, "reward": 0.9424960613250732, "reward_std": 1.5272129774093628, "rewards/rollout_reward_func/mean": 0.9424960613250732, "rewards/rollout_reward_func/std": 1.519365668296814, "sampling/importance_sampling_ratio/max": 0.2733117341995239, "sampling/importance_sampling_ratio/mean": 0.08107390999794006, "sampling/importance_sampling_ratio/min": 1.9554236985186435e-07, "sampling/sampling_logp_difference/max": 5.170700550079346, "sampling/sampling_logp_difference/mean": 1.1114184856414795, "step": 117, "step_time": 9.265378948999569 }, { "clip_ratio/high_max": 0.018181818537414074, "clip_ratio/high_mean": 0.009090909268707037, "clip_ratio/low_mean": 0.013257576152682304, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02234848542138934, "entropy": 6.239746928215027, "epoch": 0.00118, "grad_norm": 0.04160206764936447, "kl": 0.26787539571523666, "learning_rate": 9.999996888135438e-06, "loss": -0.0087, "step": 118, "step_time": 5.059421743999337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1108.0, "completions/max_terminated_length": 1108.0, "completions/mean_length": 203.59375, "completions/mean_terminated_length": 203.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.1009023785591125, "epoch": 0.00119, "frac_reward_zero_std": 0.25, "grad_norm": 0.08362006396055222, "kl": 0.5631408467888832, "learning_rate": 9.999996811773512e-06, "loss": -0.0046, "num_tokens": 2594838.0, "reward": 0.8804823756217957, "reward_std": 1.0063049793243408, "rewards/rollout_reward_func/mean": 0.8804823756217957, "rewards/rollout_reward_func/std": 1.4376641511917114, "sampling/importance_sampling_ratio/max": 0.5215824842453003, "sampling/importance_sampling_ratio/mean": 0.25886407494544983, "sampling/importance_sampling_ratio/min": 2.371611299167853e-05, "sampling/sampling_logp_difference/max": 3.9509668350219727, "sampling/sampling_logp_difference/mean": 0.8509246110916138, "step": 119, "step_time": 9.60098077800103 }, { "clip_ratio/high_max": 0.11458333395421505, "clip_ratio/high_mean": 0.057291666977107525, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.07291666697710752, "entropy": 4.9484845995903015, "epoch": 0.0012, "grad_norm": 0.04937903210520744, "kl": 0.5874460749328136, "learning_rate": 9.999996734485989e-06, "loss": -0.0046, "step": 120, "step_time": 5.3919139620011265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 1476.0, "completions/max_terminated_length": 1476.0, "completions/mean_length": 314.09375, "completions/mean_terminated_length": 314.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.817377746105194, "epoch": 0.00121, "frac_reward_zero_std": 0.0, "grad_norm": 0.2279805690050125, "kl": 0.543942891061306, "learning_rate": 9.999996656272867e-06, "loss": -0.0007, "num_tokens": 2640107.0, "reward": 0.17706608772277832, "reward_std": 0.9860303997993469, "rewards/rollout_reward_func/mean": 0.17706608772277832, "rewards/rollout_reward_func/std": 1.1346226930618286, "sampling/importance_sampling_ratio/max": 0.5366309881210327, "sampling/importance_sampling_ratio/mean": 0.1822671741247177, "sampling/importance_sampling_ratio/min": 1.1153234424909897e-07, "sampling/sampling_logp_difference/max": 3.124300241470337, "sampling/sampling_logp_difference/mean": 1.0627557039260864, "step": 121, "step_time": 10.310463015997811 }, { "clip_ratio/high_max": 0.05902777845039964, "clip_ratio/high_mean": 0.02951388922519982, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03298611147329211, "entropy": 5.739033579826355, "epoch": 0.00122, "grad_norm": 0.08783575892448425, "kl": 0.5359132438898087, "learning_rate": 9.999996577134147e-06, "loss": -0.0013, "step": 122, "step_time": 5.863464875999853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 353.4375, "completions/mean_terminated_length": 353.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.164858162403107, "epoch": 0.00123, "frac_reward_zero_std": 0.25, "grad_norm": 0.04943426698446274, "kl": 0.47177792713046074, "learning_rate": 9.999996497069828e-06, "loss": -0.011, "num_tokens": 2686379.0, "reward": 1.360892653465271, "reward_std": 1.1387825012207031, "rewards/rollout_reward_func/mean": 1.360892653465271, "rewards/rollout_reward_func/std": 1.4339447021484375, "sampling/importance_sampling_ratio/max": 0.531945526599884, "sampling/importance_sampling_ratio/mean": 0.21239644289016724, "sampling/importance_sampling_ratio/min": 2.019521616603015e-06, "sampling/sampling_logp_difference/max": 2.651494026184082, "sampling/sampling_logp_difference/mean": 0.9138351678848267, "step": 123, "step_time": 8.71245954099868 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 5.17153388261795, "epoch": 0.00124, "grad_norm": 0.05071847885847092, "kl": 0.47336455807089806, "learning_rate": 9.99999641607991e-06, "loss": -0.0111, "step": 124, "step_time": 4.8266374790018745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 147.875, "completions/mean_terminated_length": 146.33334350585938, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.451188087463379, "epoch": 0.00125, "frac_reward_zero_std": 0.25, "grad_norm": 0.04051702469587326, "kl": 0.5964268017560244, "learning_rate": 9.999996334164396e-06, "loss": -0.0124, "num_tokens": 2722248.0, "reward": 1.357757568359375, "reward_std": 1.1073684692382812, "rewards/rollout_reward_func/mean": 1.357757568359375, "rewards/rollout_reward_func/std": 1.4206875562667847, "sampling/importance_sampling_ratio/max": 0.5350269675254822, "sampling/importance_sampling_ratio/mean": 0.3197130560874939, "sampling/importance_sampling_ratio/min": 1.5445024632240928e-19, "sampling/sampling_logp_difference/max": 4.892044544219971, "sampling/sampling_logp_difference/mean": 0.9959245920181274, "step": 125, "step_time": 7.95942869099963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.461354166269302, "epoch": 0.00126, "grad_norm": 0.04393918812274933, "kl": 0.5932427421212196, "learning_rate": 9.999996251323281e-06, "loss": -0.0124, "step": 126, "step_time": 4.391282246000628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1208.0, "completions/max_terminated_length": 1208.0, "completions/mean_length": 269.5, "completions/mean_terminated_length": 277.6773986816406, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.984882056713104, "epoch": 0.00127, "frac_reward_zero_std": 0.0, "grad_norm": 0.14707772433757782, "kl": 0.49002211913466454, "learning_rate": 9.99999616755657e-06, "loss": -0.0136, "num_tokens": 2765759.0, "reward": 0.7648004293441772, "reward_std": 1.311969518661499, "rewards/rollout_reward_func/mean": 0.7648004293441772, "rewards/rollout_reward_func/std": 1.4130882024765015, "sampling/importance_sampling_ratio/max": 0.5386150479316711, "sampling/importance_sampling_ratio/mean": 0.18707990646362305, "sampling/importance_sampling_ratio/min": 5.137091381619829e-13, "sampling/sampling_logp_difference/max": 4.85533332824707, "sampling/sampling_logp_difference/mean": 1.1754341125488281, "step": 127, "step_time": 9.362888936002491 }, { "clip_ratio/high_max": 0.034722222946584225, "clip_ratio/high_mean": 0.017361111473292112, "clip_ratio/low_mean": 0.026041666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04340277798473835, "entropy": 5.986009895801544, "epoch": 0.00128, "grad_norm": 0.08097370713949203, "kl": 0.4942837320268154, "learning_rate": 9.999996082864259e-06, "loss": -0.0136, "step": 128, "step_time": 5.319186756996714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0029761905316263437, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029761905316263437, "completions/clipped_ratio": 0.03125, "completions/max_length": 1038.0, "completions/max_terminated_length": 1038.0, "completions/mean_length": 352.0, "completions/mean_terminated_length": 341.7419128417969, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.588346004486084, "epoch": 0.00129, "frac_reward_zero_std": 0.25, "grad_norm": 0.06606757640838623, "kl": 0.6007013879716396, "learning_rate": 9.99999599724635e-06, "loss": -0.0061, "num_tokens": 2811933.0, "reward": 0.7831199169158936, "reward_std": 1.380183458328247, "rewards/rollout_reward_func/mean": 0.7831199169158936, "rewards/rollout_reward_func/std": 1.7520012855529785, "sampling/importance_sampling_ratio/max": 0.5266455411911011, "sampling/importance_sampling_ratio/mean": 0.1883830726146698, "sampling/importance_sampling_ratio/min": 1.5369551328831095e-35, "sampling/sampling_logp_difference/max": 4.424635410308838, "sampling/sampling_logp_difference/mean": 1.1289480924606323, "step": 129, "step_time": 9.264469900001131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 5.586814761161804, "epoch": 0.0013, "grad_norm": 0.05836707726120949, "kl": 0.6037846878170967, "learning_rate": 9.999995910702842e-06, "loss": -0.0062, "step": 130, "step_time": 5.037554458998784 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.09375, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 331.65625, "completions/mean_terminated_length": 329.96551513671875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.544417321681976, "epoch": 0.00131, "frac_reward_zero_std": 0.0, "grad_norm": 0.20491153001785278, "kl": 0.7837260775268078, "learning_rate": 9.999995823233738e-06, "loss": -0.0093, "num_tokens": 2859300.0, "reward": 0.28333067893981934, "reward_std": 1.186015248298645, "rewards/rollout_reward_func/mean": 0.28333067893981934, "rewards/rollout_reward_func/std": 1.2224314212799072, "sampling/importance_sampling_ratio/max": 0.5263879299163818, "sampling/importance_sampling_ratio/mean": 0.08992321789264679, "sampling/importance_sampling_ratio/min": 2.065764672638132e-18, "sampling/sampling_logp_difference/max": 4.985796928405762, "sampling/sampling_logp_difference/mean": 1.3705769777297974, "step": 131, "step_time": 9.491233105998617 }, { "clip_ratio/high_max": 0.028125000186264515, "clip_ratio/high_mean": 0.014062500093132257, "clip_ratio/low_mean": 0.016098485328257084, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03016098542138934, "entropy": 6.581247866153717, "epoch": 0.00132, "grad_norm": 0.07383998483419418, "kl": 0.4631114527583122, "learning_rate": 9.999995734839033e-06, "loss": -0.0103, "step": 132, "step_time": 4.7460043049995875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.03125, "completions/max_length": 1410.0, "completions/max_terminated_length": 1410.0, "completions/mean_length": 275.78125, "completions/mean_terminated_length": 272.3870849609375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.059494763612747, "epoch": 0.00133, "frac_reward_zero_std": 0.25, "grad_norm": 0.11442829668521881, "kl": 0.7576182745397091, "learning_rate": 9.999995645518729e-06, "loss": -0.0071, "num_tokens": 2902043.0, "reward": 0.9746532440185547, "reward_std": 1.0634129047393799, "rewards/rollout_reward_func/mean": 0.9746532440185547, "rewards/rollout_reward_func/std": 1.821220874786377, "sampling/importance_sampling_ratio/max": 0.5382712483406067, "sampling/importance_sampling_ratio/mean": 0.2205704152584076, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.749239921569824, "sampling/sampling_logp_difference/mean": 1.3302011489868164, "step": 133, "step_time": 10.265796142999534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0015822785208001733, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015822785208001733, "entropy": 5.06514573097229, "epoch": 0.00134, "grad_norm": 0.029743917286396027, "kl": 0.6999533139169216, "learning_rate": 9.999995555272829e-06, "loss": -0.0074, "step": 134, "step_time": 5.78055331199721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1402.0, "completions/max_terminated_length": 1402.0, "completions/mean_length": 456.71875, "completions/mean_terminated_length": 456.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.023734390735626, "epoch": 0.00135, "frac_reward_zero_std": 0.0, "grad_norm": 0.096840500831604, "kl": 0.38951464369893074, "learning_rate": 9.99999546410133e-06, "loss": -0.0186, "num_tokens": 2954210.0, "reward": 0.5871058702468872, "reward_std": 1.5889184474945068, "rewards/rollout_reward_func/mean": 0.5871058702468872, "rewards/rollout_reward_func/std": 1.5613205432891846, "sampling/importance_sampling_ratio/max": 0.2892765402793884, "sampling/importance_sampling_ratio/mean": 0.10191500186920166, "sampling/importance_sampling_ratio/min": 3.5683587161494668e-12, "sampling/sampling_logp_difference/max": 13.194729804992676, "sampling/sampling_logp_difference/mean": 1.3214075565338135, "step": 135, "step_time": 10.331662518994563 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 6.015859395265579, "epoch": 0.00136, "grad_norm": 0.056958794593811035, "kl": 0.38675643131136894, "learning_rate": 9.999995372004231e-06, "loss": -0.0187, "step": 136, "step_time": 6.2196079720015405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 152.03125, "completions/mean_terminated_length": 152.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.075616180896759, "epoch": 0.00137, "frac_reward_zero_std": 0.0, "grad_norm": 0.06973341107368469, "kl": 0.525558540597558, "learning_rate": 9.999995278981537e-06, "loss": -0.0171, "num_tokens": 2994081.0, "reward": -0.07382479310035706, "reward_std": 1.223862648010254, "rewards/rollout_reward_func/mean": -0.07382479310035706, "rewards/rollout_reward_func/std": 1.5500718355178833, "sampling/importance_sampling_ratio/max": 0.5303486585617065, "sampling/importance_sampling_ratio/mean": 0.15351749956607819, "sampling/importance_sampling_ratio/min": 8.270180844873581e-21, "sampling/sampling_logp_difference/max": 13.568310737609863, "sampling/sampling_logp_difference/mean": 1.3080222606658936, "step": 137, "step_time": 8.686682781000854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.12395833386108279, "clip_ratio/low_min": 0.03125, "clip_ratio/region_mean": 0.12395833386108279, "entropy": 6.181974083185196, "epoch": 0.00138, "grad_norm": 0.03698393329977989, "kl": 0.5005155615508556, "learning_rate": 9.999995185033245e-06, "loss": -0.0172, "step": 138, "step_time": 4.573563836000176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 256.71875, "completions/mean_terminated_length": 256.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.657118648290634, "epoch": 0.00139, "frac_reward_zero_std": 0.0, "grad_norm": 0.16534361243247986, "kl": 0.46016446873545647, "learning_rate": 9.999995090159351e-06, "loss": -0.0084, "num_tokens": 3037630.0, "reward": 0.3255360722541809, "reward_std": 1.1773626804351807, "rewards/rollout_reward_func/mean": 0.3255360722541809, "rewards/rollout_reward_func/std": 1.5113122463226318, "sampling/importance_sampling_ratio/max": 0.5409528613090515, "sampling/importance_sampling_ratio/mean": 0.22735556960105896, "sampling/importance_sampling_ratio/min": 2.8802038087860637e-08, "sampling/sampling_logp_difference/max": 4.239226818084717, "sampling/sampling_logp_difference/mean": 1.0942423343658447, "step": 139, "step_time": 8.36585226400166 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 5.747200787067413, "epoch": 0.0014, "grad_norm": 0.09071089327335358, "kl": 0.4859937075525522, "learning_rate": 9.999994994359862e-06, "loss": -0.0088, "step": 140, "step_time": 4.765403720000904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 202.0625, "completions/mean_terminated_length": 202.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.726848840713501, "epoch": 0.00141, "frac_reward_zero_std": 0.0, "grad_norm": 0.26324036717414856, "kl": 0.5968099385499954, "learning_rate": 9.999994897634775e-06, "loss": -0.0095, "num_tokens": 3076466.0, "reward": 0.5245130658149719, "reward_std": 1.3122658729553223, "rewards/rollout_reward_func/mean": 0.5245130658149719, "rewards/rollout_reward_func/std": 1.4528554677963257, "sampling/importance_sampling_ratio/max": 0.5299118161201477, "sampling/importance_sampling_ratio/mean": 0.19099047780036926, "sampling/importance_sampling_ratio/min": 1.3519149888452375e-06, "sampling/sampling_logp_difference/max": 4.609641075134277, "sampling/sampling_logp_difference/mean": 1.0942786931991577, "step": 141, "step_time": 8.589285444002599 }, { "clip_ratio/high_max": 0.07500000018626451, "clip_ratio/high_mean": 0.03750000009313226, "clip_ratio/low_mean": 0.041666666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.07916666707023978, "entropy": 5.579538941383362, "epoch": 0.00142, "grad_norm": 0.03808732330799103, "kl": 0.634294580668211, "learning_rate": 9.999994799984088e-06, "loss": -0.01, "step": 142, "step_time": 5.2748738569989655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1252.0, "completions/max_terminated_length": 1252.0, "completions/mean_length": 232.84375, "completions/mean_terminated_length": 232.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.350965142250061, "epoch": 0.00143, "frac_reward_zero_std": 0.0, "grad_norm": 0.14498719573020935, "kl": 0.6181293800473213, "learning_rate": 9.999994701407805e-06, "loss": -0.0109, "num_tokens": 3118303.0, "reward": 1.1550862789154053, "reward_std": 1.407928705215454, "rewards/rollout_reward_func/mean": 1.1550862789154053, "rewards/rollout_reward_func/std": 1.4551308155059814, "sampling/importance_sampling_ratio/max": 0.5454234480857849, "sampling/importance_sampling_ratio/mean": 0.23339049518108368, "sampling/importance_sampling_ratio/min": 1.354344725257306e-08, "sampling/sampling_logp_difference/max": 3.558464527130127, "sampling/sampling_logp_difference/mean": 0.9433708190917969, "step": 143, "step_time": 9.970181299000615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03645833395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03645833395421505, "entropy": 5.445265889167786, "epoch": 0.00144, "grad_norm": 0.056469447910785675, "kl": 0.6147040463984013, "learning_rate": 9.999994601905921e-06, "loss": -0.0112, "step": 144, "step_time": 5.4081195409980864 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.03125, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 328.5625, "completions/mean_terminated_length": 318.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.544199705123901, "epoch": 0.00145, "frac_reward_zero_std": 0.0, "grad_norm": 0.09949894994497299, "kl": 0.5922725312411785, "learning_rate": 9.999994501478441e-06, "loss": -0.0193, "num_tokens": 3165055.0, "reward": 0.5348415970802307, "reward_std": 1.5779697895050049, "rewards/rollout_reward_func/mean": 0.5348415970802307, "rewards/rollout_reward_func/std": 1.8612157106399536, "sampling/importance_sampling_ratio/max": 0.5469542741775513, "sampling/importance_sampling_ratio/mean": 0.1988597810268402, "sampling/importance_sampling_ratio/min": 1.4799239718377916e-30, "sampling/sampling_logp_difference/max": 13.341350555419922, "sampling/sampling_logp_difference/mean": 1.1088839769363403, "step": 145, "step_time": 8.817966873997648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 5.542767405509949, "epoch": 0.00146, "grad_norm": 0.0913492739200592, "kl": 0.5852375291287899, "learning_rate": 9.999994400125363e-06, "loss": -0.0196, "step": 146, "step_time": 4.818265863998022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00937500037252903, "clip_ratio/low_min": 0.0062500000931322575, "clip_ratio/region_mean": 0.00937500037252903, "completions/clipped_ratio": 0.15625, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 329.78125, "completions/mean_terminated_length": 306.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.420652151107788, "epoch": 0.00147, "frac_reward_zero_std": 0.0, "grad_norm": 0.29510030150413513, "kl": 0.3857837487012148, "learning_rate": 9.999994297846687e-06, "loss": -0.0245, "num_tokens": 3211221.0, "reward": 0.7677022218704224, "reward_std": 1.730741262435913, "rewards/rollout_reward_func/mean": 0.7677022218704224, "rewards/rollout_reward_func/std": 1.6786185503005981, "sampling/importance_sampling_ratio/max": 0.5375179648399353, "sampling/importance_sampling_ratio/mean": 0.14485961198806763, "sampling/importance_sampling_ratio/min": 5.253967941577032e-18, "sampling/sampling_logp_difference/max": 4.3326826095581055, "sampling/sampling_logp_difference/mean": 1.3817838430404663, "step": 147, "step_time": 9.08322525299991 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.01065340917557478, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013778409222140908, "entropy": 6.427646636962891, "epoch": 0.00148, "grad_norm": 0.021598653867840767, "kl": 0.384829880669713, "learning_rate": 9.999994194642413e-06, "loss": -0.0248, "step": 148, "step_time": 5.884060685006261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0024038462433964014, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024038462433964014, "completions/clipped_ratio": 0.0, "completions/max_length": 1634.0, "completions/max_terminated_length": 1634.0, "completions/mean_length": 572.59375, "completions/mean_terminated_length": 572.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.716081321239471, "epoch": 0.00149, "frac_reward_zero_std": 0.0, "grad_norm": 0.02412598952651024, "kl": 0.5214381571859121, "learning_rate": 9.99999409051254e-06, "loss": -0.0062, "num_tokens": 3266106.0, "reward": 0.8084443807601929, "reward_std": 1.608506679534912, "rewards/rollout_reward_func/mean": 0.8084443807601929, "rewards/rollout_reward_func/std": 1.8740150928497314, "sampling/importance_sampling_ratio/max": 0.35724881291389465, "sampling/importance_sampling_ratio/mean": 0.06643085926771164, "sampling/importance_sampling_ratio/min": 1.711776272073174e-25, "sampling/sampling_logp_difference/max": 11.3582124710083, "sampling/sampling_logp_difference/mean": 1.4573895931243896, "step": 149, "step_time": 11.15577777800354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 6.698237925767899, "epoch": 0.0015, "grad_norm": 0.02065202407538891, "kl": 0.5115248821675777, "learning_rate": 9.999993985457072e-06, "loss": -0.0062, "step": 150, "step_time": 6.200609722996887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1270.0, "completions/max_terminated_length": 1270.0, "completions/mean_length": 420.375, "completions/mean_terminated_length": 420.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.743380188941956, "epoch": 0.00151, "frac_reward_zero_std": 0.25, "grad_norm": 0.08343182504177094, "kl": 0.619354072958231, "learning_rate": 9.999993879476003e-06, "loss": -0.0054, "num_tokens": 3313477.0, "reward": 0.9337829947471619, "reward_std": 1.1797125339508057, "rewards/rollout_reward_func/mean": 0.9337829947471619, "rewards/rollout_reward_func/std": 1.5069445371627808, "sampling/importance_sampling_ratio/max": 0.5402515530586243, "sampling/importance_sampling_ratio/mean": 0.16568925976753235, "sampling/importance_sampling_ratio/min": 5.5183573944401435e-15, "sampling/sampling_logp_difference/max": 11.288495063781738, "sampling/sampling_logp_difference/mean": 1.175795078277588, "step": 151, "step_time": 10.222675440001694 }, { "clip_ratio/high_max": 0.043750000186264515, "clip_ratio/high_mean": 0.021875000093132257, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028125000186264515, "entropy": 5.719349235296249, "epoch": 0.00152, "grad_norm": 0.03260941430926323, "kl": 0.6179546937346458, "learning_rate": 9.999993772569339e-06, "loss": -0.0056, "step": 152, "step_time": 5.591452245000255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1087.0, "completions/max_terminated_length": 1087.0, "completions/mean_length": 404.25, "completions/mean_terminated_length": 404.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.79553359746933, "epoch": 0.00153, "frac_reward_zero_std": 0.0, "grad_norm": 0.2938048839569092, "kl": 0.6594468373805285, "learning_rate": 9.999993664737076e-06, "loss": -0.0087, "num_tokens": 3362299.0, "reward": 0.7172097563743591, "reward_std": 1.767270803451538, "rewards/rollout_reward_func/mean": 0.7172097563743591, "rewards/rollout_reward_func/std": 1.7848536968231201, "sampling/importance_sampling_ratio/max": 0.3302801549434662, "sampling/importance_sampling_ratio/mean": 0.06501597911119461, "sampling/importance_sampling_ratio/min": 4.700225126642865e-13, "sampling/sampling_logp_difference/max": 12.837610244750977, "sampling/sampling_logp_difference/mean": 1.4568088054656982, "step": 153, "step_time": 9.263248439003291 }, { "clip_ratio/high_max": 0.028125000186264515, "clip_ratio/high_mean": 0.014062500093132257, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020312500186264515, "entropy": 6.784326195716858, "epoch": 0.00154, "grad_norm": 0.042445454746484756, "kl": 0.3237547315657139, "learning_rate": 9.999993555979215e-06, "loss": -0.01, "step": 154, "step_time": 5.456604184000753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1219.0, "completions/max_terminated_length": 1219.0, "completions/mean_length": 350.4375, "completions/mean_terminated_length": 350.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.976927816867828, "epoch": 0.00155, "frac_reward_zero_std": 0.5, "grad_norm": 0.020192181691527367, "kl": 0.48494697734713554, "learning_rate": 9.999993446295754e-06, "loss": -0.0071, "num_tokens": 3407222.0, "reward": 1.4804209470748901, "reward_std": 0.777044951915741, "rewards/rollout_reward_func/mean": 1.4804209470748901, "rewards/rollout_reward_func/std": 1.1609394550323486, "sampling/importance_sampling_ratio/max": 0.5524735450744629, "sampling/importance_sampling_ratio/mean": 0.23949170112609863, "sampling/importance_sampling_ratio/min": 2.438021908801602e-07, "sampling/sampling_logp_difference/max": 2.7963478565216064, "sampling/sampling_logp_difference/mean": 1.0860295295715332, "step": 155, "step_time": 9.355763131998174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.963953197002411, "epoch": 0.00156, "grad_norm": 0.01975266821682453, "kl": 0.4849714897572994, "learning_rate": 9.999993335686697e-06, "loss": -0.0071, "step": 156, "step_time": 5.359726311997292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 362.75, "completions/mean_terminated_length": 362.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.557612299919128, "epoch": 0.00157, "frac_reward_zero_std": 0.0, "grad_norm": 0.07156626135110855, "kl": 0.36865008249878883, "learning_rate": 9.999993224152043e-06, "loss": -0.0152, "num_tokens": 3454426.0, "reward": 0.5658593773841858, "reward_std": 1.8324594497680664, "rewards/rollout_reward_func/mean": 0.5658593773841858, "rewards/rollout_reward_func/std": 1.801273226737976, "sampling/importance_sampling_ratio/max": 0.5471495389938354, "sampling/importance_sampling_ratio/mean": 0.12589004635810852, "sampling/importance_sampling_ratio/min": 4.972985134799046e-09, "sampling/sampling_logp_difference/max": 3.589749574661255, "sampling/sampling_logp_difference/mean": 1.256396770477295, "step": 157, "step_time": 8.876376242000333 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012276785913854837, "entropy": 6.530344784259796, "epoch": 0.00158, "grad_norm": 0.029787888750433922, "kl": 0.3682572655379772, "learning_rate": 9.999993111691792e-06, "loss": -0.0153, "step": 158, "step_time": 4.910718766001082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1594.0, "completions/max_terminated_length": 1594.0, "completions/mean_length": 310.375, "completions/mean_terminated_length": 310.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.460212409496307, "epoch": 0.00159, "frac_reward_zero_std": 0.5, "grad_norm": 0.02835588902235031, "kl": 0.5374187082052231, "learning_rate": 9.999992998305941e-06, "loss": -0.0028, "num_tokens": 3497244.0, "reward": 1.1894067525863647, "reward_std": 0.7383108139038086, "rewards/rollout_reward_func/mean": 1.1894067525863647, "rewards/rollout_reward_func/std": 1.4968420267105103, "sampling/importance_sampling_ratio/max": 0.5524495244026184, "sampling/importance_sampling_ratio/mean": 0.25248873233795166, "sampling/importance_sampling_ratio/min": 8.010410965653136e-07, "sampling/sampling_logp_difference/max": 3.1090216636657715, "sampling/sampling_logp_difference/mean": 1.0261459350585938, "step": 159, "step_time": 10.670578221001051 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 5.432766407728195, "epoch": 0.0016, "grad_norm": 0.020340725779533386, "kl": 0.5383671633899212, "learning_rate": 9.999992883994494e-06, "loss": -0.0028, "step": 160, "step_time": 6.3994470779962285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1108.0, "completions/max_terminated_length": 1108.0, "completions/mean_length": 266.40625, "completions/mean_terminated_length": 274.4838562011719, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.271368682384491, "epoch": 0.00161, "frac_reward_zero_std": 0.0, "grad_norm": 0.12705928087234497, "kl": 0.49628811702132225, "learning_rate": 9.999992768757449e-06, "loss": -0.0218, "num_tokens": 3539657.0, "reward": 0.9759573936462402, "reward_std": 1.620126724243164, "rewards/rollout_reward_func/mean": 0.9759573936462402, "rewards/rollout_reward_func/std": 1.5594216585159302, "sampling/importance_sampling_ratio/max": 0.5443903207778931, "sampling/importance_sampling_ratio/mean": 0.2562691569328308, "sampling/importance_sampling_ratio/min": 5.054309401564261e-11, "sampling/sampling_logp_difference/max": 3.87623929977417, "sampling/sampling_logp_difference/mean": 1.0143762826919556, "step": 161, "step_time": 8.926538710999012 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 5.222929507493973, "epoch": 0.00162, "grad_norm": 0.1310614049434662, "kl": 0.5034007281064987, "learning_rate": 9.999992652594807e-06, "loss": -0.0223, "step": 162, "step_time": 5.118272855001123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 82.5, "completions/mean_terminated_length": 82.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.705794394016266, "epoch": 0.00163, "frac_reward_zero_std": 0.75, "grad_norm": 0.03128790110349655, "kl": 0.8748921379446983, "learning_rate": 9.999992535506568e-06, "loss": -0.0003, "num_tokens": 3573010.0, "reward": 1.1238605976104736, "reward_std": 0.4045417904853821, "rewards/rollout_reward_func/mean": 1.1238605976104736, "rewards/rollout_reward_func/std": 1.2640471458435059, "sampling/importance_sampling_ratio/max": 0.5497341156005859, "sampling/importance_sampling_ratio/mean": 0.4179760813713074, "sampling/importance_sampling_ratio/min": 0.0008850442827679217, "sampling/sampling_logp_difference/max": 2.6361653804779053, "sampling/sampling_logp_difference/mean": 0.5517135858535767, "step": 163, "step_time": 7.0462583050011744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.6765071153640747, "epoch": 0.00164, "grad_norm": 0.05255028232932091, "kl": 0.969824306666851, "learning_rate": 9.99999241749273e-06, "loss": -0.0002, "step": 164, "step_time": 3.9924210520021006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 145.59375, "completions/mean_terminated_length": 145.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.976941674947739, "epoch": 0.00165, "frac_reward_zero_std": 0.0, "grad_norm": 0.09259577095508575, "kl": 0.44746802002191544, "learning_rate": 9.999992298553295e-06, "loss": -0.0098, "num_tokens": 3611662.0, "reward": 0.8316409587860107, "reward_std": 1.668660044670105, "rewards/rollout_reward_func/mean": 0.8316409587860107, "rewards/rollout_reward_func/std": 1.6130337715148926, "sampling/importance_sampling_ratio/max": 0.5561427474021912, "sampling/importance_sampling_ratio/mean": 0.2730861008167267, "sampling/importance_sampling_ratio/min": 3.842557877638377e-22, "sampling/sampling_logp_difference/max": 11.8527250289917, "sampling/sampling_logp_difference/mean": 1.016627311706543, "step": 165, "step_time": 9.159016465000605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 4.965047270059586, "epoch": 0.00166, "grad_norm": 0.09502418339252472, "kl": 0.4395981952548027, "learning_rate": 9.999992178688262e-06, "loss": -0.01, "step": 166, "step_time": 4.526903657995717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 917.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 166.78125, "completions/mean_terminated_length": 166.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.213619232177734, "epoch": 0.00167, "frac_reward_zero_std": 0.25, "grad_norm": 0.07700156420469284, "kl": 0.5283562056720257, "learning_rate": 9.999992057897633e-06, "loss": -0.0106, "num_tokens": 3651346.0, "reward": 0.6084503531455994, "reward_std": 0.9882460236549377, "rewards/rollout_reward_func/mean": 0.6084503531455994, "rewards/rollout_reward_func/std": 1.5716466903686523, "sampling/importance_sampling_ratio/max": 0.553176999092102, "sampling/importance_sampling_ratio/mean": 0.27047812938690186, "sampling/importance_sampling_ratio/min": 6.847552867839113e-05, "sampling/sampling_logp_difference/max": 2.4595117568969727, "sampling/sampling_logp_difference/mean": 0.9074888229370117, "step": 167, "step_time": 8.346014934997584 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018229166977107525, "entropy": 5.16839262843132, "epoch": 0.00168, "grad_norm": 0.047545596957206726, "kl": 0.5280695781111717, "learning_rate": 9.999991936181406e-06, "loss": -0.0107, "step": 168, "step_time": 4.638782616999379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1725.0, "completions/max_terminated_length": 1725.0, "completions/mean_length": 354.875, "completions/mean_terminated_length": 354.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.955816030502319, "epoch": 0.00169, "frac_reward_zero_std": 0.0, "grad_norm": 0.11261148750782013, "kl": 0.8029100820422173, "learning_rate": 9.999991813539582e-06, "loss": -0.0098, "num_tokens": 3696369.0, "reward": 0.6662740707397461, "reward_std": 1.5012513399124146, "rewards/rollout_reward_func/mean": 0.6662740707397461, "rewards/rollout_reward_func/std": 1.8583002090454102, "sampling/importance_sampling_ratio/max": 0.5562251210212708, "sampling/importance_sampling_ratio/mean": 0.2357712984085083, "sampling/importance_sampling_ratio/min": 5.10116387886228e-06, "sampling/sampling_logp_difference/max": 2.705940008163452, "sampling/sampling_logp_difference/mean": 0.7665170431137085, "step": 169, "step_time": 11.466522851000263 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03645833348855376, "entropy": 4.846392661333084, "epoch": 0.0017, "grad_norm": 0.08255884051322937, "kl": 0.817722525447607, "learning_rate": 9.999991689972159e-06, "loss": -0.0104, "step": 170, "step_time": 6.42272757900173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 177.875, "completions/mean_terminated_length": 177.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.34237664937973, "epoch": 0.00171, "frac_reward_zero_std": 0.25, "grad_norm": 0.07669908553361893, "kl": 0.4646000824868679, "learning_rate": 9.999991565479141e-06, "loss": -0.0079, "num_tokens": 3736261.0, "reward": 0.7481116056442261, "reward_std": 1.0200226306915283, "rewards/rollout_reward_func/mean": 0.7481116056442261, "rewards/rollout_reward_func/std": 1.4853065013885498, "sampling/importance_sampling_ratio/max": 0.6421053409576416, "sampling/importance_sampling_ratio/mean": 0.2677951455116272, "sampling/importance_sampling_ratio/min": 1.272704055858831e-12, "sampling/sampling_logp_difference/max": 4.016266345977783, "sampling/sampling_logp_difference/mean": 0.9971870183944702, "step": 171, "step_time": 8.052830951999567 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018229166977107525, "entropy": 5.311924040317535, "epoch": 0.00172, "grad_norm": 0.04414352774620056, "kl": 0.4668066892772913, "learning_rate": 9.999991440060524e-06, "loss": -0.0079, "step": 172, "step_time": 3.9593631380030274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1385.0, "completions/max_terminated_length": 1385.0, "completions/mean_length": 291.0, "completions/mean_terminated_length": 291.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.957534611225128, "epoch": 0.00173, "frac_reward_zero_std": 0.0, "grad_norm": 0.07133731991052628, "kl": 0.3659055456519127, "learning_rate": 9.99999131371631e-06, "loss": -0.0138, "num_tokens": 3780619.0, "reward": 1.230137586593628, "reward_std": 1.6522855758666992, "rewards/rollout_reward_func/mean": 1.230137586593628, "rewards/rollout_reward_func/std": 1.6534727811813354, "sampling/importance_sampling_ratio/max": 0.5533027648925781, "sampling/importance_sampling_ratio/mean": 0.1785704791545868, "sampling/importance_sampling_ratio/min": 5.316166381530916e-17, "sampling/sampling_logp_difference/max": 9.174009323120117, "sampling/sampling_logp_difference/mean": 1.2334866523742676, "step": 173, "step_time": 10.526826801002244 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0062806373462080956, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011488970834761858, "entropy": 5.987121224403381, "epoch": 0.00174, "grad_norm": 0.057008206844329834, "kl": 0.3569801915436983, "learning_rate": 9.999991186446498e-06, "loss": -0.0139, "step": 174, "step_time": 5.728656974997648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 1042.0, "completions/max_terminated_length": 1042.0, "completions/mean_length": 317.8125, "completions/mean_terminated_length": 317.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.135067820549011, "epoch": 0.00175, "frac_reward_zero_std": 0.0, "grad_norm": 0.0968991369009018, "kl": 0.4115459620952606, "learning_rate": 9.99999105825109e-06, "loss": -0.0108, "num_tokens": 3827041.0, "reward": -0.028134871274232864, "reward_std": 1.021851658821106, "rewards/rollout_reward_func/mean": -0.028134871274232864, "rewards/rollout_reward_func/std": 1.3559147119522095, "sampling/importance_sampling_ratio/max": 0.5127630233764648, "sampling/importance_sampling_ratio/mean": 0.15375852584838867, "sampling/importance_sampling_ratio/min": 1.8468339249011478e-06, "sampling/sampling_logp_difference/max": 3.556701183319092, "sampling/sampling_logp_difference/mean": 1.0764145851135254, "step": 175, "step_time": 9.444200449001073 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.054687500931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06093750149011612, "entropy": 6.219360649585724, "epoch": 0.00176, "grad_norm": 0.07355621457099915, "kl": 0.3992091342806816, "learning_rate": 9.999990929130086e-06, "loss": -0.0112, "step": 176, "step_time": 5.5850061510009255 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.03125, "completions/max_length": 1489.0, "completions/max_terminated_length": 1489.0, "completions/mean_length": 299.84375, "completions/mean_terminated_length": 296.3548278808594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.050711095333099, "epoch": 0.00177, "frac_reward_zero_std": 0.25, "grad_norm": 0.06759195774793625, "kl": 0.4806966111063957, "learning_rate": 9.999990799083483e-06, "loss": 0.0019, "num_tokens": 3870405.0, "reward": 0.6735219359397888, "reward_std": 1.1138962507247925, "rewards/rollout_reward_func/mean": 0.6735219359397888, "rewards/rollout_reward_func/std": 1.5127373933792114, "sampling/importance_sampling_ratio/max": 0.5526368021965027, "sampling/importance_sampling_ratio/mean": 0.2922145426273346, "sampling/importance_sampling_ratio/min": 1.4729282247008535e-12, "sampling/sampling_logp_difference/max": 3.825349807739258, "sampling/sampling_logp_difference/mean": 0.8867816925048828, "step": 177, "step_time": 10.659176168999693 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.03645833348855376, "clip_ratio/low_min": 0.03125, "clip_ratio/region_mean": 0.04270833358168602, "entropy": 5.0936920046806335, "epoch": 0.00178, "grad_norm": 0.070233054459095, "kl": 0.47114233672618866, "learning_rate": 9.999990668111284e-06, "loss": 0.0018, "step": 178, "step_time": 5.962138981001772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1306.0, "completions/max_terminated_length": 1306.0, "completions/mean_length": 479.59375, "completions/mean_terminated_length": 481.6773986816406, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.927120327949524, "epoch": 0.00179, "frac_reward_zero_std": 0.25, "grad_norm": 0.05900624766945839, "kl": 0.49644314870238304, "learning_rate": 9.999990536213489e-06, "loss": -0.0066, "num_tokens": 3920254.0, "reward": 1.1663615703582764, "reward_std": 1.3172894716262817, "rewards/rollout_reward_func/mean": 1.1663615703582764, "rewards/rollout_reward_func/std": 1.592987298965454, "sampling/importance_sampling_ratio/max": 0.5482404828071594, "sampling/importance_sampling_ratio/mean": 0.179252490401268, "sampling/importance_sampling_ratio/min": 4.5864189182620585e-12, "sampling/sampling_logp_difference/max": 4.79319953918457, "sampling/sampling_logp_difference/mean": 1.0514920949935913, "step": 179, "step_time": 10.36788942199928 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012276785913854837, "entropy": 5.967328310012817, "epoch": 0.0018, "grad_norm": 0.02552778832614422, "kl": 0.49383626878261566, "learning_rate": 9.999990403390095e-06, "loss": -0.0068, "step": 180, "step_time": 5.621117488999516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 1739.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 448.71875, "completions/mean_terminated_length": 448.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.227092444896698, "epoch": 0.00181, "frac_reward_zero_std": 0.25, "grad_norm": 0.0778268352150917, "kl": 0.5568580236285925, "learning_rate": 9.999990269641104e-06, "loss": -0.0028, "num_tokens": 3969358.0, "reward": -0.05799638479948044, "reward_std": 1.191164493560791, "rewards/rollout_reward_func/mean": -0.05799638479948044, "rewards/rollout_reward_func/std": 1.5059521198272705, "sampling/importance_sampling_ratio/max": 0.5525257587432861, "sampling/importance_sampling_ratio/mean": 0.25480154156684875, "sampling/importance_sampling_ratio/min": 7.332294385792548e-19, "sampling/sampling_logp_difference/max": 4.69362211227417, "sampling/sampling_logp_difference/mean": 1.0694538354873657, "step": 181, "step_time": 11.58369078600117 }, { "clip_ratio/high_max": 0.061011905781924725, "clip_ratio/high_mean": 0.030505952890962362, "clip_ratio/low_mean": 0.024643640965223312, "clip_ratio/low_min": 0.00657894741743803, "clip_ratio/region_mean": 0.055149593856185675, "entropy": 5.3043752908706665, "epoch": 0.00182, "grad_norm": 0.0388735756278038, "kl": 0.5593919493257999, "learning_rate": 9.999990134966518e-06, "loss": -0.0034, "step": 182, "step_time": 6.9211284420016455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2241.0, "completions/max_terminated_length": 2241.0, "completions/mean_length": 519.84375, "completions/mean_terminated_length": 519.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.926717638969421, "epoch": 0.00183, "frac_reward_zero_std": 0.25, "grad_norm": 0.03752990439534187, "kl": 0.5144079811871052, "learning_rate": 9.999989999366333e-06, "loss": -0.0095, "num_tokens": 4020094.0, "reward": 0.8245809078216553, "reward_std": 1.6425037384033203, "rewards/rollout_reward_func/mean": 0.8245809078216553, "rewards/rollout_reward_func/std": 2.0338006019592285, "sampling/importance_sampling_ratio/max": 0.5255031585693359, "sampling/importance_sampling_ratio/mean": 0.17194995284080505, "sampling/importance_sampling_ratio/min": 3.976845030922262e-31, "sampling/sampling_logp_difference/max": 4.415557861328125, "sampling/sampling_logp_difference/mean": 1.1365923881530762, "step": 183, "step_time": 13.768604946995765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 5.97254341840744, "epoch": 0.00184, "grad_norm": 0.04853087663650513, "kl": 0.5095285959541798, "learning_rate": 9.999989862840553e-06, "loss": -0.0094, "step": 184, "step_time": 7.411158719000014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1299.0, "completions/max_terminated_length": 1299.0, "completions/mean_length": 577.78125, "completions/mean_terminated_length": 599.300048828125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.484279632568359, "epoch": 0.00185, "frac_reward_zero_std": 0.0, "grad_norm": 0.1085602194070816, "kl": 0.41525014862418175, "learning_rate": 9.999989725389174e-06, "loss": -0.0071, "num_tokens": 4074814.0, "reward": 0.25989145040512085, "reward_std": 1.6954600811004639, "rewards/rollout_reward_func/mean": 0.25989145040512085, "rewards/rollout_reward_func/std": 1.719590425491333, "sampling/importance_sampling_ratio/max": 0.5405915379524231, "sampling/importance_sampling_ratio/mean": 0.0650598481297493, "sampling/importance_sampling_ratio/min": 4.021739316933326e-18, "sampling/sampling_logp_difference/max": 14.764680862426758, "sampling/sampling_logp_difference/mean": 1.352997064590454, "step": 185, "step_time": 10.357603460000973 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.015625000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026041667442768812, "entropy": 6.448411166667938, "epoch": 0.00186, "grad_norm": 0.0882006287574768, "kl": 0.4281228333711624, "learning_rate": 9.9999895870122e-06, "loss": -0.0073, "step": 186, "step_time": 5.598368594997737 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.03125, "completions/max_length": 916.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 460.15625, "completions/mean_terminated_length": 450.06451416015625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.002282619476318, "epoch": 0.00187, "frac_reward_zero_std": 0.0, "grad_norm": 0.050232402980327606, "kl": 0.6270738206803799, "learning_rate": 9.999989447709628e-06, "loss": -0.0174, "num_tokens": 4126341.0, "reward": 0.11134302616119385, "reward_std": 2.0587821006774902, "rewards/rollout_reward_func/mean": 0.11134302616119385, "rewards/rollout_reward_func/std": 2.3266868591308594, "sampling/importance_sampling_ratio/max": 0.46557074785232544, "sampling/importance_sampling_ratio/mean": 0.13627785444259644, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.920948505401611, "sampling/sampling_logp_difference/mean": 1.2818732261657715, "step": 187, "step_time": 9.648845336003433 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 5.945888012647629, "epoch": 0.00188, "grad_norm": 0.042693741619586945, "kl": 0.6327341198921204, "learning_rate": 9.99998930748146e-06, "loss": -0.0176, "step": 188, "step_time": 5.5798524799974984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 382.0625, "completions/mean_terminated_length": 382.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.778222173452377, "epoch": 0.00189, "frac_reward_zero_std": 0.0, "grad_norm": 0.023183109238743782, "kl": 0.4832927491515875, "learning_rate": 9.999989166327695e-06, "loss": -0.0202, "num_tokens": 4175619.0, "reward": 0.8100322484970093, "reward_std": 1.5623064041137695, "rewards/rollout_reward_func/mean": 0.8100322484970093, "rewards/rollout_reward_func/std": 1.8035727739334106, "sampling/importance_sampling_ratio/max": 0.5118405818939209, "sampling/importance_sampling_ratio/mean": 0.1435375064611435, "sampling/importance_sampling_ratio/min": 1.0402826869897564e-17, "sampling/sampling_logp_difference/max": 2.9993107318878174, "sampling/sampling_logp_difference/mean": 1.0713605880737305, "step": 189, "step_time": 8.925270419002118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.739318251609802, "epoch": 0.0019, "grad_norm": 0.01966405101120472, "kl": 0.48102382104843855, "learning_rate": 9.999989024248333e-06, "loss": -0.0203, "step": 190, "step_time": 4.84443004700006 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0625, "completions/max_length": 1159.0, "completions/max_terminated_length": 1159.0, "completions/mean_length": 492.4375, "completions/mean_terminated_length": 501.9000244140625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.633974671363831, "epoch": 0.00191, "frac_reward_zero_std": 0.0, "grad_norm": 0.05310504138469696, "kl": 0.4698046762496233, "learning_rate": 9.999988881243376e-06, "loss": -0.0126, "num_tokens": 4228281.0, "reward": 0.1959221512079239, "reward_std": 1.8671578168869019, "rewards/rollout_reward_func/mean": 0.1959221512079239, "rewards/rollout_reward_func/std": 1.9828778505325317, "sampling/importance_sampling_ratio/max": 0.53467857837677, "sampling/importance_sampling_ratio/mean": 0.09168924391269684, "sampling/importance_sampling_ratio/min": 2.0099950741983923e-36, "sampling/sampling_logp_difference/max": 9.515901565551758, "sampling/sampling_logp_difference/mean": 1.5070074796676636, "step": 191, "step_time": 10.270253836002667 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.009215130528900772, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019631797506008297, "entropy": 6.611230909824371, "epoch": 0.00192, "grad_norm": 0.02176094427704811, "kl": 0.46961820870637894, "learning_rate": 9.99998873731282e-06, "loss": -0.0128, "step": 192, "step_time": 5.327718836000713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 932.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 236.96875, "completions/mean_terminated_length": 237.06451416015625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.6850475668907166, "epoch": 0.00193, "frac_reward_zero_std": 0.0, "grad_norm": 0.15953786671161652, "kl": 0.43335629254579544, "learning_rate": 9.99998859245667e-06, "loss": -0.0145, "num_tokens": 4271675.0, "reward": 0.7048100233078003, "reward_std": 1.8009850978851318, "rewards/rollout_reward_func/mean": 0.7048100233078003, "rewards/rollout_reward_func/std": 2.001183032989502, "sampling/importance_sampling_ratio/max": 0.533510684967041, "sampling/importance_sampling_ratio/mean": 0.12078151851892471, "sampling/importance_sampling_ratio/min": 8.407790785948902e-45, "sampling/sampling_logp_difference/max": 4.618961334228516, "sampling/sampling_logp_difference/mean": 1.2926521301269531, "step": 193, "step_time": 8.810922076998395 }, { "clip_ratio/high_max": 0.06250000186264515, "clip_ratio/high_mean": 0.031250000931322575, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.039062500931322575, "entropy": 6.537840306758881, "epoch": 0.00194, "grad_norm": 0.036303386092185974, "kl": 0.42241672426462173, "learning_rate": 9.999988446674922e-06, "loss": -0.0149, "step": 194, "step_time": 5.514664204001747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1132.0, "completions/max_terminated_length": 1132.0, "completions/mean_length": 465.375, "completions/mean_terminated_length": 479.8709411621094, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.608734667301178, "epoch": 0.00195, "frac_reward_zero_std": 0.0, "grad_norm": 0.17148666083812714, "kl": 0.5234170034527779, "learning_rate": 9.999988299967575e-06, "loss": -0.0199, "num_tokens": 4323105.0, "reward": 0.44169479608535767, "reward_std": 1.6200597286224365, "rewards/rollout_reward_func/mean": 0.44169479608535767, "rewards/rollout_reward_func/std": 1.927075982093811, "sampling/importance_sampling_ratio/max": 0.5377144813537598, "sampling/importance_sampling_ratio/mean": 0.14771556854248047, "sampling/importance_sampling_ratio/min": 4.685827305888197e-26, "sampling/sampling_logp_difference/max": 12.047843933105469, "sampling/sampling_logp_difference/mean": 1.2355775833129883, "step": 195, "step_time": 9.90919329299868 }, { "clip_ratio/high_max": 0.033333334140479565, "clip_ratio/high_mean": 0.02291666716337204, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02552083390764892, "entropy": 5.604083240032196, "epoch": 0.00196, "grad_norm": 0.09379852563142776, "kl": 0.4288702215999365, "learning_rate": 9.999988152334635e-06, "loss": -0.0203, "step": 196, "step_time": 5.27584279599796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1079.0, "completions/max_terminated_length": 1079.0, "completions/mean_length": 204.9375, "completions/mean_terminated_length": 194.3870849609375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.54690283536911, "epoch": 0.00197, "frac_reward_zero_std": 0.5, "grad_norm": 0.021037179976701736, "kl": 0.6129151955246925, "learning_rate": 9.999988003776098e-06, "loss": -0.0084, "num_tokens": 4362550.0, "reward": 1.1271319389343262, "reward_std": 0.804468035697937, "rewards/rollout_reward_func/mean": 1.1271319389343262, "rewards/rollout_reward_func/std": 1.7291862964630127, "sampling/importance_sampling_ratio/max": 0.5456136465072632, "sampling/importance_sampling_ratio/mean": 0.35906317830085754, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 7.437076091766357, "sampling/sampling_logp_difference/mean": 1.1850206851959229, "step": 197, "step_time": 9.198634773998492 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.004697389260400087, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010947389353532344, "entropy": 4.5655679404735565, "epoch": 0.00198, "grad_norm": 0.021192507818341255, "kl": 0.6104511320590973, "learning_rate": 9.999987854291966e-06, "loss": -0.0084, "step": 198, "step_time": 5.033952443000089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1205.0, "completions/max_terminated_length": 1205.0, "completions/mean_length": 350.125, "completions/mean_terminated_length": 350.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.280531108379364, "epoch": 0.00199, "frac_reward_zero_std": 0.25, "grad_norm": 0.38074564933776855, "kl": 0.6126711554825306, "learning_rate": 9.999987703882235e-06, "loss": -0.0075, "num_tokens": 4407712.0, "reward": 1.1138367652893066, "reward_std": 1.2647085189819336, "rewards/rollout_reward_func/mean": 1.1138367652893066, "rewards/rollout_reward_func/std": 1.9577386379241943, "sampling/importance_sampling_ratio/max": 0.5473842024803162, "sampling/importance_sampling_ratio/mean": 0.28862839937210083, "sampling/importance_sampling_ratio/min": 2.176510787773949e-40, "sampling/sampling_logp_difference/max": 6.000038146972656, "sampling/sampling_logp_difference/mean": 0.8228292465209961, "step": 199, "step_time": 10.540333507004107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 4.272719115018845, "epoch": 0.002, "grad_norm": 0.04824836179614067, "kl": 0.5727039501070976, "learning_rate": 9.999987552546909e-06, "loss": -0.0079, "step": 200, "step_time": 5.206676514000719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.03125, "completions/max_length": 1146.0, "completions/max_terminated_length": 1146.0, "completions/mean_length": 292.625, "completions/mean_terminated_length": 301.0322570800781, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.778183937072754, "epoch": 0.00201, "frac_reward_zero_std": 0.0, "grad_norm": 0.1697830706834793, "kl": 0.30634311120957136, "learning_rate": 9.999987400285985e-06, "loss": -0.0147, "num_tokens": 4452928.0, "reward": 0.28450649976730347, "reward_std": 1.4359863996505737, "rewards/rollout_reward_func/mean": 0.28450649976730347, "rewards/rollout_reward_func/std": 1.6503934860229492, "sampling/importance_sampling_ratio/max": 0.6762108206748962, "sampling/importance_sampling_ratio/mean": 0.14384080469608307, "sampling/importance_sampling_ratio/min": 1.4764199862778315e-33, "sampling/sampling_logp_difference/max": 13.874082565307617, "sampling/sampling_logp_difference/mean": 1.663938283920288, "step": 201, "step_time": 9.442577963000076 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.019507576245814562, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.029924243222922087, "entropy": 6.810061991214752, "epoch": 0.00202, "grad_norm": 0.03791467100381851, "kl": 0.305682685226202, "learning_rate": 9.999987247099467e-06, "loss": -0.0152, "step": 202, "step_time": 5.13021500600189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.03125, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 62.34375, "completions/mean_terminated_length": 63.838706970214844, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.926365494728088, "epoch": 0.00203, "frac_reward_zero_std": 0.25, "grad_norm": 0.09084746986627579, "kl": 0.5910516902804375, "learning_rate": 9.999987092987352e-06, "loss": -0.0122, "num_tokens": 4486885.0, "reward": 0.7428113222122192, "reward_std": 0.8225956559181213, "rewards/rollout_reward_func/mean": 0.7428113222122192, "rewards/rollout_reward_func/std": 1.421898603439331, "sampling/importance_sampling_ratio/max": 0.5507006645202637, "sampling/importance_sampling_ratio/mean": 0.32911941409111023, "sampling/importance_sampling_ratio/min": 3.2548269635057636e-12, "sampling/sampling_logp_difference/max": 4.845285892486572, "sampling/sampling_logp_difference/mean": 0.960766077041626, "step": 203, "step_time": 6.46900819000075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.9773023426532745, "epoch": 0.00204, "grad_norm": 0.09865245223045349, "kl": 0.5725810378789902, "learning_rate": 9.999986937949641e-06, "loss": -0.0122, "step": 204, "step_time": 3.5298662109998986 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0024038462433964014, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011332418071106076, "completions/clipped_ratio": 0.0, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 225.28125, "completions/mean_terminated_length": 225.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.545470267534256, "epoch": 0.00205, "frac_reward_zero_std": 0.0, "grad_norm": 0.06779895722866058, "kl": 0.5986991114914417, "learning_rate": 9.999986781986334e-06, "loss": -0.0052, "num_tokens": 4527614.0, "reward": 1.2748353481292725, "reward_std": 1.1812279224395752, "rewards/rollout_reward_func/mean": 1.2748353481292725, "rewards/rollout_reward_func/std": 1.184450387954712, "sampling/importance_sampling_ratio/max": 0.5532639026641846, "sampling/importance_sampling_ratio/mean": 0.329262375831604, "sampling/importance_sampling_ratio/min": 2.4957288454322045e-16, "sampling/sampling_logp_difference/max": 4.829587936401367, "sampling/sampling_logp_difference/mean": 1.0258030891418457, "step": 205, "step_time": 9.32463395800005 }, { "clip_ratio/high_max": 0.11160714365541935, "clip_ratio/high_mean": 0.055803571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.055803571827709675, "entropy": 4.464525938034058, "epoch": 0.00206, "grad_norm": 0.055603526532649994, "kl": 0.6101174242794514, "learning_rate": 9.999986625097431e-06, "loss": -0.0053, "step": 206, "step_time": 4.919812664995334 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 1320.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 462.34375, "completions/mean_terminated_length": 462.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.705791771411896, "epoch": 0.00207, "frac_reward_zero_std": 0.0, "grad_norm": 0.06313570588827133, "kl": 0.4582593012601137, "learning_rate": 9.999986467282931e-06, "loss": -0.0178, "num_tokens": 4578935.0, "reward": 1.0536935329437256, "reward_std": 1.657710075378418, "rewards/rollout_reward_func/mean": 1.0536935329437256, "rewards/rollout_reward_func/std": 1.7301957607269287, "sampling/importance_sampling_ratio/max": 0.5476665496826172, "sampling/importance_sampling_ratio/mean": 0.12923644483089447, "sampling/importance_sampling_ratio/min": 5.405500808249031e-13, "sampling/sampling_logp_difference/max": 4.1591410636901855, "sampling/sampling_logp_difference/mean": 1.1311194896697998, "step": 207, "step_time": 10.326185688001715 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 5.661291033029556, "epoch": 0.00208, "grad_norm": 0.07332885265350342, "kl": 0.4599694926291704, "learning_rate": 9.999986308542834e-06, "loss": -0.0178, "step": 208, "step_time": 5.6349571049977385 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0, "completions/max_length": 1056.0, "completions/max_terminated_length": 1056.0, "completions/mean_length": 284.0625, "completions/mean_terminated_length": 284.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.037483513355255, "epoch": 0.00209, "frac_reward_zero_std": 0.25, "grad_norm": 0.07064621150493622, "kl": 0.4769252873957157, "learning_rate": 9.999986148877143e-06, "loss": -0.01, "num_tokens": 4622393.0, "reward": 0.6237622499465942, "reward_std": 1.1161028146743774, "rewards/rollout_reward_func/mean": 0.6237622499465942, "rewards/rollout_reward_func/std": 1.4785090684890747, "sampling/importance_sampling_ratio/max": 0.5531884431838989, "sampling/importance_sampling_ratio/mean": 0.23544597625732422, "sampling/importance_sampling_ratio/min": 5.337405116821502e-23, "sampling/sampling_logp_difference/max": 4.086203098297119, "sampling/sampling_logp_difference/mean": 1.3757338523864746, "step": 209, "step_time": 9.203588907008452 }, { "clip_ratio/high_max": 0.01597222243435681, "clip_ratio/high_mean": 0.007986111217178404, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007986111217178404, "entropy": 6.016080558300018, "epoch": 0.0021, "grad_norm": 0.015436006709933281, "kl": 0.46961749345064163, "learning_rate": 9.999985988285857e-06, "loss": -0.0102, "step": 210, "step_time": 5.021962450999126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004166666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 239.65625, "completions/mean_terminated_length": 239.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.426139563322067, "epoch": 0.00211, "frac_reward_zero_std": 0.25, "grad_norm": 0.062313906848430634, "kl": 0.5304891504347324, "learning_rate": 9.999985826768975e-06, "loss": -0.0149, "num_tokens": 4666371.0, "reward": 0.9415457248687744, "reward_std": 1.2529354095458984, "rewards/rollout_reward_func/mean": 0.9415457248687744, "rewards/rollout_reward_func/std": 1.5951881408691406, "sampling/importance_sampling_ratio/max": 0.55078125, "sampling/importance_sampling_ratio/mean": 0.2398495227098465, "sampling/importance_sampling_ratio/min": 2.5038588091774727e-08, "sampling/sampling_logp_difference/max": 4.65727424621582, "sampling/sampling_logp_difference/mean": 1.0584197044372559, "step": 211, "step_time": 8.977877531993727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 5.381116092205048, "epoch": 0.00212, "grad_norm": 0.021537791937589645, "kl": 0.5263635702431202, "learning_rate": 9.999985664326495e-06, "loss": -0.015, "step": 212, "step_time": 4.082370914009516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1540.0, "completions/max_terminated_length": 1540.0, "completions/mean_length": 441.1875, "completions/mean_terminated_length": 441.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.559887230396271, "epoch": 0.00213, "frac_reward_zero_std": 0.25, "grad_norm": 0.013537976890802383, "kl": 0.5053754337131977, "learning_rate": 9.99998550095842e-06, "loss": -0.0162, "num_tokens": 4715669.0, "reward": 1.287748098373413, "reward_std": 1.3667700290679932, "rewards/rollout_reward_func/mean": 1.287748098373413, "rewards/rollout_reward_func/std": 1.5709222555160522, "sampling/importance_sampling_ratio/max": 0.5515864491462708, "sampling/importance_sampling_ratio/mean": 0.21904610097408295, "sampling/importance_sampling_ratio/min": 8.014809083078944e-08, "sampling/sampling_logp_difference/max": 2.6077325344085693, "sampling/sampling_logp_difference/mean": 0.9959990978240967, "step": 213, "step_time": 10.726121320996754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.53397136926651, "epoch": 0.00214, "grad_norm": 0.010728896595537663, "kl": 0.5079537983983755, "learning_rate": 9.999985336664749e-06, "loss": -0.0162, "step": 214, "step_time": 6.050620052996237 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.03125, "completions/max_length": 1600.0, "completions/max_terminated_length": 1600.0, "completions/mean_length": 475.875, "completions/mean_terminated_length": 475.1290283203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.184921592473984, "epoch": 0.00215, "frac_reward_zero_std": 0.25, "grad_norm": 0.038627222180366516, "kl": 0.7024459168314934, "learning_rate": 9.999985171445482e-06, "loss": -0.0106, "num_tokens": 4767101.0, "reward": 0.8611672520637512, "reward_std": 1.3453055620193481, "rewards/rollout_reward_func/mean": 0.8611672520637512, "rewards/rollout_reward_func/std": 1.7943239212036133, "sampling/importance_sampling_ratio/max": 0.5404050350189209, "sampling/importance_sampling_ratio/mean": 0.1775681972503662, "sampling/importance_sampling_ratio/min": 1.0541321981971663e-10, "sampling/sampling_logp_difference/max": 10.292664527893066, "sampling/sampling_logp_difference/mean": 0.9675716161727905, "step": 215, "step_time": 10.726247799000703 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 5.176157861948013, "epoch": 0.00216, "grad_norm": 0.028092656284570694, "kl": 0.7041196450591087, "learning_rate": 9.99998500530062e-06, "loss": -0.0106, "step": 216, "step_time": 6.469179286999861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1436.0, "completions/max_terminated_length": 1436.0, "completions/mean_length": 228.5, "completions/mean_terminated_length": 228.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.3556437492370605, "epoch": 0.00217, "frac_reward_zero_std": 0.25, "grad_norm": 0.19188183546066284, "kl": 0.5503736659884453, "learning_rate": 9.999984838230163e-06, "loss": -0.0101, "num_tokens": 4807414.0, "reward": 1.3514528274536133, "reward_std": 1.1823900938034058, "rewards/rollout_reward_func/mean": 1.3514528274536133, "rewards/rollout_reward_func/std": 1.5364975929260254, "sampling/importance_sampling_ratio/max": 0.5541555285453796, "sampling/importance_sampling_ratio/mean": 0.3371211290359497, "sampling/importance_sampling_ratio/min": 5.023943110864609e-24, "sampling/sampling_logp_difference/max": 8.790553092956543, "sampling/sampling_logp_difference/mean": 0.9337519407272339, "step": 217, "step_time": 10.361504899003194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03645833395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03645833395421505, "entropy": 4.354745537042618, "epoch": 0.00218, "grad_norm": 0.04451144114136696, "kl": 0.5603490471839905, "learning_rate": 9.999984670234109e-06, "loss": -0.0102, "step": 218, "step_time": 5.643073603998346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 235.0, "completions/mean_terminated_length": 235.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.125527232885361, "epoch": 0.00219, "frac_reward_zero_std": 0.0, "grad_norm": 0.06974940001964569, "kl": 0.5904657058417797, "learning_rate": 9.99998450131246e-06, "loss": -0.0206, "num_tokens": 4849812.0, "reward": 0.391598641872406, "reward_std": 1.5391111373901367, "rewards/rollout_reward_func/mean": 0.391598641872406, "rewards/rollout_reward_func/std": 1.7338849306106567, "sampling/importance_sampling_ratio/max": 0.5494987964630127, "sampling/importance_sampling_ratio/mean": 0.24112631380558014, "sampling/importance_sampling_ratio/min": 1.73592888492445e-18, "sampling/sampling_logp_difference/max": 4.469204425811768, "sampling/sampling_logp_difference/mean": 1.011073350906372, "step": 219, "step_time": 7.706611304001854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 5.129674971103668, "epoch": 0.0022, "grad_norm": 0.07037139683961868, "kl": 0.5848035663366318, "learning_rate": 9.999984331465216e-06, "loss": -0.0206, "step": 220, "step_time": 4.182144486003381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1362.0, "completions/max_terminated_length": 1362.0, "completions/mean_length": 546.3125, "completions/mean_terminated_length": 546.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.5657608807086945, "epoch": 0.00221, "frac_reward_zero_std": 0.0, "grad_norm": 0.017873046919703484, "kl": 0.6118797399103642, "learning_rate": 9.999984160692378e-06, "loss": -0.0145, "num_tokens": 4904338.0, "reward": 0.9363380670547485, "reward_std": 1.784096360206604, "rewards/rollout_reward_func/mean": 0.9363380670547485, "rewards/rollout_reward_func/std": 1.9403388500213623, "sampling/importance_sampling_ratio/max": 0.41132453083992004, "sampling/importance_sampling_ratio/mean": 0.11238078027963638, "sampling/importance_sampling_ratio/min": 4.0598928663371225e-09, "sampling/sampling_logp_difference/max": 4.869526386260986, "sampling/sampling_logp_difference/mean": 0.984228253364563, "step": 221, "step_time": 10.698347289006051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.572328805923462, "epoch": 0.00222, "grad_norm": 0.018735237419605255, "kl": 0.6076029725372791, "learning_rate": 9.999983988993942e-06, "loss": -0.0144, "step": 222, "step_time": 6.804170254003111 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 1452.0, "completions/max_terminated_length": 1452.0, "completions/mean_length": 310.96875, "completions/mean_terminated_length": 310.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.4720436334609985, "epoch": 0.00223, "frac_reward_zero_std": 0.25, "grad_norm": 0.1630937159061432, "kl": 0.6774674952030182, "learning_rate": 9.99998381636991e-06, "loss": -0.0071, "num_tokens": 4948443.0, "reward": 1.7771830558776855, "reward_std": 1.002368688583374, "rewards/rollout_reward_func/mean": 1.7771830558776855, "rewards/rollout_reward_func/std": 1.2602592706680298, "sampling/importance_sampling_ratio/max": 0.5537646412849426, "sampling/importance_sampling_ratio/mean": 0.28350746631622314, "sampling/importance_sampling_ratio/min": 6.617405842040958e-12, "sampling/sampling_logp_difference/max": 4.683642864227295, "sampling/sampling_logp_difference/mean": 0.8154048919677734, "step": 223, "step_time": 10.074482987998636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.021875000093132257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021875000093132257, "entropy": 4.464451551437378, "epoch": 0.00224, "grad_norm": 0.040026452392339706, "kl": 0.7037984132766724, "learning_rate": 9.999983642820286e-06, "loss": -0.0074, "step": 224, "step_time": 5.694238545002008 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 958.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 311.875, "completions/mean_terminated_length": 311.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.9656336307525635, "epoch": 0.00225, "frac_reward_zero_std": 0.0, "grad_norm": 0.024458620697259903, "kl": 0.6297993147745728, "learning_rate": 9.999983468345063e-06, "loss": -0.015, "num_tokens": 4995039.0, "reward": 0.976101279258728, "reward_std": 1.1637855768203735, "rewards/rollout_reward_func/mean": 0.976101279258728, "rewards/rollout_reward_func/std": 1.5842832326889038, "sampling/importance_sampling_ratio/max": 0.5065286159515381, "sampling/importance_sampling_ratio/mean": 0.13444875180721283, "sampling/importance_sampling_ratio/min": 1.1091989108535927e-05, "sampling/sampling_logp_difference/max": 3.4168050289154053, "sampling/sampling_logp_difference/mean": 1.0691113471984863, "step": 225, "step_time": 8.548558571998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 5.9415998458862305, "epoch": 0.00226, "grad_norm": 0.05145682394504547, "kl": 0.6814805222675204, "learning_rate": 9.999983292944247e-06, "loss": -0.0149, "step": 226, "step_time": 4.813730529000168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1586.0, "completions/max_terminated_length": 1586.0, "completions/mean_length": 383.875, "completions/mean_terminated_length": 383.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.05870908498764, "epoch": 0.00227, "frac_reward_zero_std": 0.0, "grad_norm": 0.10143199563026428, "kl": 0.59431217238307, "learning_rate": 9.999983116617835e-06, "loss": -0.0188, "num_tokens": 5042375.0, "reward": 1.2560642957687378, "reward_std": 1.568943738937378, "rewards/rollout_reward_func/mean": 1.2560642957687378, "rewards/rollout_reward_func/std": 1.5296679735183716, "sampling/importance_sampling_ratio/max": 0.5415928363800049, "sampling/importance_sampling_ratio/mean": 0.21752065420150757, "sampling/importance_sampling_ratio/min": 4.386511553294836e-12, "sampling/sampling_logp_difference/max": 13.512418746948242, "sampling/sampling_logp_difference/mean": 1.078181266784668, "step": 227, "step_time": 10.66103546499653 }, { "clip_ratio/high_max": 0.08333333395421505, "clip_ratio/high_mean": 0.041666666977107525, "clip_ratio/low_mean": 0.012620192486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05428685946390033, "entropy": 4.99944880604744, "epoch": 0.00228, "grad_norm": 0.026503240689635277, "kl": 0.6040502563118935, "learning_rate": 9.999982939365828e-06, "loss": -0.0191, "step": 228, "step_time": 6.826403917995776 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 1649.0, "completions/max_terminated_length": 1649.0, "completions/mean_length": 530.375, "completions/mean_terminated_length": 530.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.234196841716766, "epoch": 0.00229, "frac_reward_zero_std": 0.0, "grad_norm": 0.047300949692726135, "kl": 0.4863976128399372, "learning_rate": 9.999982761188226e-06, "loss": -0.0125, "num_tokens": 5095392.0, "reward": 0.53513503074646, "reward_std": 1.6788750886917114, "rewards/rollout_reward_func/mean": 0.53513503074646, "rewards/rollout_reward_func/std": 1.6560500860214233, "sampling/importance_sampling_ratio/max": 0.5444734692573547, "sampling/importance_sampling_ratio/mean": 0.13112178444862366, "sampling/importance_sampling_ratio/min": 6.255148150557943e-07, "sampling/sampling_logp_difference/max": 3.4378011226654053, "sampling/sampling_logp_difference/mean": 1.1510100364685059, "step": 229, "step_time": 11.070037635006884 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 6.216519832611084, "epoch": 0.0023, "grad_norm": 0.02691039815545082, "kl": 0.4899191651493311, "learning_rate": 9.999982582085029e-06, "loss": -0.0126, "step": 230, "step_time": 6.20570158999908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 775.0, "completions/max_terminated_length": 775.0, "completions/mean_length": 137.625, "completions/mean_terminated_length": 137.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.9574858248233795, "epoch": 0.00231, "frac_reward_zero_std": 0.25, "grad_norm": 0.12698481976985931, "kl": 1.0334630981087685, "learning_rate": 9.999982402056237e-06, "loss": -0.006, "num_tokens": 5131739.0, "reward": 0.9848486185073853, "reward_std": 1.060044288635254, "rewards/rollout_reward_func/mean": 0.9848486185073853, "rewards/rollout_reward_func/std": 1.3662832975387573, "sampling/importance_sampling_ratio/max": 0.545970618724823, "sampling/importance_sampling_ratio/mean": 0.34293973445892334, "sampling/importance_sampling_ratio/min": 2.8166079573566094e-05, "sampling/sampling_logp_difference/max": 2.473886013031006, "sampling/sampling_logp_difference/mean": 0.623577356338501, "step": 231, "step_time": 7.623007744005008 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.041666666977107525, "entropy": 3.9039868116378784, "epoch": 0.00232, "grad_norm": 0.10068481415510178, "kl": 1.0551066100597382, "learning_rate": 9.999982221101849e-06, "loss": -0.0065, "step": 232, "step_time": 4.154470723002305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1249.0, "completions/max_terminated_length": 1225.0, "completions/mean_length": 489.34375, "completions/mean_terminated_length": 464.83868408203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.612375617027283, "epoch": 0.00233, "frac_reward_zero_std": 0.0, "grad_norm": 0.04434690997004509, "kl": 0.6944773402065039, "learning_rate": 9.999982039221867e-06, "loss": -0.0046, "num_tokens": 5183200.0, "reward": -0.2882317006587982, "reward_std": 0.9046006202697754, "rewards/rollout_reward_func/mean": -0.2882317006587982, "rewards/rollout_reward_func/std": 1.307381272315979, "sampling/importance_sampling_ratio/max": 0.541497528553009, "sampling/importance_sampling_ratio/mean": 0.15168800950050354, "sampling/importance_sampling_ratio/min": 1.0289224306554365e-13, "sampling/sampling_logp_difference/max": 13.36201000213623, "sampling/sampling_logp_difference/mean": 1.1563018560409546, "step": 233, "step_time": 10.549277044003247 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 5.616968929767609, "epoch": 0.00234, "grad_norm": 0.05004633218050003, "kl": 0.6958358753472567, "learning_rate": 9.99998185641629e-06, "loss": -0.0047, "step": 234, "step_time": 5.857416997005203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 191.46875, "completions/mean_terminated_length": 191.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.529074162244797, "epoch": 0.00235, "frac_reward_zero_std": 0.0, "grad_norm": 0.12475937604904175, "kl": 0.8309877887368202, "learning_rate": 9.999981672685119e-06, "loss": -0.0108, "num_tokens": 5222252.0, "reward": 0.8408229351043701, "reward_std": 1.4573485851287842, "rewards/rollout_reward_func/mean": 0.8408229351043701, "rewards/rollout_reward_func/std": 1.405064344406128, "sampling/importance_sampling_ratio/max": 0.5894081592559814, "sampling/importance_sampling_ratio/mean": 0.30403995513916016, "sampling/importance_sampling_ratio/min": 1.3964246136310976e-05, "sampling/sampling_logp_difference/max": 4.4520769119262695, "sampling/sampling_logp_difference/mean": 0.924983561038971, "step": 235, "step_time": 8.495959868007048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 4.558802127838135, "epoch": 0.00236, "grad_norm": 0.12360981106758118, "kl": 0.8235709145665169, "learning_rate": 9.999981488028352e-06, "loss": -0.011, "step": 236, "step_time": 4.671992730003694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 114.90625, "completions/mean_terminated_length": 114.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.731397807598114, "epoch": 0.00237, "frac_reward_zero_std": 0.5, "grad_norm": 0.009464557282626629, "kl": 0.885832317173481, "learning_rate": 9.99998130244599e-06, "loss": -0.0088, "num_tokens": 5256009.0, "reward": 1.8623074293136597, "reward_std": 0.6048402786254883, "rewards/rollout_reward_func/mean": 1.8623074293136597, "rewards/rollout_reward_func/std": 0.8462442755699158, "sampling/importance_sampling_ratio/max": 0.5521999001502991, "sampling/importance_sampling_ratio/mean": 0.4241643249988556, "sampling/importance_sampling_ratio/min": 1.0619122804200742e-05, "sampling/sampling_logp_difference/max": 2.9092791080474854, "sampling/sampling_logp_difference/mean": 0.5986289381980896, "step": 237, "step_time": 7.88448001200959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.745842933654785, "epoch": 0.00238, "grad_norm": 0.010348273441195488, "kl": 0.8854806199669838, "learning_rate": 9.999981115938033e-06, "loss": -0.0089, "step": 238, "step_time": 4.432505930002662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1580.0, "completions/max_terminated_length": 1580.0, "completions/mean_length": 378.1875, "completions/mean_terminated_length": 378.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.7280014753341675, "epoch": 0.00239, "frac_reward_zero_std": 0.5, "grad_norm": 0.00894120056182146, "kl": 0.8182286322116852, "learning_rate": 9.999980928504482e-06, "loss": -0.0024, "num_tokens": 5301541.0, "reward": 2.11077618598938, "reward_std": 0.7055091857910156, "rewards/rollout_reward_func/mean": 2.11077618598938, "rewards/rollout_reward_func/std": 0.9574822187423706, "sampling/importance_sampling_ratio/max": 0.5548072457313538, "sampling/importance_sampling_ratio/mean": 0.3407219648361206, "sampling/importance_sampling_ratio/min": 1.8149918901144702e-07, "sampling/sampling_logp_difference/max": 3.8600528240203857, "sampling/sampling_logp_difference/mean": 0.5394055247306824, "step": 239, "step_time": 11.205350068994449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.7417226433753967, "epoch": 0.0024, "grad_norm": 0.009561389684677124, "kl": 0.8173454850912094, "learning_rate": 9.999980740145336e-06, "loss": -0.0024, "step": 240, "step_time": 6.808408495991898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1254.0, "completions/max_terminated_length": 1254.0, "completions/mean_length": 539.21875, "completions/mean_terminated_length": 539.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.573432505130768, "epoch": 0.00241, "frac_reward_zero_std": 0.25, "grad_norm": 0.012819021008908749, "kl": 0.5639337822794914, "learning_rate": 9.999980550860597e-06, "loss": -0.0095, "num_tokens": 5353035.0, "reward": 1.2991387844085693, "reward_std": 0.9546839594841003, "rewards/rollout_reward_func/mean": 1.2991387844085693, "rewards/rollout_reward_func/std": 1.3624227046966553, "sampling/importance_sampling_ratio/max": 0.5470429062843323, "sampling/importance_sampling_ratio/mean": 0.1978936642408371, "sampling/importance_sampling_ratio/min": 7.513818900406477e-09, "sampling/sampling_logp_difference/max": 4.151134014129639, "sampling/sampling_logp_difference/mean": 1.0021544694900513, "step": 241, "step_time": 10.377613169996039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.595817565917969, "epoch": 0.00242, "grad_norm": 0.013016695156693459, "kl": 0.5644616149365902, "learning_rate": 9.999980360650262e-06, "loss": -0.0095, "step": 242, "step_time": 5.649274825998873 }, { "clip_ratio/high_max": 0.0031250000465661287, "clip_ratio/high_mean": 0.0015625000232830644, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007812500116415322, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 427.9375, "completions/mean_terminated_length": 427.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.946184456348419, "epoch": 0.00243, "frac_reward_zero_std": 0.0, "grad_norm": 0.04101103916764259, "kl": 0.5674281641840935, "learning_rate": 9.999980169514331e-06, "loss": -0.0191, "num_tokens": 5402205.0, "reward": 0.9123628735542297, "reward_std": 1.2553555965423584, "rewards/rollout_reward_func/mean": 0.9123628735542297, "rewards/rollout_reward_func/std": 1.5604585409164429, "sampling/importance_sampling_ratio/max": 0.5441422462463379, "sampling/importance_sampling_ratio/mean": 0.1684446930885315, "sampling/importance_sampling_ratio/min": 1.3805131571530249e-17, "sampling/sampling_logp_difference/max": 12.483779907226562, "sampling/sampling_logp_difference/mean": 1.265150785446167, "step": 243, "step_time": 9.389428485996177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.931067764759064, "epoch": 0.00244, "grad_norm": 0.03782140091061592, "kl": 0.568653816357255, "learning_rate": 9.999979977452809e-06, "loss": -0.0191, "step": 244, "step_time": 5.091417640003783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1454.0, "completions/max_terminated_length": 1454.0, "completions/mean_length": 309.5, "completions/mean_terminated_length": 309.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.268030285835266, "epoch": 0.00245, "frac_reward_zero_std": 0.0, "grad_norm": 0.035126857459545135, "kl": 0.6543568521738052, "learning_rate": 9.999979784465691e-06, "loss": -0.0075, "num_tokens": 5447086.0, "reward": 0.10018618404865265, "reward_std": 1.4139055013656616, "rewards/rollout_reward_func/mean": 0.10018618404865265, "rewards/rollout_reward_func/std": 1.549038052558899, "sampling/importance_sampling_ratio/max": 0.549534261226654, "sampling/importance_sampling_ratio/mean": 0.15007375180721283, "sampling/importance_sampling_ratio/min": 1.0673877580413826e-11, "sampling/sampling_logp_difference/max": 12.791465759277344, "sampling/sampling_logp_difference/mean": 1.3618029356002808, "step": 245, "step_time": 10.972323737001716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.017361111473292112, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017361111473292112, "entropy": 6.267749667167664, "epoch": 0.00246, "grad_norm": 0.0366402193903923, "kl": 0.6587365306913853, "learning_rate": 9.999979590552979e-06, "loss": -0.0074, "step": 246, "step_time": 5.788836454012198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1189.0, "completions/max_terminated_length": 1189.0, "completions/mean_length": 338.875, "completions/mean_terminated_length": 338.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.3120289742946625, "epoch": 0.00247, "frac_reward_zero_std": 0.25, "grad_norm": 0.07419715821743011, "kl": 0.5475977621972561, "learning_rate": 9.999979395714672e-06, "loss": -0.0115, "num_tokens": 5492831.0, "reward": 0.8378713130950928, "reward_std": 1.4228227138519287, "rewards/rollout_reward_func/mean": 0.8378713130950928, "rewards/rollout_reward_func/std": 1.7232357263565063, "sampling/importance_sampling_ratio/max": 0.5523884892463684, "sampling/importance_sampling_ratio/mean": 0.2161490023136139, "sampling/importance_sampling_ratio/min": 1.5790436691531795e-06, "sampling/sampling_logp_difference/max": 3.1956984996795654, "sampling/sampling_logp_difference/mean": 0.914116621017456, "step": 247, "step_time": 9.768871989002946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.05133928591385484, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05133928591385484, "entropy": 5.353802561759949, "epoch": 0.00248, "grad_norm": 0.03258809447288513, "kl": 0.5662508085370064, "learning_rate": 9.99997919995077e-06, "loss": -0.0116, "step": 248, "step_time": 5.419113092000771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1591.0, "completions/max_terminated_length": 1591.0, "completions/mean_length": 535.46875, "completions/mean_terminated_length": 535.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.585465222597122, "epoch": 0.00249, "frac_reward_zero_std": 0.25, "grad_norm": 0.07143646478652954, "kl": 0.8058827221393585, "learning_rate": 9.999979003261275e-06, "loss": -0.0048, "num_tokens": 5544306.0, "reward": 1.3900892734527588, "reward_std": 1.2539846897125244, "rewards/rollout_reward_func/mean": 1.3900892734527588, "rewards/rollout_reward_func/std": 1.5711948871612549, "sampling/importance_sampling_ratio/max": 0.5490850210189819, "sampling/importance_sampling_ratio/mean": 0.19279485940933228, "sampling/importance_sampling_ratio/min": 6.672095466181069e-15, "sampling/sampling_logp_difference/max": 4.201253414154053, "sampling/sampling_logp_difference/mean": 0.886339545249939, "step": 249, "step_time": 11.475145686996257 }, { "clip_ratio/high_max": 0.028125000186264515, "clip_ratio/high_mean": 0.014062500093132257, "clip_ratio/low_mean": 0.0018939394503831863, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015956439543515444, "entropy": 4.601656973361969, "epoch": 0.0025, "grad_norm": 0.022828884422779083, "kl": 0.7720931172370911, "learning_rate": 9.999978805646186e-06, "loss": -0.0049, "step": 250, "step_time": 6.212054643998272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 236.40625, "completions/mean_terminated_length": 236.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.438541233539581, "epoch": 0.00251, "frac_reward_zero_std": 0.25, "grad_norm": 0.09089355170726776, "kl": 0.8382527865469456, "learning_rate": 9.999978607105502e-06, "loss": -0.0156, "num_tokens": 5587311.0, "reward": 1.799784779548645, "reward_std": 1.0079389810562134, "rewards/rollout_reward_func/mean": 1.799784779548645, "rewards/rollout_reward_func/std": 1.317986249923706, "sampling/importance_sampling_ratio/max": 0.5549689531326294, "sampling/importance_sampling_ratio/mean": 0.27160927653312683, "sampling/importance_sampling_ratio/min": 0.00011695075227180496, "sampling/sampling_logp_difference/max": 3.1508493423461914, "sampling/sampling_logp_difference/mean": 0.7775396108627319, "step": 251, "step_time": 8.739903960999072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.438876926898956, "epoch": 0.00252, "grad_norm": 0.05666644498705864, "kl": 0.9028201550245285, "learning_rate": 9.999978407639225e-06, "loss": -0.0159, "step": 252, "step_time": 3.9671894410021196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 330.125, "completions/mean_terminated_length": 330.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.245748221874237, "epoch": 0.00253, "frac_reward_zero_std": 0.0, "grad_norm": 0.21459022164344788, "kl": 0.5521058905869722, "learning_rate": 9.999978207247353e-06, "loss": -0.0159, "num_tokens": 5634849.0, "reward": 1.1230366230010986, "reward_std": 1.6680563688278198, "rewards/rollout_reward_func/mean": 1.1230366230010986, "rewards/rollout_reward_func/std": 1.7864972352981567, "sampling/importance_sampling_ratio/max": 0.5126504302024841, "sampling/importance_sampling_ratio/mean": 0.1693393886089325, "sampling/importance_sampling_ratio/min": 2.9900198228460795e-07, "sampling/sampling_logp_difference/max": 3.285916566848755, "sampling/sampling_logp_difference/mean": 0.956021785736084, "step": 253, "step_time": 8.51095527299185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.08177083358168602, "clip_ratio/low_min": 0.012500000186264515, "clip_ratio/region_mean": 0.08177083358168602, "entropy": 5.38451224565506, "epoch": 0.00254, "grad_norm": 0.07143314927816391, "kl": 0.5289687672629952, "learning_rate": 9.999978005929887e-06, "loss": -0.0169, "step": 254, "step_time": 4.549656752995361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 193.40625, "completions/mean_terminated_length": 193.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.596668183803558, "epoch": 0.00255, "frac_reward_zero_std": 0.5, "grad_norm": 0.0202134121209383, "kl": 0.7335504740476608, "learning_rate": 9.999977803686829e-06, "loss": -0.0147, "num_tokens": 5674863.0, "reward": 1.2302132844924927, "reward_std": 0.8281392455101013, "rewards/rollout_reward_func/mean": 1.2302132844924927, "rewards/rollout_reward_func/std": 1.4735708236694336, "sampling/importance_sampling_ratio/max": 0.5564664006233215, "sampling/importance_sampling_ratio/mean": 0.34728801250457764, "sampling/importance_sampling_ratio/min": 1.194048127217684e-05, "sampling/sampling_logp_difference/max": 3.0767533779144287, "sampling/sampling_logp_difference/mean": 0.8657125234603882, "step": 255, "step_time": 8.184867733001738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.654693067073822, "epoch": 0.00256, "grad_norm": 0.021653607487678528, "kl": 0.7288693785667419, "learning_rate": 9.999977600518175e-06, "loss": -0.0147, "step": 256, "step_time": 5.096193117002258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1060.0, "completions/max_terminated_length": 1060.0, "completions/mean_length": 364.0, "completions/mean_terminated_length": 364.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.217250406742096, "epoch": 0.00257, "frac_reward_zero_std": 0.0, "grad_norm": 0.027111167088150978, "kl": 0.5516441464424133, "learning_rate": 9.999977396423928e-06, "loss": -0.0169, "num_tokens": 5723235.0, "reward": 1.3930718898773193, "reward_std": 1.7314631938934326, "rewards/rollout_reward_func/mean": 1.3930718898773193, "rewards/rollout_reward_func/std": 1.8730411529541016, "sampling/importance_sampling_ratio/max": 0.5538252592086792, "sampling/importance_sampling_ratio/mean": 0.17951169610023499, "sampling/importance_sampling_ratio/min": 3.4787142610959165e-10, "sampling/sampling_logp_difference/max": 3.249612331390381, "sampling/sampling_logp_difference/mean": 0.9927442073822021, "step": 257, "step_time": 9.687378393002291 }, { "clip_ratio/high_max": 0.02291666716337204, "clip_ratio/high_mean": 0.01145833358168602, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01145833358168602, "entropy": 5.220255643129349, "epoch": 0.00258, "grad_norm": 0.021305294707417488, "kl": 0.5442744232714176, "learning_rate": 9.999977191404087e-06, "loss": -0.0169, "step": 258, "step_time": 5.0625553950048925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 290.5625, "completions/mean_terminated_length": 291.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.110257387161255, "epoch": 0.00259, "frac_reward_zero_std": 0.0, "grad_norm": 0.12540705502033234, "kl": 0.47995273023843765, "learning_rate": 9.999976985458653e-06, "loss": -0.0193, "num_tokens": 5766295.0, "reward": 0.7373366951942444, "reward_std": 1.8300553560256958, "rewards/rollout_reward_func/mean": 0.7373366951942444, "rewards/rollout_reward_func/std": 1.917830228805542, "sampling/importance_sampling_ratio/max": 0.5540647506713867, "sampling/importance_sampling_ratio/mean": 0.21796250343322754, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.423358917236328, "sampling/sampling_logp_difference/mean": 1.2384378910064697, "step": 259, "step_time": 9.647882035991643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 6.102553606033325, "epoch": 0.0026, "grad_norm": 0.061675094068050385, "kl": 0.49196797236800194, "learning_rate": 9.999976778587625e-06, "loss": -0.0194, "step": 260, "step_time": 4.88231309099865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1604.0, "completions/max_terminated_length": 1604.0, "completions/mean_length": 703.46875, "completions/mean_terminated_length": 704.3870849609375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.775920420885086, "epoch": 0.00261, "frac_reward_zero_std": 0.0, "grad_norm": 0.0840325802564621, "kl": 0.5135885141789913, "learning_rate": 9.999976570791002e-06, "loss": -0.0119, "num_tokens": 5826010.0, "reward": 0.6883369088172913, "reward_std": 1.679599642753601, "rewards/rollout_reward_func/mean": 0.6883369088172913, "rewards/rollout_reward_func/std": 1.6870898008346558, "sampling/importance_sampling_ratio/max": 0.5445951223373413, "sampling/importance_sampling_ratio/mean": 0.14454951882362366, "sampling/importance_sampling_ratio/min": 2.6596421438154894e-22, "sampling/sampling_logp_difference/max": 3.758467674255371, "sampling/sampling_logp_difference/mean": 0.9267882108688354, "step": 261, "step_time": 11.985538320008345 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 4.746225833892822, "epoch": 0.00262, "grad_norm": 0.027954403311014175, "kl": 0.5119652822613716, "learning_rate": 9.999976362068785e-06, "loss": -0.0121, "step": 262, "step_time": 6.582948200990359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 970.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 329.71875, "completions/mean_terminated_length": 316.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.974146246910095, "epoch": 0.00263, "frac_reward_zero_std": 0.0, "grad_norm": 0.03937311843037605, "kl": 0.44061456993222237, "learning_rate": 9.999976152420979e-06, "loss": -0.0208, "num_tokens": 5871541.0, "reward": 0.6597141623497009, "reward_std": 2.2069664001464844, "rewards/rollout_reward_func/mean": 0.6597141623497009, "rewards/rollout_reward_func/std": 2.167710304260254, "sampling/importance_sampling_ratio/max": 0.5548219680786133, "sampling/importance_sampling_ratio/mean": 0.1518714725971222, "sampling/importance_sampling_ratio/min": 1.401298464324817e-45, "sampling/sampling_logp_difference/max": 12.188826560974121, "sampling/sampling_logp_difference/mean": 1.3599529266357422, "step": 263, "step_time": 9.88746423300472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.948187381029129, "epoch": 0.00264, "grad_norm": 0.030596034601330757, "kl": 0.44162424467504025, "learning_rate": 9.999975941847575e-06, "loss": -0.0209, "step": 264, "step_time": 4.844855440998799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.03125, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 292.46875, "completions/mean_terminated_length": 301.3870849609375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.390927314758301, "epoch": 0.00265, "frac_reward_zero_std": 0.0, "grad_norm": 0.10873399674892426, "kl": 0.5315260328352451, "learning_rate": 9.99997573034858e-06, "loss": -0.0182, "num_tokens": 5916630.0, "reward": 0.3705350458621979, "reward_std": 1.3415186405181885, "rewards/rollout_reward_func/mean": 0.3705350458621979, "rewards/rollout_reward_func/std": 1.5195752382278442, "sampling/importance_sampling_ratio/max": 0.5556864738464355, "sampling/importance_sampling_ratio/mean": 0.2159016877412796, "sampling/importance_sampling_ratio/min": 2.5449415920165774e-17, "sampling/sampling_logp_difference/max": 4.513976097106934, "sampling/sampling_logp_difference/mean": 1.138096809387207, "step": 265, "step_time": 8.078170474000217 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01875000004656613, "entropy": 5.379692167043686, "epoch": 0.00266, "grad_norm": 0.013346758671104908, "kl": 0.523842791095376, "learning_rate": 9.99997551792399e-06, "loss": -0.0184, "step": 266, "step_time": 4.209680891006428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1402.0, "completions/max_terminated_length": 1402.0, "completions/mean_length": 622.28125, "completions/mean_terminated_length": 641.8386840820312, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.25550639629364, "epoch": 0.00267, "frac_reward_zero_std": 0.0, "grad_norm": 0.007510357070714235, "kl": 0.5115852113813162, "learning_rate": 9.999975304573807e-06, "loss": -0.018, "num_tokens": 5971935.0, "reward": 1.1960020065307617, "reward_std": 1.6189783811569214, "rewards/rollout_reward_func/mean": 1.1960020065307617, "rewards/rollout_reward_func/std": 1.6217368841171265, "sampling/importance_sampling_ratio/max": 0.5539007782936096, "sampling/importance_sampling_ratio/mean": 0.18193113803863525, "sampling/importance_sampling_ratio/min": 5.994218793201056e-13, "sampling/sampling_logp_difference/max": 4.410063743591309, "sampling/sampling_logp_difference/mean": 1.0289497375488281, "step": 267, "step_time": 11.03051860999767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.24220597743988, "epoch": 0.00268, "grad_norm": 0.007320808246731758, "kl": 0.5128885265439749, "learning_rate": 9.999975090298031e-06, "loss": -0.018, "step": 268, "step_time": 6.749844891004614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 173.5, "completions/mean_terminated_length": 173.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.260665327310562, "epoch": 0.00269, "frac_reward_zero_std": 0.5, "grad_norm": 0.010008231736719608, "kl": 0.6009052582085133, "learning_rate": 9.999974875096663e-06, "loss": -0.0055, "num_tokens": 6010089.0, "reward": 1.3615994453430176, "reward_std": 0.7384592294692993, "rewards/rollout_reward_func/mean": 1.3615994453430176, "rewards/rollout_reward_func/std": 1.1898616552352905, "sampling/importance_sampling_ratio/max": 0.5572834610939026, "sampling/importance_sampling_ratio/mean": 0.3548404276371002, "sampling/importance_sampling_ratio/min": 7.856227890457035e-10, "sampling/sampling_logp_difference/max": 12.717487335205078, "sampling/sampling_logp_difference/mean": 0.8492914438247681, "step": 269, "step_time": 8.613326919996325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.240057200193405, "epoch": 0.0027, "grad_norm": 0.009651689790189266, "kl": 0.6060072630643845, "learning_rate": 9.999974658969701e-06, "loss": -0.0055, "step": 270, "step_time": 4.804270737997285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 1294.0, "completions/max_terminated_length": 1294.0, "completions/mean_length": 215.0, "completions/mean_terminated_length": 215.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.712741643190384, "epoch": 0.00271, "frac_reward_zero_std": 0.0, "grad_norm": 0.12202586978673935, "kl": 0.6579699218273163, "learning_rate": 9.999974441917146e-06, "loss": -0.0164, "num_tokens": 6048096.0, "reward": 0.9251763820648193, "reward_std": 1.3221745491027832, "rewards/rollout_reward_func/mean": 0.9251763820648193, "rewards/rollout_reward_func/std": 1.7914643287658691, "sampling/importance_sampling_ratio/max": 0.628620445728302, "sampling/importance_sampling_ratio/mean": 0.31804242730140686, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.743544101715088, "sampling/sampling_logp_difference/mean": 0.9517822265625, "step": 271, "step_time": 10.459652851004648 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 4.699569970369339, "epoch": 0.00272, "grad_norm": 0.050271835178136826, "kl": 0.6653461121022701, "learning_rate": 9.999974223938997e-06, "loss": -0.0164, "step": 272, "step_time": 5.381695145006233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1644.0, "completions/max_terminated_length": 1644.0, "completions/mean_length": 222.625, "completions/mean_terminated_length": 212.60000610351562, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.537850737571716, "epoch": 0.00273, "frac_reward_zero_std": 0.25, "grad_norm": 0.025900790467858315, "kl": 0.46170250326395035, "learning_rate": 9.999974005035256e-06, "loss": -0.0134, "num_tokens": 6090420.0, "reward": 1.1363991498947144, "reward_std": 1.302805781364441, "rewards/rollout_reward_func/mean": 1.1363991498947144, "rewards/rollout_reward_func/std": 1.775572419166565, "sampling/importance_sampling_ratio/max": 0.5540878772735596, "sampling/importance_sampling_ratio/mean": 0.22350946068763733, "sampling/importance_sampling_ratio/min": 2.3474109670384123e-31, "sampling/sampling_logp_difference/max": 12.099420547485352, "sampling/sampling_logp_difference/mean": 1.113742709159851, "step": 273, "step_time": 11.207548408994626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.529681026935577, "epoch": 0.00274, "grad_norm": 0.02142815850675106, "kl": 0.46083664149045944, "learning_rate": 9.999973785205922e-06, "loss": -0.0134, "step": 274, "step_time": 6.789721639997879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1783.0, "completions/max_terminated_length": 1783.0, "completions/mean_length": 463.15625, "completions/mean_terminated_length": 463.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.464354574680328, "epoch": 0.00275, "frac_reward_zero_std": 0.0, "grad_norm": 0.05532584711909294, "kl": 0.555010475218296, "learning_rate": 9.999973564450996e-06, "loss": -0.0185, "num_tokens": 6140743.0, "reward": 0.9489171504974365, "reward_std": 1.7605822086334229, "rewards/rollout_reward_func/mean": 0.9489171504974365, "rewards/rollout_reward_func/std": 1.866188406944275, "sampling/importance_sampling_ratio/max": 0.5563283562660217, "sampling/importance_sampling_ratio/mean": 0.17405261099338531, "sampling/importance_sampling_ratio/min": 2.7791298151669253e-09, "sampling/sampling_logp_difference/max": 11.386764526367188, "sampling/sampling_logp_difference/mean": 0.8077976703643799, "step": 275, "step_time": 11.626647664998018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 4.466660022735596, "epoch": 0.00276, "grad_norm": 0.05215058475732803, "kl": 0.552830021828413, "learning_rate": 9.999973342770475e-06, "loss": -0.0185, "step": 276, "step_time": 6.5427133070006676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 1775.0, "completions/max_terminated_length": 1775.0, "completions/mean_length": 449.09375, "completions/mean_terminated_length": 449.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.621954560279846, "epoch": 0.00277, "frac_reward_zero_std": 0.0, "grad_norm": 0.015434307977557182, "kl": 0.6753201335668564, "learning_rate": 9.999973120164363e-06, "loss": -0.024, "num_tokens": 6190769.0, "reward": 1.0930795669555664, "reward_std": 1.4873120784759521, "rewards/rollout_reward_func/mean": 1.0930795669555664, "rewards/rollout_reward_func/std": 1.6803134679794312, "sampling/importance_sampling_ratio/max": 0.5468757152557373, "sampling/importance_sampling_ratio/mean": 0.21574926376342773, "sampling/importance_sampling_ratio/min": 5.221087264073866e-16, "sampling/sampling_logp_difference/max": 3.760045289993286, "sampling/sampling_logp_difference/mean": 1.1791489124298096, "step": 277, "step_time": 11.429077950997453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 5.616360008716583, "epoch": 0.00278, "grad_norm": 0.015992892906069756, "kl": 0.6813095919787884, "learning_rate": 9.999972896632658e-06, "loss": -0.024, "step": 278, "step_time": 6.496287210997252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 209.6875, "completions/mean_terminated_length": 209.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.969407320022583, "epoch": 0.00279, "frac_reward_zero_std": 0.25, "grad_norm": 0.03682897984981537, "kl": 0.7599479742348194, "learning_rate": 9.99997267217536e-06, "loss": -0.0136, "num_tokens": 6231004.0, "reward": 1.1985588073730469, "reward_std": 0.9039798974990845, "rewards/rollout_reward_func/mean": 1.1985588073730469, "rewards/rollout_reward_func/std": 1.3622491359710693, "sampling/importance_sampling_ratio/max": 0.556544840335846, "sampling/importance_sampling_ratio/mean": 0.27690160274505615, "sampling/importance_sampling_ratio/min": 3.4224550463476033e-15, "sampling/sampling_logp_difference/max": 3.282242774963379, "sampling/sampling_logp_difference/mean": 0.8963993191719055, "step": 279, "step_time": 8.657592195992038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.950895547866821, "epoch": 0.0028, "grad_norm": 0.03881290182471275, "kl": 0.7628546617925167, "learning_rate": 9.999972446792469e-06, "loss": -0.0136, "step": 280, "step_time": 5.293070211995655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 381.09375, "completions/mean_terminated_length": 381.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.831907033920288, "epoch": 0.00281, "frac_reward_zero_std": 0.0, "grad_norm": 0.08941048383712769, "kl": 0.4995316043496132, "learning_rate": 9.999972220483987e-06, "loss": -0.0181, "num_tokens": 6279355.0, "reward": 0.7892543077468872, "reward_std": 1.3825602531433105, "rewards/rollout_reward_func/mean": 0.7892543077468872, "rewards/rollout_reward_func/std": 1.569254994392395, "sampling/importance_sampling_ratio/max": 0.5305212140083313, "sampling/importance_sampling_ratio/mean": 0.17129355669021606, "sampling/importance_sampling_ratio/min": 2.489778125891462e-05, "sampling/sampling_logp_difference/max": 3.3504021167755127, "sampling/sampling_logp_difference/mean": 1.124851942062378, "step": 281, "step_time": 9.005066652000096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 5.83450174331665, "epoch": 0.00282, "grad_norm": 0.02345663495361805, "kl": 0.5036344090476632, "learning_rate": 9.99997199324991e-06, "loss": -0.0183, "step": 282, "step_time": 4.878066226992814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1565.0, "completions/max_terminated_length": 1565.0, "completions/mean_length": 405.03125, "completions/mean_terminated_length": 408.06451416015625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.731111109256744, "epoch": 0.00283, "frac_reward_zero_std": 0.25, "grad_norm": 0.014740549959242344, "kl": 0.48140970058739185, "learning_rate": 9.999971765090241e-06, "loss": -0.0102, "num_tokens": 6327741.0, "reward": 1.1944098472595215, "reward_std": 1.2027044296264648, "rewards/rollout_reward_func/mean": 1.1944098472595215, "rewards/rollout_reward_func/std": 1.4883382320404053, "sampling/importance_sampling_ratio/max": 0.5597649216651917, "sampling/importance_sampling_ratio/mean": 0.1889469176530838, "sampling/importance_sampling_ratio/min": 4.271918025248458e-18, "sampling/sampling_logp_difference/max": 12.034357070922852, "sampling/sampling_logp_difference/mean": 1.3506207466125488, "step": 283, "step_time": 10.59948500399696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.716038942337036, "epoch": 0.00284, "grad_norm": 0.01571914553642273, "kl": 0.4831842752173543, "learning_rate": 9.999971536004981e-06, "loss": -0.0103, "step": 284, "step_time": 5.927785894000408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1570.0, "completions/max_terminated_length": 1570.0, "completions/mean_length": 421.6875, "completions/mean_terminated_length": 416.8709411621094, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.8653215169906616, "epoch": 0.00285, "frac_reward_zero_std": 0.5, "grad_norm": 0.01022281963378191, "kl": 0.7120369374752045, "learning_rate": 9.99997130599413e-06, "loss": -0.005, "num_tokens": 6374924.0, "reward": 1.3619346618652344, "reward_std": 0.7215983867645264, "rewards/rollout_reward_func/mean": 1.3619346618652344, "rewards/rollout_reward_func/std": 1.2023857831954956, "sampling/importance_sampling_ratio/max": 0.7566177248954773, "sampling/importance_sampling_ratio/mean": 0.3355520963668823, "sampling/importance_sampling_ratio/min": 8.352109207137837e-20, "sampling/sampling_logp_difference/max": 11.951231002807617, "sampling/sampling_logp_difference/mean": 0.917165994644165, "step": 285, "step_time": 11.12018987299598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.868892401456833, "epoch": 0.00286, "grad_norm": 0.009909704327583313, "kl": 0.710716363042593, "learning_rate": 9.999971075057683e-06, "loss": -0.005, "step": 286, "step_time": 6.314698619004048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1537.0, "completions/max_terminated_length": 1537.0, "completions/mean_length": 750.625, "completions/mean_terminated_length": 750.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.3167635798454285, "epoch": 0.00287, "frac_reward_zero_std": 0.0, "grad_norm": 0.01070867758244276, "kl": 0.522741787135601, "learning_rate": 9.999970843195648e-06, "loss": -0.0113, "num_tokens": 6436716.0, "reward": 0.7927876710891724, "reward_std": 1.4249576330184937, "rewards/rollout_reward_func/mean": 0.7927876710891724, "rewards/rollout_reward_func/std": 1.8356372117996216, "sampling/importance_sampling_ratio/max": 0.3052220046520233, "sampling/importance_sampling_ratio/mean": 0.10947702825069427, "sampling/importance_sampling_ratio/min": 4.589710373436098e-19, "sampling/sampling_logp_difference/max": 13.633888244628906, "sampling/sampling_logp_difference/mean": 1.1098703145980835, "step": 287, "step_time": 11.723423002993513 }, { "clip_ratio/high_max": 0.005434782709926367, "clip_ratio/high_mean": 0.0027173913549631834, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027173913549631834, "entropy": 5.3144795298576355, "epoch": 0.00288, "grad_norm": 0.010284801945090294, "kl": 0.5240667965263128, "learning_rate": 9.999970610408019e-06, "loss": -0.0113, "step": 288, "step_time": 6.230125784997654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1382.0, "completions/max_terminated_length": 1382.0, "completions/mean_length": 399.21875, "completions/mean_terminated_length": 382.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.28752937912941, "epoch": 0.00289, "frac_reward_zero_std": 0.0, "grad_norm": 0.02223776839673519, "kl": 0.39546493627130985, "learning_rate": 9.999970376694797e-06, "loss": -0.0133, "num_tokens": 6485605.0, "reward": 0.528472363948822, "reward_std": 1.3387869596481323, "rewards/rollout_reward_func/mean": 0.528472363948822, "rewards/rollout_reward_func/std": 1.648099422454834, "sampling/importance_sampling_ratio/max": 0.5564911365509033, "sampling/importance_sampling_ratio/mean": 0.17778874933719635, "sampling/importance_sampling_ratio/min": 2.2026918955475594e-08, "sampling/sampling_logp_difference/max": 3.6263933181762695, "sampling/sampling_logp_difference/mean": 1.221440315246582, "step": 289, "step_time": 10.321286787002464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.278927028179169, "epoch": 0.0029, "grad_norm": 0.014874952845275402, "kl": 0.39385264553129673, "learning_rate": 9.999970142055984e-06, "loss": -0.0133, "step": 290, "step_time": 5.726180590001604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 1177.0, "completions/max_terminated_length": 1177.0, "completions/mean_length": 405.375, "completions/mean_terminated_length": 405.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.730392724275589, "epoch": 0.00291, "frac_reward_zero_std": 0.0, "grad_norm": 0.019086096435785294, "kl": 0.45476343762129545, "learning_rate": 9.999969906491578e-06, "loss": -0.0166, "num_tokens": 6533994.0, "reward": 1.178751826286316, "reward_std": 1.4918336868286133, "rewards/rollout_reward_func/mean": 1.178751826286316, "rewards/rollout_reward_func/std": 1.5628931522369385, "sampling/importance_sampling_ratio/max": 0.5597690939903259, "sampling/importance_sampling_ratio/mean": 0.18025796115398407, "sampling/importance_sampling_ratio/min": 2.4774422097140203e-11, "sampling/sampling_logp_difference/max": 12.671557426452637, "sampling/sampling_logp_difference/mean": 1.138835072517395, "step": 291, "step_time": 10.542231890995026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "entropy": 5.722571104764938, "epoch": 0.00292, "grad_norm": 0.02182145230472088, "kl": 0.45356249064207077, "learning_rate": 9.99996967000158e-06, "loss": -0.0166, "step": 292, "step_time": 5.888602871997136 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 156.625, "completions/mean_terminated_length": 156.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.3578342497348785, "epoch": 0.00293, "frac_reward_zero_std": 0.25, "grad_norm": 0.03377014398574829, "kl": 0.4793526418507099, "learning_rate": 9.999969432585992e-06, "loss": -0.0148, "num_tokens": 6572409.0, "reward": 1.1088016033172607, "reward_std": 1.1098366975784302, "rewards/rollout_reward_func/mean": 1.1088016033172607, "rewards/rollout_reward_func/std": 1.4151866436004639, "sampling/importance_sampling_ratio/max": 0.558397650718689, "sampling/importance_sampling_ratio/mean": 0.26251572370529175, "sampling/importance_sampling_ratio/min": 2.1556381398113444e-05, "sampling/sampling_logp_difference/max": 2.913980007171631, "sampling/sampling_logp_difference/mean": 1.0271241664886475, "step": 293, "step_time": 7.41253353299544 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 5.354303359985352, "epoch": 0.00294, "grad_norm": 0.03666205704212189, "kl": 0.4809299036860466, "learning_rate": 9.99996919424481e-06, "loss": -0.0147, "step": 294, "step_time": 4.0810803700114775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.53125, "completions/mean_terminated_length": 2.096774101257324, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.0188043415546417, "epoch": 0.00295, "frac_reward_zero_std": 0.5, "grad_norm": 0.007991176098585129, "kl": 1.0024083778262138, "learning_rate": 9.999968954978038e-06, "loss": -0.0089, "num_tokens": 6599597.0, "reward": 1.807760238647461, "reward_std": 0.5303300619125366, "rewards/rollout_reward_func/mean": 1.807760238647461, "rewards/rollout_reward_func/std": 0.7365753650665283, "sampling/importance_sampling_ratio/max": 0.5582232475280762, "sampling/importance_sampling_ratio/mean": 0.49446478486061096, "sampling/importance_sampling_ratio/min": 2.9260306178002793e-07, "sampling/sampling_logp_difference/max": 3.9029107093811035, "sampling/sampling_logp_difference/mean": 0.5183051824569702, "step": 295, "step_time": 5.986620888001198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.0177638232707977, "epoch": 0.00296, "grad_norm": 0.006651030387729406, "kl": 0.9994005635380745, "learning_rate": 9.999968714785673e-06, "loss": -0.0089, "step": 296, "step_time": 3.600524565001251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1303.0, "completions/max_terminated_length": 1303.0, "completions/mean_length": 422.28125, "completions/mean_terminated_length": 422.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.136559069156647, "epoch": 0.00297, "frac_reward_zero_std": 0.5, "grad_norm": 0.00883534923195839, "kl": 0.7513014078140259, "learning_rate": 9.999968473667719e-06, "loss": -0.01, "num_tokens": 6648715.0, "reward": 1.790665864944458, "reward_std": 0.9442904591560364, "rewards/rollout_reward_func/mean": 1.790665864944458, "rewards/rollout_reward_func/std": 1.5476086139678955, "sampling/importance_sampling_ratio/max": 0.5553457736968994, "sampling/importance_sampling_ratio/mean": 0.26676225662231445, "sampling/importance_sampling_ratio/min": 3.487513617453261e-27, "sampling/sampling_logp_difference/max": 5.288309574127197, "sampling/sampling_logp_difference/mean": 0.8581823706626892, "step": 297, "step_time": 11.114017654002964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.131960690021515, "epoch": 0.00298, "grad_norm": 0.009366623125970364, "kl": 0.7525698430836201, "learning_rate": 9.99996823162417e-06, "loss": -0.01, "step": 298, "step_time": 5.535496957003488 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 195.875, "completions/mean_terminated_length": 195.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.02237856388092, "epoch": 0.00299, "frac_reward_zero_std": 0.5, "grad_norm": 0.270797997713089, "kl": 1.2440642565488815, "learning_rate": 9.99996798865503e-06, "loss": -0.006, "num_tokens": 6687938.0, "reward": 2.0902881622314453, "reward_std": 0.7113924026489258, "rewards/rollout_reward_func/mean": 2.0902881622314453, "rewards/rollout_reward_func/std": 0.977899968624115, "sampling/importance_sampling_ratio/max": 0.559758186340332, "sampling/importance_sampling_ratio/mean": 0.35151946544647217, "sampling/importance_sampling_ratio/min": 8.146806180775457e-07, "sampling/sampling_logp_difference/max": 3.371581792831421, "sampling/sampling_logp_difference/mean": 0.6483873128890991, "step": 299, "step_time": 7.582120315993961 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 4.024043023586273, "epoch": 0.003, "grad_norm": 0.02313867211341858, "kl": 0.760798990726471, "learning_rate": 9.9999677447603e-06, "loss": -0.0072, "step": 300, "step_time": 4.086266507998516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 1335.0, "completions/max_terminated_length": 1335.0, "completions/mean_length": 369.59375, "completions/mean_terminated_length": 369.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.220919579267502, "epoch": 0.00301, "frac_reward_zero_std": 0.0, "grad_norm": 0.1776433289051056, "kl": 0.6563911605626345, "learning_rate": 9.99996749993998e-06, "loss": -0.0131, "num_tokens": 6737143.0, "reward": 0.2807167172431946, "reward_std": 1.0643982887268066, "rewards/rollout_reward_func/mean": 0.2807167172431946, "rewards/rollout_reward_func/std": 1.5161982774734497, "sampling/importance_sampling_ratio/max": 0.5431978106498718, "sampling/importance_sampling_ratio/mean": 0.1818685233592987, "sampling/importance_sampling_ratio/min": 6.959063452447248e-14, "sampling/sampling_logp_difference/max": 4.8484883308410645, "sampling/sampling_logp_difference/mean": 0.9724471569061279, "step": 301, "step_time": 10.114671219998854 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.010416666744276881, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01822916674427688, "entropy": 5.328377902507782, "epoch": 0.00302, "grad_norm": 0.17009276151657104, "kl": 0.5558526050299406, "learning_rate": 9.999967254194065e-06, "loss": -0.0135, "step": 302, "step_time": 6.311877902993729 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 1144.0, "completions/max_terminated_length": 1144.0, "completions/mean_length": 154.0, "completions/mean_terminated_length": 154.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.769967168569565, "epoch": 0.00303, "frac_reward_zero_std": 0.5, "grad_norm": 0.040497977286577225, "kl": 0.8237781152129173, "learning_rate": 9.999967007522561e-06, "loss": -0.0083, "num_tokens": 6771195.0, "reward": 1.7112095355987549, "reward_std": 0.8213784694671631, "rewards/rollout_reward_func/mean": 1.7112095355987549, "rewards/rollout_reward_func/std": 1.2161858081817627, "sampling/importance_sampling_ratio/max": 0.5542251467704773, "sampling/importance_sampling_ratio/mean": 0.4098864793777466, "sampling/importance_sampling_ratio/min": 1.2557714512695384e-07, "sampling/sampling_logp_difference/max": 4.457819938659668, "sampling/sampling_logp_difference/mean": 0.7358884811401367, "step": 303, "step_time": 9.256168229996547 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 3.7927871644496918, "epoch": 0.00304, "grad_norm": 0.009646243415772915, "kl": 0.8080930449068546, "learning_rate": 9.999966759925464e-06, "loss": -0.0083, "step": 304, "step_time": 5.031052776004799 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 895.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 180.4375, "completions/mean_terminated_length": 180.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.6280277967453, "epoch": 0.00305, "frac_reward_zero_std": 0.25, "grad_norm": 0.06355079263448715, "kl": 0.6510318033397198, "learning_rate": 9.999966511402779e-06, "loss": -0.0123, "num_tokens": 6809089.0, "reward": 1.4762258529663086, "reward_std": 1.0179951190948486, "rewards/rollout_reward_func/mean": 1.4762258529663086, "rewards/rollout_reward_func/std": 1.1526970863342285, "sampling/importance_sampling_ratio/max": 0.5543712377548218, "sampling/importance_sampling_ratio/mean": 0.279407262802124, "sampling/importance_sampling_ratio/min": 6.165581726236269e-05, "sampling/sampling_logp_difference/max": 3.209479331970215, "sampling/sampling_logp_difference/mean": 0.7803548574447632, "step": 305, "step_time": 8.70296705799774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.687529474496841, "epoch": 0.00306, "grad_norm": 0.08397655189037323, "kl": 0.6446665525436401, "learning_rate": 9.9999662619545e-06, "loss": -0.0122, "step": 306, "step_time": 4.74860392499977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 324.28125, "completions/mean_terminated_length": 311.06451416015625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.77870437502861, "epoch": 0.00307, "frac_reward_zero_std": 0.0, "grad_norm": 0.06397123634815216, "kl": 0.4410424157977104, "learning_rate": 9.999966011580632e-06, "loss": -0.0084, "num_tokens": 6855187.0, "reward": 0.8447567224502563, "reward_std": 1.6190463304519653, "rewards/rollout_reward_func/mean": 0.8447567224502563, "rewards/rollout_reward_func/std": 1.8460187911987305, "sampling/importance_sampling_ratio/max": 0.5479478240013123, "sampling/importance_sampling_ratio/mean": 0.28577834367752075, "sampling/importance_sampling_ratio/min": 5.978653911026475e-40, "sampling/sampling_logp_difference/max": 10.45187759399414, "sampling/sampling_logp_difference/mean": 0.976198136806488, "step": 307, "step_time": 9.505350705992896 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 4.7936126589775085, "epoch": 0.00308, "grad_norm": 0.04586593434214592, "kl": 0.4414192233234644, "learning_rate": 9.999965760281171e-06, "loss": -0.0086, "step": 308, "step_time": 5.391104890997667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 255.78125, "completions/mean_terminated_length": 255.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.304022789001465, "epoch": 0.00309, "frac_reward_zero_std": 0.0, "grad_norm": 0.2705955505371094, "kl": 0.37042520102113485, "learning_rate": 9.999965508056122e-06, "loss": -0.0127, "num_tokens": 6898997.0, "reward": 1.0304327011108398, "reward_std": 1.3331693410873413, "rewards/rollout_reward_func/mean": 1.0304327011108398, "rewards/rollout_reward_func/std": 1.451238751411438, "sampling/importance_sampling_ratio/max": 0.5531238317489624, "sampling/importance_sampling_ratio/mean": 0.20151951909065247, "sampling/importance_sampling_ratio/min": 6.231956649571657e-05, "sampling/sampling_logp_difference/max": 3.1780643463134766, "sampling/sampling_logp_difference/mean": 1.2225866317749023, "step": 309, "step_time": 8.144619374994363 }, { "clip_ratio/high_max": 0.1041666679084301, "clip_ratio/high_mean": 0.05208333395421505, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06770833395421505, "entropy": 6.262114763259888, "epoch": 0.0031, "grad_norm": 0.10000497102737427, "kl": 0.37708108592778444, "learning_rate": 9.999965254905479e-06, "loss": -0.0137, "step": 310, "step_time": 4.131031762997736 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.0, "completions/max_length": 950.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 300.0625, "completions/mean_terminated_length": 300.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.133997023105621, "epoch": 0.00311, "frac_reward_zero_std": 0.25, "grad_norm": 0.04185439646244049, "kl": 0.340055376291275, "learning_rate": 9.999965000829247e-06, "loss": -0.009, "num_tokens": 6943156.0, "reward": 1.6527063846588135, "reward_std": 1.3919175863265991, "rewards/rollout_reward_func/mean": 1.6527063846588135, "rewards/rollout_reward_func/std": 1.580327033996582, "sampling/importance_sampling_ratio/max": 0.5505726933479309, "sampling/importance_sampling_ratio/mean": 0.21641206741333008, "sampling/importance_sampling_ratio/min": 2.0286968482332527e-15, "sampling/sampling_logp_difference/max": 3.980590343475342, "sampling/sampling_logp_difference/mean": 1.2781758308410645, "step": 311, "step_time": 9.168901405002543 }, { "clip_ratio/high_max": 0.024553571827709675, "clip_ratio/high_mean": 0.012276785913854837, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022693452890962362, "entropy": 6.118138134479523, "epoch": 0.00312, "grad_norm": 0.03280952200293541, "kl": 0.33732758089900017, "learning_rate": 9.999964745827424e-06, "loss": -0.0092, "step": 312, "step_time": 4.918340869000531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 303.875, "completions/mean_terminated_length": 303.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.953430473804474, "epoch": 0.00313, "frac_reward_zero_std": 0.25, "grad_norm": 0.021168513223528862, "kl": 0.5353014655411243, "learning_rate": 9.99996448990001e-06, "loss": -0.015, "num_tokens": 6987592.0, "reward": 1.420615792274475, "reward_std": 1.35750412940979, "rewards/rollout_reward_func/mean": 1.420615792274475, "rewards/rollout_reward_func/std": 1.5361449718475342, "sampling/importance_sampling_ratio/max": 0.5526061058044434, "sampling/importance_sampling_ratio/mean": 0.20590680837631226, "sampling/importance_sampling_ratio/min": 5.1785677968041455e-09, "sampling/sampling_logp_difference/max": 2.993556022644043, "sampling/sampling_logp_difference/mean": 1.1880321502685547, "step": 313, "step_time": 9.649698391000129 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 5.928788870573044, "epoch": 0.00314, "grad_norm": 0.030322240665555, "kl": 0.5411229580640793, "learning_rate": 9.999964233047006e-06, "loss": -0.015, "step": 314, "step_time": 5.40391489600006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1306.0, "completions/max_terminated_length": 1306.0, "completions/mean_length": 488.65625, "completions/mean_terminated_length": 448.20001220703125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.318760335445404, "epoch": 0.00315, "frac_reward_zero_std": 0.0, "grad_norm": 0.01492095086723566, "kl": 0.3996394304558635, "learning_rate": 9.999963975268412e-06, "loss": -0.0105, "num_tokens": 7039383.0, "reward": 0.9512027502059937, "reward_std": 1.9040058851242065, "rewards/rollout_reward_func/mean": 0.9512027502059937, "rewards/rollout_reward_func/std": 2.1555962562561035, "sampling/importance_sampling_ratio/max": 0.5339041352272034, "sampling/importance_sampling_ratio/mean": 0.11489511281251907, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.6916520595550537, "sampling/sampling_logp_difference/mean": 1.312880039215088, "step": 315, "step_time": 11.054486360993906 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 6.306908965110779, "epoch": 0.00316, "grad_norm": 0.014480828307569027, "kl": 0.4017223324626684, "learning_rate": 9.999963716564226e-06, "loss": -0.0105, "step": 316, "step_time": 5.627326956000616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1551.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 594.25, "completions/mean_terminated_length": 594.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.053450882434845, "epoch": 0.00317, "frac_reward_zero_std": 0.0, "grad_norm": 0.02585631050169468, "kl": 0.530362319201231, "learning_rate": 9.99996345693445e-06, "loss": -0.0107, "num_tokens": 7093254.0, "reward": 0.7550243139266968, "reward_std": 1.1198030710220337, "rewards/rollout_reward_func/mean": 0.7550243139266968, "rewards/rollout_reward_func/std": 1.2424007654190063, "sampling/importance_sampling_ratio/max": 0.557655930519104, "sampling/importance_sampling_ratio/mean": 0.15764719247817993, "sampling/importance_sampling_ratio/min": 4.9448495076376275e-08, "sampling/sampling_logp_difference/max": 4.302996635437012, "sampling/sampling_logp_difference/mean": 1.1439740657806396, "step": 317, "step_time": 11.350645698992594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.0217792093753815, "epoch": 0.00318, "grad_norm": 0.024944903329014778, "kl": 0.5322428867220879, "learning_rate": 9.999963196379084e-06, "loss": -0.0108, "step": 318, "step_time": 6.251373618997604 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 1850.0, "completions/max_terminated_length": 1850.0, "completions/mean_length": 290.71875, "completions/mean_terminated_length": 290.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.381722092628479, "epoch": 0.00319, "frac_reward_zero_std": 0.0, "grad_norm": 0.03665098547935486, "kl": 0.5847032368183136, "learning_rate": 9.999962934898128e-06, "loss": -0.0146, "num_tokens": 7137912.0, "reward": 0.7414301633834839, "reward_std": 1.5180798768997192, "rewards/rollout_reward_func/mean": 0.7414301633834839, "rewards/rollout_reward_func/std": 1.7848491668701172, "sampling/importance_sampling_ratio/max": 0.5554521083831787, "sampling/importance_sampling_ratio/mean": 0.255093514919281, "sampling/importance_sampling_ratio/min": 2.6006550835370665e-14, "sampling/sampling_logp_difference/max": 4.598544120788574, "sampling/sampling_logp_difference/mean": 1.0434049367904663, "step": 319, "step_time": 11.805362975999742 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 5.369244158267975, "epoch": 0.0032, "grad_norm": 0.03568027913570404, "kl": 0.5826235357671976, "learning_rate": 9.999962672491582e-06, "loss": -0.0147, "step": 320, "step_time": 7.133328291001817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1206.0, "completions/max_terminated_length": 1206.0, "completions/mean_length": 287.65625, "completions/mean_terminated_length": 269.7419128417969, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.552937090396881, "epoch": 0.00321, "frac_reward_zero_std": 0.0, "grad_norm": 0.10182564705610275, "kl": 0.6051398478448391, "learning_rate": 9.999962409159445e-06, "loss": -0.0069, "num_tokens": 7182473.0, "reward": 1.103133201599121, "reward_std": 1.5137407779693604, "rewards/rollout_reward_func/mean": 1.103133201599121, "rewards/rollout_reward_func/std": 1.7199842929840088, "sampling/importance_sampling_ratio/max": 0.5389597415924072, "sampling/importance_sampling_ratio/mean": 0.17337629199028015, "sampling/importance_sampling_ratio/min": 2.919462005479545e-08, "sampling/sampling_logp_difference/max": 3.955629587173462, "sampling/sampling_logp_difference/mean": 1.0253756046295166, "step": 321, "step_time": 9.912568465999357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 5.592137813568115, "epoch": 0.00322, "grad_norm": 0.10544880479574203, "kl": 0.5920405201613903, "learning_rate": 9.999962144901718e-06, "loss": -0.0073, "step": 322, "step_time": 5.42268697299005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1508.0, "completions/max_terminated_length": 1508.0, "completions/mean_length": 632.09375, "completions/mean_terminated_length": 632.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.213400453329086, "epoch": 0.00323, "frac_reward_zero_std": 0.25, "grad_norm": 0.030810672789812088, "kl": 0.48950042575597763, "learning_rate": 9.999961879718401e-06, "loss": -0.0059, "num_tokens": 7237136.0, "reward": 1.1013963222503662, "reward_std": 1.2529712915420532, "rewards/rollout_reward_func/mean": 1.1013963222503662, "rewards/rollout_reward_func/std": 1.5072904825210571, "sampling/importance_sampling_ratio/max": 0.5585710406303406, "sampling/importance_sampling_ratio/mean": 0.17565935850143433, "sampling/importance_sampling_ratio/min": 5.960533580378832e-15, "sampling/sampling_logp_difference/max": 4.103453159332275, "sampling/sampling_logp_difference/mean": 1.263371229171753, "step": 323, "step_time": 11.386421089995565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.224628448486328, "epoch": 0.00324, "grad_norm": 0.03220371901988983, "kl": 0.48629532661288977, "learning_rate": 9.999961613609494e-06, "loss": -0.006, "step": 324, "step_time": 6.15260823200515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1348.0, "completions/max_terminated_length": 1348.0, "completions/mean_length": 319.9375, "completions/mean_terminated_length": 308.4838562011719, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.4316776394844055, "epoch": 0.00325, "frac_reward_zero_std": 0.25, "grad_norm": 0.021497664973139763, "kl": 0.4698838423937559, "learning_rate": 9.999961346574998e-06, "loss": -0.018, "num_tokens": 7281500.0, "reward": 0.8997714519500732, "reward_std": 1.3757859468460083, "rewards/rollout_reward_func/mean": 0.8997714519500732, "rewards/rollout_reward_func/std": 1.761325716972351, "sampling/importance_sampling_ratio/max": 0.5575658082962036, "sampling/importance_sampling_ratio/mean": 0.20736239850521088, "sampling/importance_sampling_ratio/min": 6.285600895939384e-36, "sampling/sampling_logp_difference/max": 3.085360050201416, "sampling/sampling_logp_difference/mean": 1.0122863054275513, "step": 325, "step_time": 11.27344291099871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.442992597818375, "epoch": 0.00326, "grad_norm": 0.02167992666363716, "kl": 0.4746296554803848, "learning_rate": 9.999961078614912e-06, "loss": -0.0179, "step": 326, "step_time": 6.053298019996873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1658.0, "completions/max_terminated_length": 1658.0, "completions/mean_length": 192.65625, "completions/mean_terminated_length": 198.35482788085938, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.419397473335266, "epoch": 0.00327, "frac_reward_zero_std": 0.25, "grad_norm": 0.06068325787782669, "kl": 0.627469876781106, "learning_rate": 9.999960809729237e-06, "loss": -0.0005, "num_tokens": 7321944.0, "reward": 0.3449864387512207, "reward_std": 0.8253123760223389, "rewards/rollout_reward_func/mean": 0.3449864387512207, "rewards/rollout_reward_func/std": 1.2635442018508911, "sampling/importance_sampling_ratio/max": 0.624188244342804, "sampling/importance_sampling_ratio/mean": 0.3074917793273926, "sampling/importance_sampling_ratio/min": 2.2047744185904605e-10, "sampling/sampling_logp_difference/max": 3.8190598487854004, "sampling/sampling_logp_difference/mean": 1.0146510601043701, "step": 327, "step_time": 10.533764586005418 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013020833488553762, "entropy": 5.42768394947052, "epoch": 0.00328, "grad_norm": 0.032130926847457886, "kl": 0.6228774953633547, "learning_rate": 9.99996053991797e-06, "loss": -0.0008, "step": 328, "step_time": 6.118857855999522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 177.1875, "completions/mean_terminated_length": 177.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.84858912229538, "epoch": 0.00329, "frac_reward_zero_std": 0.25, "grad_norm": 0.07754016667604446, "kl": 0.6487583518028259, "learning_rate": 9.999960269181116e-06, "loss": -0.0017, "num_tokens": 7360886.0, "reward": 1.4564917087554932, "reward_std": 0.6917423009872437, "rewards/rollout_reward_func/mean": 1.4564917087554932, "rewards/rollout_reward_func/std": 0.9564886093139648, "sampling/importance_sampling_ratio/max": 0.6267609000205994, "sampling/importance_sampling_ratio/mean": 0.3145931661128998, "sampling/importance_sampling_ratio/min": 4.239129392565738e-13, "sampling/sampling_logp_difference/max": 4.620004653930664, "sampling/sampling_logp_difference/mean": 0.9447206854820251, "step": 329, "step_time": 8.792899291998765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.839884549379349, "epoch": 0.0033, "grad_norm": 0.07925805449485779, "kl": 0.6496336050331593, "learning_rate": 9.999959997518671e-06, "loss": -0.0017, "step": 330, "step_time": 4.82623151900043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1217.0, "completions/max_terminated_length": 1217.0, "completions/mean_length": 395.375, "completions/mean_terminated_length": 395.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.012967497110367, "epoch": 0.00331, "frac_reward_zero_std": 0.25, "grad_norm": 0.025718487799167633, "kl": 0.5534693002700806, "learning_rate": 9.999959724930638e-06, "loss": -0.0128, "num_tokens": 7408380.0, "reward": 1.0856351852416992, "reward_std": 1.0976052284240723, "rewards/rollout_reward_func/mean": 1.0856351852416992, "rewards/rollout_reward_func/std": 1.4601936340332031, "sampling/importance_sampling_ratio/max": 0.5452085137367249, "sampling/importance_sampling_ratio/mean": 0.22601071000099182, "sampling/importance_sampling_ratio/min": 7.602080631663455e-10, "sampling/sampling_logp_difference/max": 9.493030548095703, "sampling/sampling_logp_difference/mean": 1.0008220672607422, "step": 331, "step_time": 11.229046165997715 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 5.016974657773972, "epoch": 0.00332, "grad_norm": 0.022582395002245903, "kl": 0.5511316359043121, "learning_rate": 9.999959451417012e-06, "loss": -0.0128, "step": 332, "step_time": 5.551644658000441 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 1054.0, "completions/max_terminated_length": 1054.0, "completions/mean_length": 337.4375, "completions/mean_terminated_length": 337.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.900304675102234, "epoch": 0.00333, "frac_reward_zero_std": 0.0, "grad_norm": 0.02369280718266964, "kl": 0.4449819978326559, "learning_rate": 9.9999591769778e-06, "loss": -0.017, "num_tokens": 7454677.0, "reward": 0.9807837605476379, "reward_std": 1.5834590196609497, "rewards/rollout_reward_func/mean": 0.9807837605476379, "rewards/rollout_reward_func/std": 1.8209888935089111, "sampling/importance_sampling_ratio/max": 0.5573937892913818, "sampling/importance_sampling_ratio/mean": 0.1840643286705017, "sampling/importance_sampling_ratio/min": 2.7627554572973168e-06, "sampling/sampling_logp_difference/max": 3.1308090686798096, "sampling/sampling_logp_difference/mean": 1.082637906074524, "step": 333, "step_time": 9.28178468200349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.877592980861664, "epoch": 0.00334, "grad_norm": 0.022702137008309364, "kl": 0.43519189581274986, "learning_rate": 9.999958901612997e-06, "loss": -0.0171, "step": 334, "step_time": 5.104959087002499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2084.0, "completions/max_terminated_length": 2084.0, "completions/mean_length": 582.3125, "completions/mean_terminated_length": 582.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.311518639326096, "epoch": 0.00335, "frac_reward_zero_std": 0.0, "grad_norm": 0.03660649061203003, "kl": 0.42611076310276985, "learning_rate": 9.999958625322606e-06, "loss": -0.0231, "num_tokens": 7510701.0, "reward": 0.9982714056968689, "reward_std": 1.1909390687942505, "rewards/rollout_reward_func/mean": 0.9982714056968689, "rewards/rollout_reward_func/std": 1.5233656167984009, "sampling/importance_sampling_ratio/max": 0.5372706651687622, "sampling/importance_sampling_ratio/mean": 0.14571715891361237, "sampling/importance_sampling_ratio/min": 1.0542099004935321e-15, "sampling/sampling_logp_difference/max": 11.722213745117188, "sampling/sampling_logp_difference/mean": 0.9473816156387329, "step": 335, "step_time": 12.740260368002055 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 5.303052246570587, "epoch": 0.00336, "grad_norm": 0.019792664796113968, "kl": 0.42003363743424416, "learning_rate": 9.999958348106625e-06, "loss": -0.0231, "step": 336, "step_time": 7.679750088998844 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.03125, "completions/max_length": 1135.0, "completions/max_terminated_length": 1135.0, "completions/mean_length": 617.9375, "completions/mean_terminated_length": 615.8709716796875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.987224102020264, "epoch": 0.00337, "frac_reward_zero_std": 0.0, "grad_norm": 0.017452141270041466, "kl": 0.4391068033874035, "learning_rate": 9.999958069965056e-06, "loss": -0.0151, "num_tokens": 7568495.0, "reward": 0.9808289408683777, "reward_std": 1.4591829776763916, "rewards/rollout_reward_func/mean": 0.9808289408683777, "rewards/rollout_reward_func/std": 1.5753253698349, "sampling/importance_sampling_ratio/max": 0.5430482029914856, "sampling/importance_sampling_ratio/mean": 0.12027645856142044, "sampling/importance_sampling_ratio/min": 1.9898939267182225e-12, "sampling/sampling_logp_difference/max": 4.303797245025635, "sampling/sampling_logp_difference/mean": 1.1702733039855957, "step": 337, "step_time": 10.641229085005762 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "entropy": 5.982750952243805, "epoch": 0.00338, "grad_norm": 0.01746898703277111, "kl": 0.44001202657818794, "learning_rate": 9.999957790897897e-06, "loss": -0.0151, "step": 338, "step_time": 5.386922812998819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1181.0, "completions/max_terminated_length": 1181.0, "completions/mean_length": 355.40625, "completions/mean_terminated_length": 355.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.862601637840271, "epoch": 0.00339, "frac_reward_zero_std": 0.0, "grad_norm": 0.033418282866477966, "kl": 0.5261186566203833, "learning_rate": 9.999957510905149e-06, "loss": -0.0116, "num_tokens": 7615005.0, "reward": 1.0946519374847412, "reward_std": 1.6486914157867432, "rewards/rollout_reward_func/mean": 1.0946519374847412, "rewards/rollout_reward_func/std": 1.6567738056182861, "sampling/importance_sampling_ratio/max": 0.5574625730514526, "sampling/importance_sampling_ratio/mean": 0.19122497737407684, "sampling/importance_sampling_ratio/min": 2.1497960744909506e-08, "sampling/sampling_logp_difference/max": 3.881655216217041, "sampling/sampling_logp_difference/mean": 1.1191229820251465, "step": 339, "step_time": 9.862237043995265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.870338290929794, "epoch": 0.0034, "grad_norm": 0.03201791271567345, "kl": 0.5284947128966451, "learning_rate": 9.999957229986813e-06, "loss": -0.0116, "step": 340, "step_time": 5.392980971006182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1459.0, "completions/max_terminated_length": 1459.0, "completions/mean_length": 472.84375, "completions/mean_terminated_length": 472.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.6431474685668945, "epoch": 0.00341, "frac_reward_zero_std": 0.0, "grad_norm": 0.048352815210819244, "kl": 0.6165034621953964, "learning_rate": 9.999956948142888e-06, "loss": -0.0152, "num_tokens": 7665260.0, "reward": 0.38282477855682373, "reward_std": 1.4222455024719238, "rewards/rollout_reward_func/mean": 0.38282477855682373, "rewards/rollout_reward_func/std": 1.8564258813858032, "sampling/importance_sampling_ratio/max": 0.5572556257247925, "sampling/importance_sampling_ratio/mean": 0.189304381608963, "sampling/importance_sampling_ratio/min": 2.0508061879809247e-06, "sampling/sampling_logp_difference/max": 2.7621493339538574, "sampling/sampling_logp_difference/mean": 1.0310323238372803, "step": 341, "step_time": 10.410257647996332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.611041724681854, "epoch": 0.00342, "grad_norm": 0.04450219124555588, "kl": 0.6165609508752823, "learning_rate": 9.999956665373374e-06, "loss": -0.0153, "step": 342, "step_time": 6.334823569995933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1842.0, "completions/max_terminated_length": 1842.0, "completions/mean_length": 348.75, "completions/mean_terminated_length": 359.4838562011719, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.94247180223465, "epoch": 0.00343, "frac_reward_zero_std": 0.0, "grad_norm": 0.09242787212133408, "kl": 0.9852113192901015, "learning_rate": 9.999956381678271e-06, "loss": -0.0232, "num_tokens": 7711801.0, "reward": 1.377017855644226, "reward_std": 1.6751017570495605, "rewards/rollout_reward_func/mean": 1.377017855644226, "rewards/rollout_reward_func/std": 1.6922348737716675, "sampling/importance_sampling_ratio/max": 0.5567643046379089, "sampling/importance_sampling_ratio/mean": 0.21981878578662872, "sampling/importance_sampling_ratio/min": 2.9609084073456415e-09, "sampling/sampling_logp_difference/max": 4.638091564178467, "sampling/sampling_logp_difference/mean": 0.9497562646865845, "step": 343, "step_time": 12.337980099004199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.943351984024048, "epoch": 0.00344, "grad_norm": 0.059314027428627014, "kl": 0.8673902824521065, "learning_rate": 9.99995609705758e-06, "loss": -0.0234, "step": 344, "step_time": 6.675270194002223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1068.0, "completions/max_terminated_length": 1068.0, "completions/mean_length": 254.4375, "completions/mean_terminated_length": 254.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.976771116256714, "epoch": 0.00345, "frac_reward_zero_std": 0.0, "grad_norm": 0.050657764077186584, "kl": 0.4020374696701765, "learning_rate": 9.999955811511302e-06, "loss": -0.0173, "num_tokens": 7755574.0, "reward": 1.0293716192245483, "reward_std": 1.6134212017059326, "rewards/rollout_reward_func/mean": 1.0293716192245483, "rewards/rollout_reward_func/std": 1.6115459203720093, "sampling/importance_sampling_ratio/max": 0.5521588325500488, "sampling/importance_sampling_ratio/mean": 0.2566182017326355, "sampling/importance_sampling_ratio/min": 1.9262968514828632e-19, "sampling/sampling_logp_difference/max": 7.767065048217773, "sampling/sampling_logp_difference/mean": 1.1075544357299805, "step": 345, "step_time": 9.470831689995975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.997177541255951, "epoch": 0.00346, "grad_norm": 0.04910779744386673, "kl": 0.3950863964855671, "learning_rate": 9.999955525039433e-06, "loss": -0.0175, "step": 346, "step_time": 5.147360546001437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1569.0, "completions/max_terminated_length": 1569.0, "completions/mean_length": 288.65625, "completions/mean_terminated_length": 288.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.179137736558914, "epoch": 0.00347, "frac_reward_zero_std": 0.0, "grad_norm": 0.0889919251203537, "kl": 0.5772857926785946, "learning_rate": 9.999955237641976e-06, "loss": 0.0002, "num_tokens": 7799824.0, "reward": 0.438194215297699, "reward_std": 1.1352328062057495, "rewards/rollout_reward_func/mean": 0.438194215297699, "rewards/rollout_reward_func/std": 1.5407496690750122, "sampling/importance_sampling_ratio/max": 0.560533881187439, "sampling/importance_sampling_ratio/mean": 0.2777433395385742, "sampling/importance_sampling_ratio/min": 3.540046311645284e-10, "sampling/sampling_logp_difference/max": 4.545749187469482, "sampling/sampling_logp_difference/mean": 0.8722558617591858, "step": 347, "step_time": 11.079202531000192 }, { "clip_ratio/high_max": 0.08333333395421505, "clip_ratio/high_mean": 0.041666666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.041666666977107525, "entropy": 5.165306925773621, "epoch": 0.00348, "grad_norm": 0.09956381469964981, "kl": 0.5794185400009155, "learning_rate": 9.999954949318932e-06, "loss": -0.0001, "step": 348, "step_time": 5.985902240998257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1919.0, "completions/max_terminated_length": 1919.0, "completions/mean_length": 592.78125, "completions/mean_terminated_length": 592.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.622890949249268, "epoch": 0.00349, "frac_reward_zero_std": 0.0, "grad_norm": 0.03138374909758568, "kl": 0.44767723605036736, "learning_rate": 9.999954660070299e-06, "loss": -0.0123, "num_tokens": 7854037.0, "reward": 0.9516052603721619, "reward_std": 1.3409990072250366, "rewards/rollout_reward_func/mean": 0.9516052603721619, "rewards/rollout_reward_func/std": 1.6854932308197021, "sampling/importance_sampling_ratio/max": 0.5576844215393066, "sampling/importance_sampling_ratio/mean": 0.17641866207122803, "sampling/importance_sampling_ratio/min": 1.3755182756702844e-11, "sampling/sampling_logp_difference/max": 4.34391975402832, "sampling/sampling_logp_difference/mean": 1.0108544826507568, "step": 349, "step_time": 13.139156702000037 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 5.63739138841629, "epoch": 0.0035, "grad_norm": 0.02404089830815792, "kl": 0.4436808433383703, "learning_rate": 9.999954369896076e-06, "loss": -0.0124, "step": 350, "step_time": 6.94429846300045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1298.0, "completions/max_terminated_length": 1298.0, "completions/mean_length": 281.5625, "completions/mean_terminated_length": 281.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.181655943393707, "epoch": 0.00351, "frac_reward_zero_std": 0.5, "grad_norm": 0.03729444742202759, "kl": 0.6867895908653736, "learning_rate": 9.999954078796268e-06, "loss": -0.0028, "num_tokens": 7895719.0, "reward": 1.3155865669250488, "reward_std": 0.7214318513870239, "rewards/rollout_reward_func/mean": 1.3155865669250488, "rewards/rollout_reward_func/std": 1.3697984218597412, "sampling/importance_sampling_ratio/max": 0.5562067627906799, "sampling/importance_sampling_ratio/mean": 0.33280450105667114, "sampling/importance_sampling_ratio/min": 4.9091649998445064e-05, "sampling/sampling_logp_difference/max": 2.4954237937927246, "sampling/sampling_logp_difference/mean": 0.6616454124450684, "step": 351, "step_time": 9.673538740000367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.193941801786423, "epoch": 0.00352, "grad_norm": 0.039376772940158844, "kl": 0.6875719130039215, "learning_rate": 9.99995378677087e-06, "loss": -0.0028, "step": 352, "step_time": 5.4709393919947615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1039.0, "completions/max_terminated_length": 1039.0, "completions/mean_length": 399.71875, "completions/mean_terminated_length": 399.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.4110013246536255, "epoch": 0.00353, "frac_reward_zero_std": 0.0, "grad_norm": 0.031703416258096695, "kl": 0.4877760410308838, "learning_rate": 9.999953493819885e-06, "loss": -0.0064, "num_tokens": 7944224.0, "reward": 0.38152581453323364, "reward_std": 1.533202052116394, "rewards/rollout_reward_func/mean": 0.38152581453323364, "rewards/rollout_reward_func/std": 1.6252615451812744, "sampling/importance_sampling_ratio/max": 0.5308147668838501, "sampling/importance_sampling_ratio/mean": 0.14482718706130981, "sampling/importance_sampling_ratio/min": 3.0514665922964923e-06, "sampling/sampling_logp_difference/max": 2.80824613571167, "sampling/sampling_logp_difference/mean": 1.239119291305542, "step": 353, "step_time": 9.669282868002483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.435172855854034, "epoch": 0.00354, "grad_norm": 0.030120227485895157, "kl": 0.4884866625070572, "learning_rate": 9.999953199943314e-06, "loss": -0.0064, "step": 354, "step_time": 5.042957411998941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 1188.0, "completions/max_terminated_length": 1188.0, "completions/mean_length": 409.8125, "completions/mean_terminated_length": 409.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.052782505750656, "epoch": 0.00355, "frac_reward_zero_std": 0.0, "grad_norm": 0.08213016390800476, "kl": 0.6203042641282082, "learning_rate": 9.999952905141152e-06, "loss": -0.0033, "num_tokens": 7992532.0, "reward": 0.645427942276001, "reward_std": 1.1515207290649414, "rewards/rollout_reward_func/mean": 0.645427942276001, "rewards/rollout_reward_func/std": 1.7520493268966675, "sampling/importance_sampling_ratio/max": 0.5349584221839905, "sampling/importance_sampling_ratio/mean": 0.20237533748149872, "sampling/importance_sampling_ratio/min": 2.141778074338241e-12, "sampling/sampling_logp_difference/max": 3.8404998779296875, "sampling/sampling_logp_difference/mean": 0.8649352788925171, "step": 355, "step_time": 10.567363583006227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.06361607182770967, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06361607182770967, "entropy": 5.213342905044556, "epoch": 0.00356, "grad_norm": 0.08283153176307678, "kl": 0.5937098525464535, "learning_rate": 9.999952609413403e-06, "loss": -0.0037, "step": 356, "step_time": 5.430008358001942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1770.0, "completions/max_terminated_length": 1770.0, "completions/mean_length": 510.375, "completions/mean_terminated_length": 510.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.056172549724579, "epoch": 0.00357, "frac_reward_zero_std": 0.25, "grad_norm": 0.046355150640010834, "kl": 0.39842551574110985, "learning_rate": 9.999952312760068e-06, "loss": -0.0038, "num_tokens": 8044594.0, "reward": 1.0171023607254028, "reward_std": 0.8787654638290405, "rewards/rollout_reward_func/mean": 1.0171023607254028, "rewards/rollout_reward_func/std": 1.336360216140747, "sampling/importance_sampling_ratio/max": 0.5569908022880554, "sampling/importance_sampling_ratio/mean": 0.18586581945419312, "sampling/importance_sampling_ratio/min": 7.462886308928773e-09, "sampling/sampling_logp_difference/max": 4.332623481750488, "sampling/sampling_logp_difference/mean": 1.168393611907959, "step": 357, "step_time": 11.684319112006051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 6.094105899333954, "epoch": 0.00358, "grad_norm": 0.015690509229898453, "kl": 0.3919140361249447, "learning_rate": 9.999952015181144e-06, "loss": -0.0038, "step": 358, "step_time": 6.92863261200182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1331.0, "completions/max_terminated_length": 1331.0, "completions/mean_length": 564.09375, "completions/mean_terminated_length": 571.1290283203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.590391039848328, "epoch": 0.00359, "frac_reward_zero_std": 0.0, "grad_norm": 0.027620352804660797, "kl": 0.39641477074474096, "learning_rate": 9.999951716676632e-06, "loss": -0.0133, "num_tokens": 8100153.0, "reward": 0.7823125123977661, "reward_std": 1.7506749629974365, "rewards/rollout_reward_func/mean": 0.7823125123977661, "rewards/rollout_reward_func/std": 1.7801638841629028, "sampling/importance_sampling_ratio/max": 0.3330027163028717, "sampling/importance_sampling_ratio/mean": 0.0911916047334671, "sampling/importance_sampling_ratio/min": 2.7951238532378762e-14, "sampling/sampling_logp_difference/max": 4.551399230957031, "sampling/sampling_logp_difference/mean": 1.354062795639038, "step": 359, "step_time": 10.592319724004483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.59803980588913, "epoch": 0.0036, "grad_norm": 0.027668118476867676, "kl": 0.3927823882550001, "learning_rate": 9.999951417246534e-06, "loss": -0.0133, "step": 360, "step_time": 6.212761587001296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1219.0, "completions/max_terminated_length": 1219.0, "completions/mean_length": 266.0, "completions/mean_terminated_length": 266.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.150769621133804, "epoch": 0.00361, "frac_reward_zero_std": 0.25, "grad_norm": 0.08495787531137466, "kl": 0.6445882767438889, "learning_rate": 9.999951116890847e-06, "loss": -0.0052, "num_tokens": 8141614.0, "reward": 1.417466640472412, "reward_std": 0.9998583793640137, "rewards/rollout_reward_func/mean": 1.417466640472412, "rewards/rollout_reward_func/std": 1.2395515441894531, "sampling/importance_sampling_ratio/max": 0.5537645816802979, "sampling/importance_sampling_ratio/mean": 0.32748502492904663, "sampling/importance_sampling_ratio/min": 6.13070915278513e-06, "sampling/sampling_logp_difference/max": 3.053173065185547, "sampling/sampling_logp_difference/mean": 0.6555034518241882, "step": 361, "step_time": 10.096509084003628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 4.154880791902542, "epoch": 0.00362, "grad_norm": 0.06197737529873848, "kl": 0.6696761734783649, "learning_rate": 9.999950815609574e-06, "loss": -0.0055, "step": 362, "step_time": 5.501354813000944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 224.8125, "completions/mean_terminated_length": 224.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.3462560176849365, "epoch": 0.00363, "frac_reward_zero_std": 0.0, "grad_norm": 0.05929151922464371, "kl": 0.4110393449664116, "learning_rate": 9.999950513402715e-06, "loss": -0.0195, "num_tokens": 8181642.0, "reward": 1.0950418710708618, "reward_std": 1.7597688436508179, "rewards/rollout_reward_func/mean": 1.0950418710708618, "rewards/rollout_reward_func/std": 1.764603614807129, "sampling/importance_sampling_ratio/max": 0.5517006516456604, "sampling/importance_sampling_ratio/mean": 0.2913203537464142, "sampling/importance_sampling_ratio/min": 1.9244925653083556e-09, "sampling/sampling_logp_difference/max": 4.244528770446777, "sampling/sampling_logp_difference/mean": 1.0818184614181519, "step": 363, "step_time": 8.383603717993537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.336020231246948, "epoch": 0.00364, "grad_norm": 0.060509029775857925, "kl": 0.41189585626125336, "learning_rate": 9.999950210270267e-06, "loss": -0.0196, "step": 364, "step_time": 5.077482985998358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1784.0, "completions/max_terminated_length": 1784.0, "completions/mean_length": 709.65625, "completions/mean_terminated_length": 732.0322265625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.701732099056244, "epoch": 0.00365, "frac_reward_zero_std": 0.0, "grad_norm": 0.05963045358657837, "kl": 0.4519113786518574, "learning_rate": 9.999949906212232e-06, "loss": -0.0058, "num_tokens": 8240823.0, "reward": 0.38078105449676514, "reward_std": 1.419135332107544, "rewards/rollout_reward_func/mean": 0.38078105449676514, "rewards/rollout_reward_func/std": 1.8357330560684204, "sampling/importance_sampling_ratio/max": 0.44802361726760864, "sampling/importance_sampling_ratio/mean": 0.0900266095995903, "sampling/importance_sampling_ratio/min": 4.878314537015977e-17, "sampling/sampling_logp_difference/max": 12.363638877868652, "sampling/sampling_logp_difference/mean": 1.126560091972351, "step": 365, "step_time": 12.382929330004117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.6624884605407715, "epoch": 0.00366, "grad_norm": 0.06295380741357803, "kl": 0.4520964063704014, "learning_rate": 9.999949601228609e-06, "loss": -0.006, "step": 366, "step_time": 6.96099811899694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1426.0, "completions/max_terminated_length": 1426.0, "completions/mean_length": 371.75, "completions/mean_terminated_length": 371.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.960571229457855, "epoch": 0.00367, "frac_reward_zero_std": 0.25, "grad_norm": 0.06009937822818756, "kl": 0.5034156171604991, "learning_rate": 9.9999492953194e-06, "loss": -0.0059, "num_tokens": 8287738.0, "reward": 1.101986289024353, "reward_std": 1.1176074743270874, "rewards/rollout_reward_func/mean": 1.101986289024353, "rewards/rollout_reward_func/std": 1.5597678422927856, "sampling/importance_sampling_ratio/max": 0.5578361749649048, "sampling/importance_sampling_ratio/mean": 0.21649269759655, "sampling/importance_sampling_ratio/min": 9.128985067885509e-11, "sampling/sampling_logp_difference/max": 4.554014682769775, "sampling/sampling_logp_difference/mean": 1.1315046548843384, "step": 367, "step_time": 10.95659799200439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.931352525949478, "epoch": 0.00368, "grad_norm": 0.058811038732528687, "kl": 0.5054951068013906, "learning_rate": 9.999948988484605e-06, "loss": -0.0058, "step": 368, "step_time": 5.869132234001881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.0, "completions/max_length": 1036.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 309.3125, "completions/mean_terminated_length": 309.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.755274623632431, "epoch": 0.00369, "frac_reward_zero_std": 0.25, "grad_norm": 0.025453699752688408, "kl": 0.6333158537745476, "learning_rate": 9.999948680724223e-06, "loss": -0.0161, "num_tokens": 8332459.0, "reward": 1.5248576402664185, "reward_std": 1.2726972103118896, "rewards/rollout_reward_func/mean": 1.5248576402664185, "rewards/rollout_reward_func/std": 1.515214443206787, "sampling/importance_sampling_ratio/max": 0.5584135055541992, "sampling/importance_sampling_ratio/mean": 0.2757539451122284, "sampling/importance_sampling_ratio/min": 1.1129984954505456e-11, "sampling/sampling_logp_difference/max": 3.201342821121216, "sampling/sampling_logp_difference/mean": 0.8842809200286865, "step": 369, "step_time": 9.041225863988075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 4.736132174730301, "epoch": 0.0037, "grad_norm": 0.025371694937348366, "kl": 0.6325473189353943, "learning_rate": 9.999948372038253e-06, "loss": -0.0162, "step": 370, "step_time": 5.4089946869971754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 1260.0, "completions/max_terminated_length": 1260.0, "completions/mean_length": 309.625, "completions/mean_terminated_length": 309.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.714890420436859, "epoch": 0.00371, "frac_reward_zero_std": 0.5, "grad_norm": 0.0192919559776783, "kl": 0.5500317290425301, "learning_rate": 9.9999480624267e-06, "loss": -0.002, "num_tokens": 8374935.0, "reward": 1.0251567363739014, "reward_std": 0.8393127918243408, "rewards/rollout_reward_func/mean": 1.0251567363739014, "rewards/rollout_reward_func/std": 1.5107383728027344, "sampling/importance_sampling_ratio/max": 0.5574933290481567, "sampling/importance_sampling_ratio/mean": 0.2685677707195282, "sampling/importance_sampling_ratio/min": 3.62930657749061e-11, "sampling/sampling_logp_difference/max": 11.980302810668945, "sampling/sampling_logp_difference/mean": 1.3397396802902222, "step": 371, "step_time": 9.821558631996595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.698609352111816, "epoch": 0.00372, "grad_norm": 0.01757991872727871, "kl": 0.5454910807311535, "learning_rate": 9.999947751889557e-06, "loss": -0.0021, "step": 372, "step_time": 5.759566484997777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1312.0, "completions/max_terminated_length": 1312.0, "completions/mean_length": 270.65625, "completions/mean_terminated_length": 270.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.7741172313690186, "epoch": 0.00373, "frac_reward_zero_std": 0.25, "grad_norm": 0.01687408983707428, "kl": 0.3894306644797325, "learning_rate": 9.99994744042683e-06, "loss": -0.0101, "num_tokens": 8416774.0, "reward": 1.4907221794128418, "reward_std": 1.2425087690353394, "rewards/rollout_reward_func/mean": 1.4907221794128418, "rewards/rollout_reward_func/std": 1.4815707206726074, "sampling/importance_sampling_ratio/max": 0.559299647808075, "sampling/importance_sampling_ratio/mean": 0.262160986661911, "sampling/importance_sampling_ratio/min": 1.487038389313966e-06, "sampling/sampling_logp_difference/max": 3.722182512283325, "sampling/sampling_logp_difference/mean": 1.118861436843872, "step": 373, "step_time": 9.940461340993352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.750543653964996, "epoch": 0.00374, "grad_norm": 0.015388961881399155, "kl": 0.3918864633888006, "learning_rate": 9.999947128038514e-06, "loss": -0.0102, "step": 374, "step_time": 5.528931040993484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 220.34375, "completions/mean_terminated_length": 220.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.097652196884155, "epoch": 0.00375, "frac_reward_zero_std": 0.25, "grad_norm": 0.09195727854967117, "kl": 0.6517333723604679, "learning_rate": 9.999946814724613e-06, "loss": -0.0096, "num_tokens": 8458758.0, "reward": 1.671374797821045, "reward_std": 1.0399678945541382, "rewards/rollout_reward_func/mean": 1.671374797821045, "rewards/rollout_reward_func/std": 1.2354906797409058, "sampling/importance_sampling_ratio/max": 0.5574797987937927, "sampling/importance_sampling_ratio/mean": 0.27136585116386414, "sampling/importance_sampling_ratio/min": 0.00017834390746429563, "sampling/sampling_logp_difference/max": 2.9350595474243164, "sampling/sampling_logp_difference/mean": 0.8883547782897949, "step": 375, "step_time": 7.766368469001463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.0296017825603485, "epoch": 0.00376, "grad_norm": 0.081696517765522, "kl": 0.660970401018858, "learning_rate": 9.999946500485126e-06, "loss": -0.0099, "step": 376, "step_time": 4.679926089000219 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 1657.0, "completions/max_terminated_length": 1657.0, "completions/mean_length": 444.5625, "completions/mean_terminated_length": 444.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.207773447036743, "epoch": 0.00377, "frac_reward_zero_std": 0.25, "grad_norm": 0.06569475680589676, "kl": 0.5450582206249237, "learning_rate": 9.999946185320051e-06, "loss": -0.0126, "num_tokens": 8507157.0, "reward": 1.308669090270996, "reward_std": 1.2231686115264893, "rewards/rollout_reward_func/mean": 1.308669090270996, "rewards/rollout_reward_func/std": 1.4099957942962646, "sampling/importance_sampling_ratio/max": 0.5530431866645813, "sampling/importance_sampling_ratio/mean": 0.19599980115890503, "sampling/importance_sampling_ratio/min": 9.991618824398157e-22, "sampling/sampling_logp_difference/max": 10.823773384094238, "sampling/sampling_logp_difference/mean": 1.050909399986267, "step": 377, "step_time": 12.321990244992776 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625000465661287, "entropy": 5.1890276074409485, "epoch": 0.00378, "grad_norm": 0.05850473418831825, "kl": 0.5449563600122929, "learning_rate": 9.999945869229393e-06, "loss": -0.0128, "step": 378, "step_time": 6.3359747209906345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 936.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 319.375, "completions/mean_terminated_length": 319.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.168132036924362, "epoch": 0.00379, "frac_reward_zero_std": 0.0, "grad_norm": 0.05528387427330017, "kl": 0.6289991587400436, "learning_rate": 9.999945552213145e-06, "loss": -0.0089, "num_tokens": 8555109.0, "reward": 0.567518949508667, "reward_std": 1.1128826141357422, "rewards/rollout_reward_func/mean": 0.567518949508667, "rewards/rollout_reward_func/std": 1.4689247608184814, "sampling/importance_sampling_ratio/max": 0.5701459050178528, "sampling/importance_sampling_ratio/mean": 0.20620612800121307, "sampling/importance_sampling_ratio/min": 9.52587186020537e-07, "sampling/sampling_logp_difference/max": 4.494022369384766, "sampling/sampling_logp_difference/mean": 0.981772780418396, "step": 379, "step_time": 9.015510755001742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.159718871116638, "epoch": 0.0038, "grad_norm": 0.049195557832717896, "kl": 0.6345720887184143, "learning_rate": 9.999945234271316e-06, "loss": -0.009, "step": 380, "step_time": 4.860575057999085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 1159.0, "completions/max_terminated_length": 1159.0, "completions/mean_length": 185.65625, "completions/mean_terminated_length": 185.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.377310544252396, "epoch": 0.00381, "frac_reward_zero_std": 0.25, "grad_norm": 0.09388934075832367, "kl": 0.6934993006289005, "learning_rate": 9.9999449154039e-06, "loss": -0.0106, "num_tokens": 8594340.0, "reward": 0.896782398223877, "reward_std": 1.19054114818573, "rewards/rollout_reward_func/mean": 0.896782398223877, "rewards/rollout_reward_func/std": 1.4788548946380615, "sampling/importance_sampling_ratio/max": 0.5593341588973999, "sampling/importance_sampling_ratio/mean": 0.2651205062866211, "sampling/importance_sampling_ratio/min": 0.0004945531254634261, "sampling/sampling_logp_difference/max": 2.7312393188476562, "sampling/sampling_logp_difference/mean": 0.91651451587677, "step": 381, "step_time": 9.62362283499533 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018229166977107525, "entropy": 5.325634151697159, "epoch": 0.00382, "grad_norm": 0.06309305876493454, "kl": 0.6782357022166252, "learning_rate": 9.999944595610896e-06, "loss": -0.011, "step": 382, "step_time": 5.174083237998275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 332.59375, "completions/mean_terminated_length": 332.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.19293013215065, "epoch": 0.00383, "frac_reward_zero_std": 0.0, "grad_norm": 0.03883906081318855, "kl": 0.5549836568534374, "learning_rate": 9.999944274892308e-06, "loss": -0.0056, "num_tokens": 8641163.0, "reward": 0.50852370262146, "reward_std": 1.330714225769043, "rewards/rollout_reward_func/mean": 0.50852370262146, "rewards/rollout_reward_func/std": 1.6547867059707642, "sampling/importance_sampling_ratio/max": 0.5764347910881042, "sampling/importance_sampling_ratio/mean": 0.16463156044483185, "sampling/importance_sampling_ratio/min": 8.435155582375321e-15, "sampling/sampling_logp_difference/max": 3.589787721633911, "sampling/sampling_logp_difference/mean": 1.192363977432251, "step": 383, "step_time": 9.787927271001536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.1821184158325195, "epoch": 0.00384, "grad_norm": 0.04077211767435074, "kl": 0.5550517700612545, "learning_rate": 9.999943953248133e-06, "loss": -0.0056, "step": 384, "step_time": 5.025025578997884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 754.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 146.09375, "completions/mean_terminated_length": 146.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.18048757314682, "epoch": 0.00385, "frac_reward_zero_std": 0.5, "grad_norm": 0.04292828589677811, "kl": 0.5383002571761608, "learning_rate": 9.999943630678372e-06, "loss": -0.0112, "num_tokens": 8679183.0, "reward": 1.255942463874817, "reward_std": 0.8713913559913635, "rewards/rollout_reward_func/mean": 1.255942463874817, "rewards/rollout_reward_func/std": 1.4570095539093018, "sampling/importance_sampling_ratio/max": 0.5590506792068481, "sampling/importance_sampling_ratio/mean": 0.3093615770339966, "sampling/importance_sampling_ratio/min": 2.5132178507192293e-06, "sampling/sampling_logp_difference/max": 2.8717637062072754, "sampling/sampling_logp_difference/mean": 0.9469534158706665, "step": 385, "step_time": 7.891915502004849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.166647374629974, "epoch": 0.00386, "grad_norm": 0.04629769176244736, "kl": 0.5421721152961254, "learning_rate": 9.999943307183029e-06, "loss": -0.0111, "step": 386, "step_time": 4.3791502399981255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 802.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 130.1875, "completions/mean_terminated_length": 133.87095642089844, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.344977647066116, "epoch": 0.00387, "frac_reward_zero_std": 0.25, "grad_norm": 0.0372217558324337, "kl": 0.8289019223302603, "learning_rate": 9.999942982762097e-06, "loss": -0.0138, "num_tokens": 8714500.0, "reward": 1.1771515607833862, "reward_std": 1.0244686603546143, "rewards/rollout_reward_func/mean": 1.1771515607833862, "rewards/rollout_reward_func/std": 1.3078984022140503, "sampling/importance_sampling_ratio/max": 0.5587656497955322, "sampling/importance_sampling_ratio/mean": 0.3684234023094177, "sampling/importance_sampling_ratio/min": 1.3687035012310833e-11, "sampling/sampling_logp_difference/max": 4.313111782073975, "sampling/sampling_logp_difference/mean": 0.8290984630584717, "step": 387, "step_time": 8.587751602997741 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 4.343233078718185, "epoch": 0.00388, "grad_norm": 0.010747845284640789, "kl": 0.8289403840899467, "learning_rate": 9.999942657415583e-06, "loss": -0.0139, "step": 388, "step_time": 4.464865705002012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 458.3125, "completions/mean_terminated_length": 458.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.5133101642131805, "epoch": 0.00389, "frac_reward_zero_std": 0.0, "grad_norm": 0.0387776717543602, "kl": 0.40010685846209526, "learning_rate": 9.99994233114348e-06, "loss": -0.0136, "num_tokens": 8766046.0, "reward": 1.2276716232299805, "reward_std": 1.7107542753219604, "rewards/rollout_reward_func/mean": 1.2276716232299805, "rewards/rollout_reward_func/std": 1.6801468133926392, "sampling/importance_sampling_ratio/max": 0.3244013786315918, "sampling/importance_sampling_ratio/mean": 0.12932246923446655, "sampling/importance_sampling_ratio/min": 4.4297516410551907e-07, "sampling/sampling_logp_difference/max": 3.80434513092041, "sampling/sampling_logp_difference/mean": 1.0015125274658203, "step": 389, "step_time": 9.869883216000744 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 5.5089231133461, "epoch": 0.0039, "grad_norm": 0.03403059020638466, "kl": 0.40300537273287773, "learning_rate": 9.999942003945793e-06, "loss": -0.0137, "step": 390, "step_time": 5.041887775998475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 135.34375, "completions/mean_terminated_length": 135.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.2653746008872986, "epoch": 0.00391, "frac_reward_zero_std": 0.25, "grad_norm": 0.05232638120651245, "kl": 0.8940623924136162, "learning_rate": 9.999941675822523e-06, "loss": -0.0096, "num_tokens": 8801379.0, "reward": 1.9995388984680176, "reward_std": 0.5383970737457275, "rewards/rollout_reward_func/mean": 1.9995388984680176, "rewards/rollout_reward_func/std": 0.7764491438865662, "sampling/importance_sampling_ratio/max": 0.5572080016136169, "sampling/importance_sampling_ratio/mean": 0.41280561685562134, "sampling/importance_sampling_ratio/min": 0.00010898274194914848, "sampling/sampling_logp_difference/max": 4.275876998901367, "sampling/sampling_logp_difference/mean": 0.4828230142593384, "step": 391, "step_time": 7.90293744399969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.2456391155719757, "epoch": 0.00392, "grad_norm": 0.047957245260477066, "kl": 0.8959127366542816, "learning_rate": 9.999941346773667e-06, "loss": -0.0096, "step": 392, "step_time": 4.458605719999468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 203.4375, "completions/mean_terminated_length": 203.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.435404658317566, "epoch": 0.00393, "frac_reward_zero_std": 0.5, "grad_norm": 0.010297203436493874, "kl": 0.80428446829319, "learning_rate": 9.999941016799226e-06, "loss": -0.0103, "num_tokens": 8840342.0, "reward": 1.6674561500549316, "reward_std": 0.904381275177002, "rewards/rollout_reward_func/mean": 1.6674561500549316, "rewards/rollout_reward_func/std": 1.272407054901123, "sampling/importance_sampling_ratio/max": 0.560138463973999, "sampling/importance_sampling_ratio/mean": 0.3296244442462921, "sampling/importance_sampling_ratio/min": 1.2855444992965204e-06, "sampling/sampling_logp_difference/max": 2.512185573577881, "sampling/sampling_logp_difference/mean": 0.7147455215454102, "step": 393, "step_time": 8.776599319004163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.39168193936348, "epoch": 0.00394, "grad_norm": 0.009544246830046177, "kl": 0.8096169605851173, "learning_rate": 9.9999406858992e-06, "loss": -0.0103, "step": 394, "step_time": 4.410857386003045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1571.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 553.875, "completions/mean_terminated_length": 553.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.284769177436829, "epoch": 0.00395, "frac_reward_zero_std": 0.25, "grad_norm": 0.024887166917324066, "kl": 0.7128092497587204, "learning_rate": 9.999940354073589e-06, "loss": -0.0072, "num_tokens": 8892732.0, "reward": 1.7472113370895386, "reward_std": 1.4052729606628418, "rewards/rollout_reward_func/mean": 1.7472113370895386, "rewards/rollout_reward_func/std": 1.5981494188308716, "sampling/importance_sampling_ratio/max": 0.554250180721283, "sampling/importance_sampling_ratio/mean": 0.17264117300510406, "sampling/importance_sampling_ratio/min": 2.861216125138455e-11, "sampling/sampling_logp_difference/max": 11.52453899383545, "sampling/sampling_logp_difference/mean": 0.9983559846878052, "step": 395, "step_time": 12.366949975992611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.293820232152939, "epoch": 0.00396, "grad_norm": 0.027111129835247993, "kl": 0.7233806438744068, "learning_rate": 9.999940021322394e-06, "loss": -0.0072, "step": 396, "step_time": 6.268874022003729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1577.0, "completions/max_terminated_length": 1577.0, "completions/mean_length": 325.90625, "completions/mean_terminated_length": 306.0322570800781, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.224949270486832, "epoch": 0.00397, "frac_reward_zero_std": 0.25, "grad_norm": 0.08760719001293182, "kl": 0.594563826918602, "learning_rate": 9.999939687645615e-06, "loss": -0.0057, "num_tokens": 8936006.0, "reward": 1.3018276691436768, "reward_std": 1.0350311994552612, "rewards/rollout_reward_func/mean": 1.3018276691436768, "rewards/rollout_reward_func/std": 1.3839619159698486, "sampling/importance_sampling_ratio/max": 0.5604166388511658, "sampling/importance_sampling_ratio/mean": 0.33607757091522217, "sampling/importance_sampling_ratio/min": 9.172091353093492e-08, "sampling/sampling_logp_difference/max": 4.151338577270508, "sampling/sampling_logp_difference/mean": 0.7038729190826416, "step": 397, "step_time": 11.19307595200371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.234046965837479, "epoch": 0.00398, "grad_norm": 0.08353494852781296, "kl": 0.5892412252724171, "learning_rate": 9.999939353043252e-06, "loss": -0.006, "step": 398, "step_time": 6.133292504004203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 98.625, "completions/mean_terminated_length": 98.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.146430134773254, "epoch": 0.00399, "frac_reward_zero_std": 0.25, "grad_norm": 0.10513349622488022, "kl": 0.7532705217599869, "learning_rate": 9.999939017515304e-06, "loss": -0.0097, "num_tokens": 8972145.0, "reward": 1.4403347969055176, "reward_std": 1.1817069053649902, "rewards/rollout_reward_func/mean": 1.4403347969055176, "rewards/rollout_reward_func/std": 1.5787806510925293, "sampling/importance_sampling_ratio/max": 0.565384030342102, "sampling/importance_sampling_ratio/mean": 0.35656917095184326, "sampling/importance_sampling_ratio/min": 5.257584749100751e-15, "sampling/sampling_logp_difference/max": 13.697186470031738, "sampling/sampling_logp_difference/mean": 0.7921455502510071, "step": 399, "step_time": 6.80521066399524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03645833395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03645833395421505, "entropy": 4.232576847076416, "epoch": 0.004, "grad_norm": 0.03189977630972862, "kl": 0.7360877133905888, "learning_rate": 9.99993868106177e-06, "loss": -0.0099, "step": 400, "step_time": 4.259436173997528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 388.0, "completions/mean_terminated_length": 388.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.7978197038173676, "epoch": 0.00401, "frac_reward_zero_std": 0.25, "grad_norm": 0.030903248116374016, "kl": 0.6359038352966309, "learning_rate": 9.999938343682654e-06, "loss": -0.0044, "num_tokens": 9020955.0, "reward": 0.1474951058626175, "reward_std": 0.7032032012939453, "rewards/rollout_reward_func/mean": 0.1474951058626175, "rewards/rollout_reward_func/std": 1.3246235847473145, "sampling/importance_sampling_ratio/max": 0.6890487670898438, "sampling/importance_sampling_ratio/mean": 0.19303777813911438, "sampling/importance_sampling_ratio/min": 4.266968517185887e-06, "sampling/sampling_logp_difference/max": 2.776979923248291, "sampling/sampling_logp_difference/mean": 1.0636305809020996, "step": 401, "step_time": 8.553937959000905 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 5.81035315990448, "epoch": 0.00402, "grad_norm": 0.024173224344849586, "kl": 0.6302591413259506, "learning_rate": 9.999938005377952e-06, "loss": -0.0046, "step": 402, "step_time": 4.8217280649914755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 306.9375, "completions/mean_terminated_length": 306.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.120020389556885, "epoch": 0.00403, "frac_reward_zero_std": 0.25, "grad_norm": 0.016700899228453636, "kl": 0.5775265023112297, "learning_rate": 9.999937666147667e-06, "loss": -0.005, "num_tokens": 9065748.0, "reward": 1.4704687595367432, "reward_std": 1.0301995277404785, "rewards/rollout_reward_func/mean": 1.4704687595367432, "rewards/rollout_reward_func/std": 1.3413044214248657, "sampling/importance_sampling_ratio/max": 0.5516108274459839, "sampling/importance_sampling_ratio/mean": 0.21890228986740112, "sampling/importance_sampling_ratio/min": 0.00023265132040251046, "sampling/sampling_logp_difference/max": 3.172787666320801, "sampling/sampling_logp_difference/mean": 0.8700809478759766, "step": 403, "step_time": 9.161412030000065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.138303488492966, "epoch": 0.00404, "grad_norm": 0.016461197286844254, "kl": 0.5785522442311049, "learning_rate": 9.999937325991797e-06, "loss": -0.005, "step": 404, "step_time": 5.387379667001369 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 1480.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 476.1875, "completions/mean_terminated_length": 476.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.736002057790756, "epoch": 0.00405, "frac_reward_zero_std": 0.0, "grad_norm": 0.04473930224776268, "kl": 0.49209342151880264, "learning_rate": 9.999936984910345e-06, "loss": -0.0143, "num_tokens": 9118380.0, "reward": 1.1781691312789917, "reward_std": 1.6197021007537842, "rewards/rollout_reward_func/mean": 1.1781691312789917, "rewards/rollout_reward_func/std": 1.6086628437042236, "sampling/importance_sampling_ratio/max": 0.43594956398010254, "sampling/importance_sampling_ratio/mean": 0.1381254643201828, "sampling/importance_sampling_ratio/min": 1.304460113260575e-07, "sampling/sampling_logp_difference/max": 3.8885374069213867, "sampling/sampling_logp_difference/mean": 1.0996683835983276, "step": 405, "step_time": 10.918850780999492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 5.761937916278839, "epoch": 0.00406, "grad_norm": 0.07197494059801102, "kl": 0.4894934482872486, "learning_rate": 9.999936642903308e-06, "loss": -0.0143, "step": 406, "step_time": 6.37683955600005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1602.0, "completions/max_terminated_length": 1602.0, "completions/mean_length": 538.28125, "completions/mean_terminated_length": 538.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.140889585018158, "epoch": 0.00407, "frac_reward_zero_std": 0.0, "grad_norm": 0.018120011314749718, "kl": 0.3582300841808319, "learning_rate": 9.999936299970686e-06, "loss": -0.0124, "num_tokens": 9172587.0, "reward": 1.0525150299072266, "reward_std": 1.719954252243042, "rewards/rollout_reward_func/mean": 1.0525150299072266, "rewards/rollout_reward_func/std": 1.9266690015792847, "sampling/importance_sampling_ratio/max": 0.3242656886577606, "sampling/importance_sampling_ratio/mean": 0.09736452251672745, "sampling/importance_sampling_ratio/min": 2.6414399556529133e-09, "sampling/sampling_logp_difference/max": 3.155035972595215, "sampling/sampling_logp_difference/mean": 1.100440502166748, "step": 407, "step_time": 11.393297947000974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.165465593338013, "epoch": 0.00408, "grad_norm": 0.018614530563354492, "kl": 0.3611386977136135, "learning_rate": 9.999935956112484e-06, "loss": -0.0124, "step": 408, "step_time": 6.198337475001608 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.03125, "completions/max_length": 1042.0, "completions/max_terminated_length": 1042.0, "completions/mean_length": 262.0625, "completions/mean_terminated_length": 270.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.799312204122543, "epoch": 0.00409, "frac_reward_zero_std": 0.25, "grad_norm": 0.026622770354151726, "kl": 0.6537652797996998, "learning_rate": 9.999935611328696e-06, "loss": -0.0013, "num_tokens": 9216154.0, "reward": 0.8215748071670532, "reward_std": 0.9857813119888306, "rewards/rollout_reward_func/mean": 0.8215748071670532, "rewards/rollout_reward_func/std": 1.571685552597046, "sampling/importance_sampling_ratio/max": 0.5597729086875916, "sampling/importance_sampling_ratio/mean": 0.2425946742296219, "sampling/importance_sampling_ratio/min": 1.0181219582683365e-11, "sampling/sampling_logp_difference/max": 4.249424934387207, "sampling/sampling_logp_difference/mean": 1.1613661050796509, "step": 409, "step_time": 8.923256500005664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.791296482086182, "epoch": 0.0041, "grad_norm": 0.026020459830760956, "kl": 0.6521888971328735, "learning_rate": 9.999935265619325e-06, "loss": -0.0014, "step": 410, "step_time": 5.5407495250001375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1525.0, "completions/max_terminated_length": 1525.0, "completions/mean_length": 407.21875, "completions/mean_terminated_length": 407.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.8735604882240295, "epoch": 0.00411, "frac_reward_zero_std": 0.0, "grad_norm": 0.019103432074189186, "kl": 0.4025208353996277, "learning_rate": 9.999934918984371e-06, "loss": -0.0103, "num_tokens": 9266847.0, "reward": 1.419196367263794, "reward_std": 1.6035230159759521, "rewards/rollout_reward_func/mean": 1.419196367263794, "rewards/rollout_reward_func/std": 1.7052334547042847, "sampling/importance_sampling_ratio/max": 0.5422931909561157, "sampling/importance_sampling_ratio/mean": 0.1543068289756775, "sampling/importance_sampling_ratio/min": 2.6059240099129966e-06, "sampling/sampling_logp_difference/max": 2.987858295440674, "sampling/sampling_logp_difference/mean": 1.0713709592819214, "step": 411, "step_time": 10.740566692995344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.876011908054352, "epoch": 0.00412, "grad_norm": 0.019153667613863945, "kl": 0.4026651866734028, "learning_rate": 9.999934571423834e-06, "loss": -0.0103, "step": 412, "step_time": 6.605243732999952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 267.71875, "completions/mean_terminated_length": 267.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.94400742650032, "epoch": 0.00413, "frac_reward_zero_std": 0.25, "grad_norm": 0.09996869415044785, "kl": 0.4589342698454857, "learning_rate": 9.999934222937713e-06, "loss": -0.0153, "num_tokens": 9310303.0, "reward": 1.2086492776870728, "reward_std": 1.2881426811218262, "rewards/rollout_reward_func/mean": 1.2086492776870728, "rewards/rollout_reward_func/std": 1.6273298263549805, "sampling/importance_sampling_ratio/max": 0.558160126209259, "sampling/importance_sampling_ratio/mean": 0.2643689513206482, "sampling/importance_sampling_ratio/min": 1.3513767953554634e-05, "sampling/sampling_logp_difference/max": 2.9924635887145996, "sampling/sampling_logp_difference/mean": 0.8280004262924194, "step": 413, "step_time": 8.726650262000476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 4.935866475105286, "epoch": 0.00414, "grad_norm": 0.04630538076162338, "kl": 0.45519769936800003, "learning_rate": 9.99993387352601e-06, "loss": -0.0157, "step": 414, "step_time": 4.888872259998607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1038.0, "completions/max_terminated_length": 1038.0, "completions/mean_length": 528.65625, "completions/mean_terminated_length": 522.2257690429688, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.759725034236908, "epoch": 0.00415, "frac_reward_zero_std": 0.0, "grad_norm": 0.023311449214816093, "kl": 0.3571118339896202, "learning_rate": 9.999933523188722e-06, "loss": -0.0112, "num_tokens": 9363478.0, "reward": 0.5657892227172852, "reward_std": 1.5159099102020264, "rewards/rollout_reward_func/mean": 0.5657892227172852, "rewards/rollout_reward_func/std": 1.8198846578598022, "sampling/importance_sampling_ratio/max": 0.3149279057979584, "sampling/importance_sampling_ratio/mean": 0.07700353115797043, "sampling/importance_sampling_ratio/min": 4.587389093706218e-16, "sampling/sampling_logp_difference/max": 3.6050217151641846, "sampling/sampling_logp_difference/mean": 1.2798235416412354, "step": 415, "step_time": 10.32217384100295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.007612179731950164, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007612179731950164, "entropy": 6.754984080791473, "epoch": 0.00416, "grad_norm": 0.02207508310675621, "kl": 0.35791795514523983, "learning_rate": 9.999933171925851e-06, "loss": -0.0113, "step": 416, "step_time": 4.967721542005165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1572.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 713.5625, "completions/mean_terminated_length": 707.7333984375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.734476625919342, "epoch": 0.00417, "frac_reward_zero_std": 0.0, "grad_norm": 0.020384453237056732, "kl": 0.39023756980895996, "learning_rate": 9.999932819737398e-06, "loss": -0.0088, "num_tokens": 9423724.0, "reward": 1.010758876800537, "reward_std": 1.2940044403076172, "rewards/rollout_reward_func/mean": 1.010758876800537, "rewards/rollout_reward_func/std": 1.440181016921997, "sampling/importance_sampling_ratio/max": 0.31902939081192017, "sampling/importance_sampling_ratio/mean": 0.09120268374681473, "sampling/importance_sampling_ratio/min": 1.229129320025335e-12, "sampling/sampling_logp_difference/max": 12.669626235961914, "sampling/sampling_logp_difference/mean": 1.126428246498108, "step": 417, "step_time": 11.565247268994426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010044642956927419, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010044642956927419, "entropy": 5.712919652462006, "epoch": 0.00418, "grad_norm": 0.01606805808842182, "kl": 0.392491115257144, "learning_rate": 9.999932466623362e-06, "loss": -0.0089, "step": 418, "step_time": 6.765329018009652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1309.0, "completions/max_terminated_length": 1309.0, "completions/mean_length": 441.4375, "completions/mean_terminated_length": 441.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.26163387298584, "epoch": 0.00419, "frac_reward_zero_std": 0.0, "grad_norm": 0.08023633062839508, "kl": 0.401345893740654, "learning_rate": 9.999932112583745e-06, "loss": -0.0148, "num_tokens": 9473499.0, "reward": 0.42433854937553406, "reward_std": 1.3024184703826904, "rewards/rollout_reward_func/mean": 0.42433854937553406, "rewards/rollout_reward_func/std": 1.3187437057495117, "sampling/importance_sampling_ratio/max": 0.6233136057853699, "sampling/importance_sampling_ratio/mean": 0.15590211749076843, "sampling/importance_sampling_ratio/min": 3.457134667428363e-08, "sampling/sampling_logp_difference/max": 3.599639415740967, "sampling/sampling_logp_difference/mean": 1.1551927328109741, "step": 419, "step_time": 10.115457393996621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.272966504096985, "epoch": 0.0042, "grad_norm": 0.07719407230615616, "kl": 0.39284662157297134, "learning_rate": 9.999931757618544e-06, "loss": -0.0152, "step": 420, "step_time": 5.5886151069971675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1614.0, "completions/max_terminated_length": 1614.0, "completions/mean_length": 412.46875, "completions/mean_terminated_length": 412.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.714192867279053, "epoch": 0.00421, "frac_reward_zero_std": 0.0, "grad_norm": 0.037944089621305466, "kl": 0.3562403544783592, "learning_rate": 9.999931401727761e-06, "loss": -0.0168, "num_tokens": 9523231.0, "reward": 0.9326214790344238, "reward_std": 1.5905532836914062, "rewards/rollout_reward_func/mean": 0.9326214790344238, "rewards/rollout_reward_func/std": 1.5858045816421509, "sampling/importance_sampling_ratio/max": 0.5545491576194763, "sampling/importance_sampling_ratio/mean": 0.09569959342479706, "sampling/importance_sampling_ratio/min": 7.155965974825238e-20, "sampling/sampling_logp_difference/max": 9.595454216003418, "sampling/sampling_logp_difference/mean": 1.4951900243759155, "step": 421, "step_time": 11.802705673999299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.710549771785736, "epoch": 0.00422, "grad_norm": 0.03732854500412941, "kl": 0.3666410930454731, "learning_rate": 9.999931044911395e-06, "loss": -0.0168, "step": 422, "step_time": 6.150768473005883 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 280.78125, "completions/mean_terminated_length": 280.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.6493881046772, "epoch": 0.00423, "frac_reward_zero_std": 0.0, "grad_norm": 0.06903441250324249, "kl": 0.44315644539892673, "learning_rate": 9.999930687169449e-06, "loss": -0.0111, "num_tokens": 9567620.0, "reward": 0.86236572265625, "reward_std": 1.4366116523742676, "rewards/rollout_reward_func/mean": 0.86236572265625, "rewards/rollout_reward_func/std": 1.568967342376709, "sampling/importance_sampling_ratio/max": 0.5397995114326477, "sampling/importance_sampling_ratio/mean": 0.1520881950855255, "sampling/importance_sampling_ratio/min": 2.894454424404169e-13, "sampling/sampling_logp_difference/max": 4.49141263961792, "sampling/sampling_logp_difference/mean": 1.4397032260894775, "step": 423, "step_time": 8.105679962998693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.65572664141655, "epoch": 0.00424, "grad_norm": 0.07121329009532928, "kl": 0.4312774110585451, "learning_rate": 9.999930328501917e-06, "loss": -0.0111, "step": 424, "step_time": 4.869022030001361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1835.0, "completions/max_terminated_length": 1835.0, "completions/mean_length": 279.34375, "completions/mean_terminated_length": 279.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.199639707803726, "epoch": 0.00425, "frac_reward_zero_std": 0.25, "grad_norm": 0.07271145284175873, "kl": 0.6584330908954144, "learning_rate": 9.999929968908805e-06, "loss": -0.0118, "num_tokens": 9608705.0, "reward": 0.6209284663200378, "reward_std": 0.6849035024642944, "rewards/rollout_reward_func/mean": 0.6209284663200378, "rewards/rollout_reward_func/std": 1.5085989236831665, "sampling/importance_sampling_ratio/max": 0.6604400873184204, "sampling/importance_sampling_ratio/mean": 0.34561407566070557, "sampling/importance_sampling_ratio/min": 4.141673226598641e-09, "sampling/sampling_logp_difference/max": 3.5699470043182373, "sampling/sampling_logp_difference/mean": 0.8209027647972107, "step": 425, "step_time": 11.857556831000693 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 4.183584362268448, "epoch": 0.00426, "grad_norm": 0.01710578054189682, "kl": 0.6620018593966961, "learning_rate": 9.99992960839011e-06, "loss": -0.0121, "step": 426, "step_time": 6.6479626770014875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1663.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 395.46875, "completions/mean_terminated_length": 395.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.956536948680878, "epoch": 0.00427, "frac_reward_zero_std": 0.25, "grad_norm": 0.06258437782526016, "kl": 0.5683906115591526, "learning_rate": 9.999929246945834e-06, "loss": -0.0038, "num_tokens": 9655571.0, "reward": 0.9627177715301514, "reward_std": 1.0512380599975586, "rewards/rollout_reward_func/mean": 0.9627177715301514, "rewards/rollout_reward_func/std": 1.3123871088027954, "sampling/importance_sampling_ratio/max": 0.5497295260429382, "sampling/importance_sampling_ratio/mean": 0.2660088539123535, "sampling/importance_sampling_ratio/min": 2.3796369532647077e-06, "sampling/sampling_logp_difference/max": 2.9198546409606934, "sampling/sampling_logp_difference/mean": 0.9169349670410156, "step": 427, "step_time": 11.573856096008967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.938460022211075, "epoch": 0.00428, "grad_norm": 0.05814637243747711, "kl": 0.5719614699482918, "learning_rate": 9.999928884575976e-06, "loss": -0.0038, "step": 428, "step_time": 6.23654261300544 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 117.25, "completions/mean_terminated_length": 117.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.9894153475761414, "epoch": 0.00429, "frac_reward_zero_std": 0.5, "grad_norm": 0.011769694276154041, "kl": 0.7750250473618507, "learning_rate": 9.999928521280536e-06, "loss": -0.0088, "num_tokens": 9691470.0, "reward": 1.1608076095581055, "reward_std": 0.7045012712478638, "rewards/rollout_reward_func/mean": 1.1608076095581055, "rewards/rollout_reward_func/std": 1.4966137409210205, "sampling/importance_sampling_ratio/max": 0.5555559992790222, "sampling/importance_sampling_ratio/mean": 0.38191282749176025, "sampling/importance_sampling_ratio/min": 2.237215968037276e-09, "sampling/sampling_logp_difference/max": 12.908434867858887, "sampling/sampling_logp_difference/mean": 0.8885732889175415, "step": 429, "step_time": 7.6001354759973765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.99026095867157, "epoch": 0.0043, "grad_norm": 0.14014151692390442, "kl": 0.7780681364238262, "learning_rate": 9.999928157059513e-06, "loss": -0.0083, "step": 430, "step_time": 3.9394190370003344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1327.0, "completions/max_terminated_length": 1327.0, "completions/mean_length": 330.625, "completions/mean_terminated_length": 330.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.48806095123291, "epoch": 0.00431, "frac_reward_zero_std": 0.25, "grad_norm": 0.010925721377134323, "kl": 0.47164774499833584, "learning_rate": 9.99992779191291e-06, "loss": -0.0146, "num_tokens": 9735953.0, "reward": 0.6972054839134216, "reward_std": 1.1898692846298218, "rewards/rollout_reward_func/mean": 0.6972054839134216, "rewards/rollout_reward_func/std": 1.801132321357727, "sampling/importance_sampling_ratio/max": 0.5581368207931519, "sampling/importance_sampling_ratio/mean": 0.2567925453186035, "sampling/importance_sampling_ratio/min": 1.7750243941350163e-08, "sampling/sampling_logp_difference/max": 4.609691619873047, "sampling/sampling_logp_difference/mean": 1.1321215629577637, "step": 431, "step_time": 9.97086575400317 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 5.481993019580841, "epoch": 0.00432, "grad_norm": 0.010706587694585323, "kl": 0.4720798386260867, "learning_rate": 9.999927425840725e-06, "loss": -0.0146, "step": 432, "step_time": 6.129679984001996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 1527.0, "completions/max_terminated_length": 1527.0, "completions/mean_length": 664.9375, "completions/mean_terminated_length": 664.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.347112596035004, "epoch": 0.00433, "frac_reward_zero_std": 0.0, "grad_norm": 0.014270790852606297, "kl": 0.46598032861948013, "learning_rate": 9.999927058842958e-06, "loss": -0.0151, "num_tokens": 9791897.0, "reward": 0.7182175517082214, "reward_std": 1.3316922187805176, "rewards/rollout_reward_func/mean": 0.7182175517082214, "rewards/rollout_reward_func/std": 1.415018081665039, "sampling/importance_sampling_ratio/max": 0.5583627820014954, "sampling/importance_sampling_ratio/mean": 0.16692538559436798, "sampling/importance_sampling_ratio/min": 6.854437519285923e-15, "sampling/sampling_logp_difference/max": 4.550000190734863, "sampling/sampling_logp_difference/mean": 1.072846531867981, "step": 433, "step_time": 11.720339707993844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.341281950473785, "epoch": 0.00434, "grad_norm": 0.013910140842199326, "kl": 0.46357715129852295, "learning_rate": 9.999926690919612e-06, "loss": -0.0152, "step": 434, "step_time": 6.15407204299845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1444.0, "completions/max_terminated_length": 1444.0, "completions/mean_length": 238.1875, "completions/mean_terminated_length": 238.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.346038103103638, "epoch": 0.00435, "frac_reward_zero_std": 0.25, "grad_norm": 0.005989632569253445, "kl": 0.35281339660286903, "learning_rate": 9.999926322070682e-06, "loss": -0.0104, "num_tokens": 9833682.0, "reward": 0.9085401892662048, "reward_std": 1.2272958755493164, "rewards/rollout_reward_func/mean": 0.9085401892662048, "rewards/rollout_reward_func/std": 1.5224215984344482, "sampling/importance_sampling_ratio/max": 0.5558167695999146, "sampling/importance_sampling_ratio/mean": 0.21820981800556183, "sampling/importance_sampling_ratio/min": 3.695139304265996e-15, "sampling/sampling_logp_difference/max": 13.403695106506348, "sampling/sampling_logp_difference/mean": 1.5558828115463257, "step": 435, "step_time": 10.612834944000497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.347519487142563, "epoch": 0.00436, "grad_norm": 0.006038116291165352, "kl": 0.35293289087712765, "learning_rate": 9.999925952296172e-06, "loss": -0.0104, "step": 436, "step_time": 5.713359941000817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1820.0, "completions/max_terminated_length": 1820.0, "completions/mean_length": 709.9375, "completions/mean_terminated_length": 709.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.84841650724411, "epoch": 0.00437, "frac_reward_zero_std": 0.0, "grad_norm": 0.08360286802053452, "kl": 0.41237788647413254, "learning_rate": 9.99992558159608e-06, "loss": -0.0138, "num_tokens": 9893746.0, "reward": 0.26843181252479553, "reward_std": 1.0296015739440918, "rewards/rollout_reward_func/mean": 0.26843181252479553, "rewards/rollout_reward_func/std": 1.1607555150985718, "sampling/importance_sampling_ratio/max": 0.31000199913978577, "sampling/importance_sampling_ratio/mean": 0.10038264095783234, "sampling/importance_sampling_ratio/min": 1.871192511647326e-14, "sampling/sampling_logp_difference/max": 11.943882942199707, "sampling/sampling_logp_difference/mean": 1.1960196495056152, "step": 437, "step_time": 12.236663442992722 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 5.856674492359161, "epoch": 0.00438, "grad_norm": 0.02451838180422783, "kl": 0.3971049655228853, "learning_rate": 9.999925209970408e-06, "loss": -0.0143, "step": 438, "step_time": 7.118845341010456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1684.0, "completions/max_terminated_length": 1684.0, "completions/mean_length": 664.71875, "completions/mean_terminated_length": 675.4193115234375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.976594179868698, "epoch": 0.00439, "frac_reward_zero_std": 0.0, "grad_norm": 0.05601168051362038, "kl": 0.5310670668259263, "learning_rate": 9.999924837419155e-06, "loss": -0.0047, "num_tokens": 9951017.0, "reward": 1.7108237743377686, "reward_std": 1.501123309135437, "rewards/rollout_reward_func/mean": 1.7108237743377686, "rewards/rollout_reward_func/std": 1.4700242280960083, "sampling/importance_sampling_ratio/max": 0.5597662925720215, "sampling/importance_sampling_ratio/mean": 0.2022438943386078, "sampling/importance_sampling_ratio/min": 2.331571227680856e-30, "sampling/sampling_logp_difference/max": 10.182361602783203, "sampling/sampling_logp_difference/mean": 1.060805320739746, "step": 439, "step_time": 11.591118557003938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.978935539722443, "epoch": 0.0044, "grad_norm": 0.05983751267194748, "kl": 0.5287634134292603, "learning_rate": 9.999924463942322e-06, "loss": -0.0048, "step": 440, "step_time": 6.44396644200242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1554.0, "completions/max_terminated_length": 1554.0, "completions/mean_length": 676.8125, "completions/mean_terminated_length": 689.2333984375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.788927614688873, "epoch": 0.00441, "frac_reward_zero_std": 0.0, "grad_norm": 0.05320155993103981, "kl": 0.4771611522883177, "learning_rate": 9.999924089539907e-06, "loss": -0.0122, "num_tokens": 10010141.0, "reward": 0.48160189390182495, "reward_std": 1.6980388164520264, "rewards/rollout_reward_func/mean": 0.48160189390182495, "rewards/rollout_reward_func/std": 1.7056901454925537, "sampling/importance_sampling_ratio/max": 0.4922370910644531, "sampling/importance_sampling_ratio/mean": 0.09932322055101395, "sampling/importance_sampling_ratio/min": 1.4566634683622968e-22, "sampling/sampling_logp_difference/max": 12.738789558410645, "sampling/sampling_logp_difference/mean": 1.2804847955703735, "step": 441, "step_time": 11.760857180997846 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 5.803362727165222, "epoch": 0.00442, "grad_norm": 0.04815090447664261, "kl": 0.44139356911182404, "learning_rate": 9.999923714211912e-06, "loss": -0.0122, "step": 442, "step_time": 6.185113644994999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 88.5625, "completions/mean_terminated_length": 88.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.7753049433231354, "epoch": 0.00443, "frac_reward_zero_std": 0.5, "grad_norm": 0.04244082793593407, "kl": 0.6918343640863895, "learning_rate": 9.999923337958336e-06, "loss": -0.0103, "num_tokens": 10044174.0, "reward": 1.7325806617736816, "reward_std": 0.7132753133773804, "rewards/rollout_reward_func/mean": 1.7325806617736816, "rewards/rollout_reward_func/std": 1.0188854932785034, "sampling/importance_sampling_ratio/max": 0.5579346418380737, "sampling/importance_sampling_ratio/mean": 0.4174155592918396, "sampling/importance_sampling_ratio/min": 0.00039840134559199214, "sampling/sampling_logp_difference/max": 2.729783773422241, "sampling/sampling_logp_difference/mean": 0.5884220600128174, "step": 443, "step_time": 6.515765558004205 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.763574093580246, "epoch": 0.00444, "grad_norm": 0.04092271998524666, "kl": 0.6885731816291809, "learning_rate": 9.99992296077918e-06, "loss": -0.0104, "step": 444, "step_time": 4.078345084006287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1131.0, "completions/max_terminated_length": 1102.0, "completions/mean_length": 457.53125, "completions/mean_terminated_length": 435.8064270019531, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.063609600067139, "epoch": 0.00445, "frac_reward_zero_std": 0.0, "grad_norm": 0.04404602199792862, "kl": 0.2980833798646927, "learning_rate": 9.999922582674445e-06, "loss": -0.0134, "num_tokens": 10095869.0, "reward": 0.6259787082672119, "reward_std": 1.1653742790222168, "rewards/rollout_reward_func/mean": 0.6259787082672119, "rewards/rollout_reward_func/std": 1.4461935758590698, "sampling/importance_sampling_ratio/max": 0.5457441806793213, "sampling/importance_sampling_ratio/mean": 0.13152502477169037, "sampling/importance_sampling_ratio/min": 8.567022091732213e-12, "sampling/sampling_logp_difference/max": 12.284520149230957, "sampling/sampling_logp_difference/mean": 1.2972503900527954, "step": 445, "step_time": 9.50957278099304 }, { "clip_ratio/high_max": 0.028125000186264515, "clip_ratio/high_mean": 0.014062500093132257, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014062500093132257, "entropy": 6.06259423494339, "epoch": 0.00446, "grad_norm": 0.020866824313998222, "kl": 0.29284022748470306, "learning_rate": 9.999922203644126e-06, "loss": -0.0135, "step": 446, "step_time": 5.708364539008471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 101.65625, "completions/mean_terminated_length": 101.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.142716139554977, "epoch": 0.00447, "frac_reward_zero_std": 0.5, "grad_norm": 0.006573494523763657, "kl": 0.6846996173262596, "learning_rate": 9.99992182368823e-06, "loss": -0.0075, "num_tokens": 10131046.0, "reward": 1.019373893737793, "reward_std": 0.8458171486854553, "rewards/rollout_reward_func/mean": 1.019373893737793, "rewards/rollout_reward_func/std": 1.5093032121658325, "sampling/importance_sampling_ratio/max": 0.5567635297775269, "sampling/importance_sampling_ratio/mean": 0.4083460867404938, "sampling/importance_sampling_ratio/min": 4.803010233445093e-05, "sampling/sampling_logp_difference/max": 2.9099602699279785, "sampling/sampling_logp_difference/mean": 0.7147508263587952, "step": 447, "step_time": 7.967607736998616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.127072125673294, "epoch": 0.00448, "grad_norm": 0.006018102169036865, "kl": 0.686904177069664, "learning_rate": 9.999921442806754e-06, "loss": -0.0076, "step": 448, "step_time": 4.559707153995987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1661.0, "completions/max_terminated_length": 1661.0, "completions/mean_length": 653.375, "completions/mean_terminated_length": 653.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.065308213233948, "epoch": 0.00449, "frac_reward_zero_std": 0.0, "grad_norm": 0.0714055597782135, "kl": 0.2840062044560909, "learning_rate": 9.999921060999696e-06, "loss": -0.0144, "num_tokens": 10188982.0, "reward": 1.3861685991287231, "reward_std": 2.010119915008545, "rewards/rollout_reward_func/mean": 1.3861685991287231, "rewards/rollout_reward_func/std": 1.9576605558395386, "sampling/importance_sampling_ratio/max": 0.45086169242858887, "sampling/importance_sampling_ratio/mean": 0.09590598940849304, "sampling/importance_sampling_ratio/min": 1.1007595723810937e-08, "sampling/sampling_logp_difference/max": 4.3701581954956055, "sampling/sampling_logp_difference/mean": 1.1601300239562988, "step": 449, "step_time": 11.322972630998265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 6.0573011338710785, "epoch": 0.0045, "grad_norm": 0.06904001533985138, "kl": 0.2854825723916292, "learning_rate": 9.99992067826706e-06, "loss": -0.0147, "step": 450, "step_time": 6.856086746000074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 369.53125, "completions/mean_terminated_length": 366.2257995605469, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.357503682374954, "epoch": 0.00451, "frac_reward_zero_std": 0.0, "grad_norm": 0.07255925983190536, "kl": 0.39494318701326847, "learning_rate": 9.999920294608844e-06, "loss": -0.0212, "num_tokens": 10237423.0, "reward": 0.8553340435028076, "reward_std": 1.3916518688201904, "rewards/rollout_reward_func/mean": 0.8553340435028076, "rewards/rollout_reward_func/std": 1.5870027542114258, "sampling/importance_sampling_ratio/max": 0.4952279329299927, "sampling/importance_sampling_ratio/mean": 0.1286488175392151, "sampling/importance_sampling_ratio/min": 1.0399288201401458e-11, "sampling/sampling_logp_difference/max": 3.9853858947753906, "sampling/sampling_logp_difference/mean": 1.173343300819397, "step": 451, "step_time": 8.529109744002199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.3516638576984406, "epoch": 0.00452, "grad_norm": 0.07109247893095016, "kl": 0.40008511394262314, "learning_rate": 9.999919910025047e-06, "loss": -0.0212, "step": 452, "step_time": 5.193169930007571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1689.0, "completions/max_terminated_length": 1689.0, "completions/mean_length": 584.78125, "completions/mean_terminated_length": 584.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.126995295286179, "epoch": 0.00453, "frac_reward_zero_std": 0.0, "grad_norm": 0.07763219624757767, "kl": 0.31419203616678715, "learning_rate": 9.999919524515672e-06, "loss": -0.0207, "num_tokens": 10293406.0, "reward": 1.0883967876434326, "reward_std": 1.7324373722076416, "rewards/rollout_reward_func/mean": 1.0883967876434326, "rewards/rollout_reward_func/std": 1.6615298986434937, "sampling/importance_sampling_ratio/max": 0.3089352250099182, "sampling/importance_sampling_ratio/mean": 0.11222019046545029, "sampling/importance_sampling_ratio/min": 1.1186022419451547e-07, "sampling/sampling_logp_difference/max": 3.7846899032592773, "sampling/sampling_logp_difference/mean": 1.178532361984253, "step": 453, "step_time": 11.550554409001052 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 6.119528651237488, "epoch": 0.00454, "grad_norm": 0.05431535094976425, "kl": 0.3070220332592726, "learning_rate": 9.999919138080717e-06, "loss": -0.0208, "step": 454, "step_time": 6.3378964719995565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 781.0, "completions/max_terminated_length": 781.0, "completions/mean_length": 273.9375, "completions/mean_terminated_length": 268.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.113408327102661, "epoch": 0.00455, "frac_reward_zero_std": 0.0, "grad_norm": 0.06667012721300125, "kl": 0.32650371827185154, "learning_rate": 9.999918750720182e-06, "loss": -0.0107, "num_tokens": 10337399.0, "reward": 1.3157343864440918, "reward_std": 1.6813887357711792, "rewards/rollout_reward_func/mean": 1.3157343864440918, "rewards/rollout_reward_func/std": 1.7002211809158325, "sampling/importance_sampling_ratio/max": 0.558424174785614, "sampling/importance_sampling_ratio/mean": 0.15373452007770538, "sampling/importance_sampling_ratio/min": 1.0168213542185295e-23, "sampling/sampling_logp_difference/max": 7.891948699951172, "sampling/sampling_logp_difference/mean": 1.2783186435699463, "step": 455, "step_time": 8.402349135005352 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.01846590917557478, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028882576152682304, "entropy": 6.1295377016067505, "epoch": 0.00456, "grad_norm": 0.03700656816363335, "kl": 0.36279009841382504, "learning_rate": 9.99991836243407e-06, "loss": -0.0108, "step": 456, "step_time": 4.855722212996625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 151.96875, "completions/mean_terminated_length": 151.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.641539365053177, "epoch": 0.00457, "frac_reward_zero_std": 0.0, "grad_norm": 0.07372410595417023, "kl": 0.5899173580110073, "learning_rate": 9.999917973222375e-06, "loss": -0.0142, "num_tokens": 10376174.0, "reward": 0.8108024597167969, "reward_std": 1.0620660781860352, "rewards/rollout_reward_func/mean": 0.8108024597167969, "rewards/rollout_reward_func/std": 1.5296704769134521, "sampling/importance_sampling_ratio/max": 0.6037182807922363, "sampling/importance_sampling_ratio/mean": 0.3035627603530884, "sampling/importance_sampling_ratio/min": 3.1784075371632525e-10, "sampling/sampling_logp_difference/max": 4.512628078460693, "sampling/sampling_logp_difference/mean": 0.8740010261535645, "step": 457, "step_time": 6.708204745005787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.026041666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026041666977107525, "entropy": 4.683338284492493, "epoch": 0.00458, "grad_norm": 0.047262270003557205, "kl": 0.5800814926624298, "learning_rate": 9.999917583085104e-06, "loss": -0.0144, "step": 458, "step_time": 4.166858000011416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1211.0, "completions/max_terminated_length": 1211.0, "completions/mean_length": 378.78125, "completions/mean_terminated_length": 377.7419128417969, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.3796610832214355, "epoch": 0.00459, "frac_reward_zero_std": 0.25, "grad_norm": 0.135538712143898, "kl": 0.43306262604892254, "learning_rate": 9.999917192022251e-06, "loss": -0.0139, "num_tokens": 10424438.0, "reward": 0.6462821960449219, "reward_std": 0.9912929534912109, "rewards/rollout_reward_func/mean": 0.6462821960449219, "rewards/rollout_reward_func/std": 1.3663272857666016, "sampling/importance_sampling_ratio/max": 0.5604783892631531, "sampling/importance_sampling_ratio/mean": 0.221684530377388, "sampling/importance_sampling_ratio/min": 5.990129833127744e-12, "sampling/sampling_logp_difference/max": 3.894779682159424, "sampling/sampling_logp_difference/mean": 1.045230746269226, "step": 459, "step_time": 9.826673558989569 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031250000931322575, "entropy": 5.434610545635223, "epoch": 0.0046, "grad_norm": 0.019399452954530716, "kl": 0.4062286149710417, "learning_rate": 9.999916800033823e-06, "loss": -0.0142, "step": 460, "step_time": 5.3992978439928265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 366.09375, "completions/mean_terminated_length": 356.0689697265625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.680139183998108, "epoch": 0.00461, "frac_reward_zero_std": 0.25, "grad_norm": 0.06580350548028946, "kl": 0.3380232425406575, "learning_rate": 9.999916407119812e-06, "loss": -0.0124, "num_tokens": 10470576.0, "reward": 0.9321739673614502, "reward_std": 1.156758427619934, "rewards/rollout_reward_func/mean": 0.9321739673614502, "rewards/rollout_reward_func/std": 1.4725542068481445, "sampling/importance_sampling_ratio/max": 0.5597972273826599, "sampling/importance_sampling_ratio/mean": 0.21860218048095703, "sampling/importance_sampling_ratio/min": 7.291420394812169e-22, "sampling/sampling_logp_difference/max": 3.884229898452759, "sampling/sampling_logp_difference/mean": 1.1474050283432007, "step": 461, "step_time": 9.352781770990987 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 5.688766419887543, "epoch": 0.00462, "grad_norm": 0.029689563438296318, "kl": 0.3383090244606137, "learning_rate": 9.999916013280226e-06, "loss": -0.0126, "step": 462, "step_time": 5.415063651998935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1036.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 431.78125, "completions/mean_terminated_length": 431.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.299673169851303, "epoch": 0.00463, "frac_reward_zero_std": 0.25, "grad_norm": 0.1286477893590927, "kl": 0.5264384485781193, "learning_rate": 9.999915618515059e-06, "loss": -0.0049, "num_tokens": 10520273.0, "reward": 1.3141767978668213, "reward_std": 0.9098021984100342, "rewards/rollout_reward_func/mean": 1.3141767978668213, "rewards/rollout_reward_func/std": 1.2744005918502808, "sampling/importance_sampling_ratio/max": 0.5485273599624634, "sampling/importance_sampling_ratio/mean": 0.25441455841064453, "sampling/importance_sampling_ratio/min": 1.2584310482566252e-08, "sampling/sampling_logp_difference/max": 2.931121349334717, "sampling/sampling_logp_difference/mean": 0.7289259433746338, "step": 463, "step_time": 9.676339748999453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.347157925367355, "epoch": 0.00464, "grad_norm": 0.12937144935131073, "kl": 0.5200077183544636, "learning_rate": 9.999915222824314e-06, "loss": -0.0049, "step": 464, "step_time": 5.024439723005344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 144.71875, "completions/mean_terminated_length": 136.87095642089844, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.731504946947098, "epoch": 0.00465, "frac_reward_zero_std": 0.0, "grad_norm": 0.09789691120386124, "kl": 0.5231475867331028, "learning_rate": 9.999914826207992e-06, "loss": -0.0154, "num_tokens": 10559152.0, "reward": -0.17726245522499084, "reward_std": 1.1656975746154785, "rewards/rollout_reward_func/mean": -0.17726245522499084, "rewards/rollout_reward_func/std": 1.3988040685653687, "sampling/importance_sampling_ratio/max": 0.5958135724067688, "sampling/importance_sampling_ratio/mean": 0.2557991147041321, "sampling/importance_sampling_ratio/min": 4.4129214984423015e-06, "sampling/sampling_logp_difference/max": 4.761212348937988, "sampling/sampling_logp_difference/mean": 0.7861743569374084, "step": 465, "step_time": 7.1587190110076335 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.03750000009313226, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04791666707023978, "entropy": 4.885305881500244, "epoch": 0.00466, "grad_norm": 0.0885859876871109, "kl": 0.4927307330071926, "learning_rate": 9.99991442866609e-06, "loss": -0.0159, "step": 466, "step_time": 3.9010737920070824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 258.1875, "completions/mean_terminated_length": 258.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.134659618139267, "epoch": 0.00467, "frac_reward_zero_std": 0.0, "grad_norm": 0.09896962344646454, "kl": 0.4232446402311325, "learning_rate": 9.999914030198609e-06, "loss": -0.005, "num_tokens": 10602064.0, "reward": 0.9962072372436523, "reward_std": 1.3886470794677734, "rewards/rollout_reward_func/mean": 0.9962072372436523, "rewards/rollout_reward_func/std": 1.4805740118026733, "sampling/importance_sampling_ratio/max": 0.5561938881874084, "sampling/importance_sampling_ratio/mean": 0.23504450917243958, "sampling/importance_sampling_ratio/min": 1.095395759875828e-06, "sampling/sampling_logp_difference/max": 4.5745720863342285, "sampling/sampling_logp_difference/mean": 0.9571558833122253, "step": 467, "step_time": 9.067476765998435 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.033854166977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03906250046566129, "entropy": 5.241246074438095, "epoch": 0.00468, "grad_norm": 0.06928636878728867, "kl": 0.4010443650186062, "learning_rate": 9.999913630805554e-06, "loss": -0.0054, "step": 468, "step_time": 4.740601591995073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1255.0, "completions/max_terminated_length": 1255.0, "completions/mean_length": 132.0, "completions/mean_terminated_length": 132.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.574756890535355, "epoch": 0.00469, "frac_reward_zero_std": 0.0, "grad_norm": 0.060799721628427505, "kl": 0.47700563073158264, "learning_rate": 9.999913230486916e-06, "loss": -0.0021, "num_tokens": 10639878.0, "reward": -0.032495930790901184, "reward_std": 0.6265119314193726, "rewards/rollout_reward_func/mean": -0.032495930790901184, "rewards/rollout_reward_func/std": 1.290546178817749, "sampling/importance_sampling_ratio/max": 0.5594556927680969, "sampling/importance_sampling_ratio/mean": 0.30568766593933105, "sampling/importance_sampling_ratio/min": 2.502421736494398e-08, "sampling/sampling_logp_difference/max": 4.5017781257629395, "sampling/sampling_logp_difference/mean": 1.1543467044830322, "step": 469, "step_time": 9.764615777996369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 5.622347950935364, "epoch": 0.0047, "grad_norm": 0.059030793607234955, "kl": 0.4658178500831127, "learning_rate": 9.999912829242704e-06, "loss": -0.0022, "step": 470, "step_time": 5.384453365008085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1535.0, "completions/max_terminated_length": 1535.0, "completions/mean_length": 292.40625, "completions/mean_terminated_length": 293.6128845214844, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.752136915922165, "epoch": 0.00471, "frac_reward_zero_std": 0.25, "grad_norm": 0.041409265249967575, "kl": 0.4961445666849613, "learning_rate": 9.999912427072911e-06, "loss": -0.0102, "num_tokens": 10683192.0, "reward": 1.1945858001708984, "reward_std": 0.9169143438339233, "rewards/rollout_reward_func/mean": 1.1945858001708984, "rewards/rollout_reward_func/std": 1.1519593000411987, "sampling/importance_sampling_ratio/max": 0.5591006278991699, "sampling/importance_sampling_ratio/mean": 0.30268508195877075, "sampling/importance_sampling_ratio/min": 1.6140519859374073e-16, "sampling/sampling_logp_difference/max": 12.125242233276367, "sampling/sampling_logp_difference/mean": 1.0053260326385498, "step": 471, "step_time": 10.875091881996923 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625000465661287, "entropy": 4.7788888812065125, "epoch": 0.00472, "grad_norm": 0.0456790067255497, "kl": 0.4969482906162739, "learning_rate": 9.999912023977543e-06, "loss": -0.0102, "step": 472, "step_time": 5.974942226996063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1517.0, "completions/max_terminated_length": 1517.0, "completions/mean_length": 598.9375, "completions/mean_terminated_length": 598.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.023067951202393, "epoch": 0.00473, "frac_reward_zero_std": 0.0, "grad_norm": 0.04734015092253685, "kl": 0.32362375035881996, "learning_rate": 9.999911619956595e-06, "loss": -0.0144, "num_tokens": 10739584.0, "reward": 1.496825933456421, "reward_std": 1.5663262605667114, "rewards/rollout_reward_func/mean": 1.496825933456421, "rewards/rollout_reward_func/std": 1.5563936233520508, "sampling/importance_sampling_ratio/max": 0.29625800251960754, "sampling/importance_sampling_ratio/mean": 0.1026591807603836, "sampling/importance_sampling_ratio/min": 5.4643635394313606e-08, "sampling/sampling_logp_difference/max": 2.7126963138580322, "sampling/sampling_logp_difference/mean": 1.1417382955551147, "step": 473, "step_time": 11.418121699993208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.037487089633942, "epoch": 0.00474, "grad_norm": 0.04741457849740982, "kl": 0.3303474299609661, "learning_rate": 9.999911215010072e-06, "loss": -0.0144, "step": 474, "step_time": 6.071564123998542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1524.0, "completions/max_terminated_length": 1524.0, "completions/mean_length": 418.0, "completions/mean_terminated_length": 418.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.752989828586578, "epoch": 0.00475, "frac_reward_zero_std": 0.0, "grad_norm": 0.0438365638256073, "kl": 0.35597626818343997, "learning_rate": 9.99991080913797e-06, "loss": -0.0148, "num_tokens": 10790320.0, "reward": 0.8688272833824158, "reward_std": 1.7239190340042114, "rewards/rollout_reward_func/mean": 0.8688272833824158, "rewards/rollout_reward_func/std": 1.7004854679107666, "sampling/importance_sampling_ratio/max": 0.5035021901130676, "sampling/importance_sampling_ratio/mean": 0.10160921514034271, "sampling/importance_sampling_ratio/min": 2.8015012964618287e-22, "sampling/sampling_logp_difference/max": 12.281123161315918, "sampling/sampling_logp_difference/mean": 1.5129725933074951, "step": 475, "step_time": 10.940496402999997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0024999999441206455, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024999999441206455, "entropy": 6.732030212879181, "epoch": 0.00476, "grad_norm": 0.04257773980498314, "kl": 0.35589541820809245, "learning_rate": 9.999910402340289e-06, "loss": -0.0148, "step": 476, "step_time": 5.925510688997747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1026.0, "completions/max_terminated_length": 1026.0, "completions/mean_length": 445.15625, "completions/mean_terminated_length": 445.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.793590277433395, "epoch": 0.00477, "frac_reward_zero_std": 0.0, "grad_norm": 0.03710019588470459, "kl": 0.4395819902420044, "learning_rate": 9.999909994617032e-06, "loss": -0.0214, "num_tokens": 10840881.0, "reward": 1.1745967864990234, "reward_std": 1.855344533920288, "rewards/rollout_reward_func/mean": 1.1745967864990234, "rewards/rollout_reward_func/std": 1.8272613286972046, "sampling/importance_sampling_ratio/max": 0.33211591839790344, "sampling/importance_sampling_ratio/mean": 0.1212308406829834, "sampling/importance_sampling_ratio/min": 3.998496256318873e-10, "sampling/sampling_logp_difference/max": 4.327573299407959, "sampling/sampling_logp_difference/mean": 1.1751818656921387, "step": 477, "step_time": 9.08175059599671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.765201985836029, "epoch": 0.00478, "grad_norm": 0.0351886972784996, "kl": 0.43998008221387863, "learning_rate": 9.9999095859682e-06, "loss": -0.0214, "step": 478, "step_time": 4.995825575002527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1762.0, "completions/max_terminated_length": 1762.0, "completions/mean_length": 526.96875, "completions/mean_terminated_length": 526.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.9707552790641785, "epoch": 0.00479, "frac_reward_zero_std": 0.0, "grad_norm": 0.047413285821676254, "kl": 0.35562717355787754, "learning_rate": 9.99990917639379e-06, "loss": -0.0209, "num_tokens": 10894576.0, "reward": 1.2330212593078613, "reward_std": 1.7297337055206299, "rewards/rollout_reward_func/mean": 1.2330212593078613, "rewards/rollout_reward_func/std": 1.8572264909744263, "sampling/importance_sampling_ratio/max": 0.5346525311470032, "sampling/importance_sampling_ratio/mean": 0.12663234770298004, "sampling/importance_sampling_ratio/min": 3.150470329060795e-09, "sampling/sampling_logp_difference/max": 4.138359069824219, "sampling/sampling_logp_difference/mean": 1.2018709182739258, "step": 479, "step_time": 11.759912339002767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.9343313574790955, "epoch": 0.0048, "grad_norm": 0.047861821949481964, "kl": 0.3546925336122513, "learning_rate": 9.999908765893802e-06, "loss": -0.0211, "step": 480, "step_time": 6.349591218000569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 224.78125, "completions/mean_terminated_length": 209.86668395996094, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.0515642166137695, "epoch": 0.00481, "frac_reward_zero_std": 0.25, "grad_norm": 0.013845378533005714, "kl": 0.4134952984750271, "learning_rate": 9.999908354468237e-06, "loss": -0.0158, "num_tokens": 10933657.0, "reward": 0.8232392072677612, "reward_std": 1.1298654079437256, "rewards/rollout_reward_func/mean": 0.8232392072677612, "rewards/rollout_reward_func/std": 1.5773009061813354, "sampling/importance_sampling_ratio/max": 0.5533003807067871, "sampling/importance_sampling_ratio/mean": 0.2905173897743225, "sampling/importance_sampling_ratio/min": 6.427831281225426e-16, "sampling/sampling_logp_difference/max": 12.973699569702148, "sampling/sampling_logp_difference/mean": 1.247070074081421, "step": 481, "step_time": 9.10406485999556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "entropy": 5.032102942466736, "epoch": 0.00482, "grad_norm": 0.013687053695321083, "kl": 0.4128969181329012, "learning_rate": 9.999907942117095e-06, "loss": -0.0158, "step": 482, "step_time": 4.675323833998846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "completions/clipped_ratio": 0.0, "completions/max_length": 895.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 291.125, "completions/mean_terminated_length": 291.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.6999199986457825, "epoch": 0.00483, "frac_reward_zero_std": 0.25, "grad_norm": 0.07878007739782333, "kl": 0.376712828874588, "learning_rate": 9.999907528840379e-06, "loss": -0.0054, "num_tokens": 10976706.0, "reward": 0.010065913200378418, "reward_std": 1.118899941444397, "rewards/rollout_reward_func/mean": 0.010065913200378418, "rewards/rollout_reward_func/std": 1.746634840965271, "sampling/importance_sampling_ratio/max": 0.5556411743164062, "sampling/importance_sampling_ratio/mean": 0.1880187690258026, "sampling/importance_sampling_ratio/min": 1.8188956471629908e-13, "sampling/sampling_logp_difference/max": 9.745512008666992, "sampling/sampling_logp_difference/mean": 1.214148759841919, "step": 483, "step_time": 8.7375833469996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0036764706019312143, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036764706019312143, "entropy": 5.706200361251831, "epoch": 0.00484, "grad_norm": 0.07793043553829193, "kl": 0.37163357250392437, "learning_rate": 9.999907114638084e-06, "loss": -0.0056, "step": 484, "step_time": 5.134719288002088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 418.71875, "completions/mean_terminated_length": 418.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.913100451231003, "epoch": 0.00485, "frac_reward_zero_std": 0.0, "grad_norm": 0.024624589830636978, "kl": 0.4135253205895424, "learning_rate": 9.999906699510213e-06, "loss": -0.0055, "num_tokens": 11027637.0, "reward": 0.6997238397598267, "reward_std": 1.4746203422546387, "rewards/rollout_reward_func/mean": 0.6997238397598267, "rewards/rollout_reward_func/std": 1.476837158203125, "sampling/importance_sampling_ratio/max": 0.5426992774009705, "sampling/importance_sampling_ratio/mean": 0.14925137162208557, "sampling/importance_sampling_ratio/min": 2.685778781597037e-05, "sampling/sampling_logp_difference/max": 2.5090489387512207, "sampling/sampling_logp_difference/mean": 1.0418404340744019, "step": 485, "step_time": 8.364776371003245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 5.894925832748413, "epoch": 0.00486, "grad_norm": 0.024266429245471954, "kl": 0.4132997840642929, "learning_rate": 9.999906283456766e-06, "loss": -0.0055, "step": 486, "step_time": 4.559740900000179 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 1462.0, "completions/max_terminated_length": 1462.0, "completions/mean_length": 517.40625, "completions/mean_terminated_length": 517.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.5973511934280396, "epoch": 0.00487, "frac_reward_zero_std": 0.0, "grad_norm": 0.07270132750272751, "kl": 0.3127886261790991, "learning_rate": 9.999905866477743e-06, "loss": -0.0096, "num_tokens": 11079366.0, "reward": 0.6842202544212341, "reward_std": 1.648805856704712, "rewards/rollout_reward_func/mean": 0.6842202544212341, "rewards/rollout_reward_func/std": 1.6396468877792358, "sampling/importance_sampling_ratio/max": 0.5484845042228699, "sampling/importance_sampling_ratio/mean": 0.09674228727817535, "sampling/importance_sampling_ratio/min": 5.6529213683089927e-14, "sampling/sampling_logp_difference/max": 9.459551811218262, "sampling/sampling_logp_difference/mean": 1.4115467071533203, "step": 487, "step_time": 11.565686953996192 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018229166977107525, "entropy": 6.61372709274292, "epoch": 0.00488, "grad_norm": 0.14041651785373688, "kl": 0.3069666912779212, "learning_rate": 9.999905448573144e-06, "loss": -0.0095, "step": 488, "step_time": 6.002298683000845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1703.0, "completions/max_terminated_length": 1703.0, "completions/mean_length": 501.84375, "completions/mean_terminated_length": 501.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.9533089101314545, "epoch": 0.00489, "frac_reward_zero_std": 0.0, "grad_norm": 0.030805138871073723, "kl": 0.3733921591192484, "learning_rate": 9.999905029742968e-06, "loss": -0.0121, "num_tokens": 11129558.0, "reward": 0.7639365792274475, "reward_std": 1.3950855731964111, "rewards/rollout_reward_func/mean": 0.7639365792274475, "rewards/rollout_reward_func/std": 1.662253499031067, "sampling/importance_sampling_ratio/max": 0.5545467734336853, "sampling/importance_sampling_ratio/mean": 0.1850632131099701, "sampling/importance_sampling_ratio/min": 8.618826138895308e-13, "sampling/sampling_logp_difference/max": 8.850357055664062, "sampling/sampling_logp_difference/mean": 1.189075231552124, "step": 489, "step_time": 11.31431251399772 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 5.92033463716507, "epoch": 0.0049, "grad_norm": 0.018758855760097504, "kl": 0.37603417225182056, "learning_rate": 9.99990460998722e-06, "loss": -0.0122, "step": 490, "step_time": 6.750503540999489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 199.5, "completions/mean_terminated_length": 199.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.92076101899147, "epoch": 0.00491, "frac_reward_zero_std": 0.0, "grad_norm": 0.1815633624792099, "kl": 0.6393758989870548, "learning_rate": 9.999904189305892e-06, "loss": -0.0175, "num_tokens": 11168683.0, "reward": 1.316900610923767, "reward_std": 1.45253586769104, "rewards/rollout_reward_func/mean": 1.316900610923767, "rewards/rollout_reward_func/std": 1.535162329673767, "sampling/importance_sampling_ratio/max": 0.5599895119667053, "sampling/importance_sampling_ratio/mean": 0.27486005425453186, "sampling/importance_sampling_ratio/min": 5.011134183511065e-18, "sampling/sampling_logp_difference/max": 10.42982292175293, "sampling/sampling_logp_difference/mean": 1.0862815380096436, "step": 491, "step_time": 7.830512549997366 }, { "clip_ratio/high_max": 0.0729166679084301, "clip_ratio/high_mean": 0.03645833395421505, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875000931322575, "entropy": 4.824669808149338, "epoch": 0.00492, "grad_norm": 0.04252741113305092, "kl": 0.7215222418308258, "learning_rate": 9.999903767698988e-06, "loss": -0.0178, "step": 492, "step_time": 4.811460298999009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 952.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 279.46875, "completions/mean_terminated_length": 279.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.060103416442871, "epoch": 0.00493, "frac_reward_zero_std": 0.0, "grad_norm": 0.06778580695390701, "kl": 0.3177030077204108, "learning_rate": 9.999903345166511e-06, "loss": -0.0158, "num_tokens": 11212990.0, "reward": 0.659441351890564, "reward_std": 1.4112093448638916, "rewards/rollout_reward_func/mean": 0.659441351890564, "rewards/rollout_reward_func/std": 1.402155876159668, "sampling/importance_sampling_ratio/max": 0.5544271469116211, "sampling/importance_sampling_ratio/mean": 0.18244582414627075, "sampling/importance_sampling_ratio/min": 6.512676009151619e-06, "sampling/sampling_logp_difference/max": 4.653391361236572, "sampling/sampling_logp_difference/mean": 1.1899173259735107, "step": 493, "step_time": 8.655934911996155 }, { "clip_ratio/high_max": 0.026041666977107525, "clip_ratio/high_mean": 0.013020833488553762, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013020833488553762, "entropy": 6.015966534614563, "epoch": 0.00494, "grad_norm": 0.0419531986117363, "kl": 0.32825828716158867, "learning_rate": 9.999902921708457e-06, "loss": -0.0161, "step": 494, "step_time": 4.795817022004485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.03125, "completions/max_length": 1057.0, "completions/max_terminated_length": 1057.0, "completions/mean_length": 269.4375, "completions/mean_terminated_length": 263.7742004394531, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.314268112182617, "epoch": 0.00495, "frac_reward_zero_std": 0.25, "grad_norm": 0.02476826310157776, "kl": 0.4656895361840725, "learning_rate": 9.999902497324827e-06, "loss": -0.0105, "num_tokens": 11254270.0, "reward": 0.6073051691055298, "reward_std": 1.195123553276062, "rewards/rollout_reward_func/mean": 0.6073051691055298, "rewards/rollout_reward_func/std": 1.5569570064544678, "sampling/importance_sampling_ratio/max": 0.5560732483863831, "sampling/importance_sampling_ratio/mean": 0.24397027492523193, "sampling/importance_sampling_ratio/min": 8.264243789348466e-23, "sampling/sampling_logp_difference/max": 3.772409439086914, "sampling/sampling_logp_difference/mean": 1.0003118515014648, "step": 495, "step_time": 9.668731943995226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.303662061691284, "epoch": 0.00496, "grad_norm": 0.0317908450961113, "kl": 0.46424250677227974, "learning_rate": 9.999902072015623e-06, "loss": -0.0105, "step": 496, "step_time": 4.984079721998569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 974.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 322.21875, "completions/mean_terminated_length": 322.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.170296609401703, "epoch": 0.00497, "frac_reward_zero_std": 0.0, "grad_norm": 0.026953978464007378, "kl": 0.3826829120516777, "learning_rate": 9.999901645780843e-06, "loss": -0.0167, "num_tokens": 11300228.0, "reward": 0.7644609212875366, "reward_std": 1.849981665611267, "rewards/rollout_reward_func/mean": 0.7644609212875366, "rewards/rollout_reward_func/std": 1.8913110494613647, "sampling/importance_sampling_ratio/max": 0.5500313639640808, "sampling/importance_sampling_ratio/mean": 0.17149722576141357, "sampling/importance_sampling_ratio/min": 7.900941767502445e-08, "sampling/sampling_logp_difference/max": 2.4838852882385254, "sampling/sampling_logp_difference/mean": 1.187967300415039, "step": 497, "step_time": 8.942662747002032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 6.158687710762024, "epoch": 0.00498, "grad_norm": 0.027776118367910385, "kl": 0.3911930900067091, "learning_rate": 9.99990121862049e-06, "loss": -0.0167, "step": 498, "step_time": 5.1952234619966475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1156.0, "completions/max_terminated_length": 1156.0, "completions/mean_length": 297.71875, "completions/mean_terminated_length": 297.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.833783596754074, "epoch": 0.00499, "frac_reward_zero_std": 0.25, "grad_norm": 0.024808594956994057, "kl": 0.5631073638796806, "learning_rate": 9.99990079053456e-06, "loss": -0.0075, "num_tokens": 11345333.0, "reward": 0.9619324207305908, "reward_std": 0.6361227035522461, "rewards/rollout_reward_func/mean": 0.9619324207305908, "rewards/rollout_reward_func/std": 1.3557453155517578, "sampling/importance_sampling_ratio/max": 0.5607099533081055, "sampling/importance_sampling_ratio/mean": 0.22679336369037628, "sampling/importance_sampling_ratio/min": 1.4682386790809687e-06, "sampling/sampling_logp_difference/max": 4.3429412841796875, "sampling/sampling_logp_difference/mean": 0.8019988536834717, "step": 499, "step_time": 9.441332251997665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.833355367183685, "epoch": 0.005, "grad_norm": 0.02451508305966854, "kl": 0.5581651926040649, "learning_rate": 9.999900361523054e-06, "loss": -0.0075, "step": 500, "step_time": 5.1808828550092585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 378.75, "completions/mean_terminated_length": 377.4193420410156, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.473606884479523, "epoch": 0.00501, "frac_reward_zero_std": 0.25, "grad_norm": 0.05428498610854149, "kl": 0.43442607671022415, "learning_rate": 9.999899931585976e-06, "loss": -0.0155, "num_tokens": 11392160.0, "reward": 0.6870443224906921, "reward_std": 1.215329647064209, "rewards/rollout_reward_func/mean": 0.6870443224906921, "rewards/rollout_reward_func/std": 1.5752055644989014, "sampling/importance_sampling_ratio/max": 0.5576465725898743, "sampling/importance_sampling_ratio/mean": 0.22573426365852356, "sampling/importance_sampling_ratio/min": 1.1697412298223427e-18, "sampling/sampling_logp_difference/max": 4.4972028732299805, "sampling/sampling_logp_difference/mean": 1.0984174013137817, "step": 501, "step_time": 9.75258950100033 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 5.4757373332977295, "epoch": 0.00502, "grad_norm": 0.0218656025826931, "kl": 0.4317471645772457, "learning_rate": 9.999899500723323e-06, "loss": -0.0157, "step": 502, "step_time": 4.976322137001262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1576.0, "completions/max_terminated_length": 1576.0, "completions/mean_length": 432.3125, "completions/mean_terminated_length": 432.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.575545638799667, "epoch": 0.00503, "frac_reward_zero_std": 0.25, "grad_norm": 0.023280339315533638, "kl": 0.4503717441111803, "learning_rate": 9.999899068935093e-06, "loss": -0.0048, "num_tokens": 11440674.0, "reward": 1.1707532405853271, "reward_std": 1.216766595840454, "rewards/rollout_reward_func/mean": 1.1707532405853271, "rewards/rollout_reward_func/std": 1.4441280364990234, "sampling/importance_sampling_ratio/max": 0.5559715032577515, "sampling/importance_sampling_ratio/mean": 0.20094944536685944, "sampling/importance_sampling_ratio/min": 9.119288080228216e-08, "sampling/sampling_logp_difference/max": 4.9820332527160645, "sampling/sampling_logp_difference/mean": 0.9780789613723755, "step": 503, "step_time": 11.033767695997085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.574867248535156, "epoch": 0.00504, "grad_norm": 0.023801172152161598, "kl": 0.45106472074985504, "learning_rate": 9.99989863622129e-06, "loss": -0.0049, "step": 504, "step_time": 6.570971177996398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 298.9375, "completions/mean_terminated_length": 277.1612854003906, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.387293219566345, "epoch": 0.00505, "frac_reward_zero_std": 0.5, "grad_norm": 0.02510070987045765, "kl": 0.44887912180274725, "learning_rate": 9.999898202581914e-06, "loss": -0.0046, "num_tokens": 11484478.0, "reward": 0.9022461175918579, "reward_std": 0.6878317594528198, "rewards/rollout_reward_func/mean": 0.9022461175918579, "rewards/rollout_reward_func/std": 1.2698231935501099, "sampling/importance_sampling_ratio/max": 0.557392418384552, "sampling/importance_sampling_ratio/mean": 0.333138644695282, "sampling/importance_sampling_ratio/min": 1.0273470252286643e-07, "sampling/sampling_logp_difference/max": 4.3519086837768555, "sampling/sampling_logp_difference/mean": 0.7322454452514648, "step": 505, "step_time": 8.680519564008137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.407260954380035, "epoch": 0.00506, "grad_norm": 0.02591274119913578, "kl": 0.44537240359932184, "learning_rate": 9.999897768016961e-06, "loss": -0.0046, "step": 506, "step_time": 4.921475123002892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 136.96875, "completions/mean_terminated_length": 136.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.8674456179142, "epoch": 0.00507, "frac_reward_zero_std": 0.5, "grad_norm": 0.1441047489643097, "kl": 0.8987349607050419, "learning_rate": 9.999897332526437e-06, "loss": -0.0064, "num_tokens": 11520514.0, "reward": 1.6568529605865479, "reward_std": 0.8649104833602905, "rewards/rollout_reward_func/mean": 1.6568529605865479, "rewards/rollout_reward_func/std": 1.2352374792099, "sampling/importance_sampling_ratio/max": 0.5538491010665894, "sampling/importance_sampling_ratio/mean": 0.3162684142589569, "sampling/importance_sampling_ratio/min": 0.0003078084555454552, "sampling/sampling_logp_difference/max": 2.7978875637054443, "sampling/sampling_logp_difference/mean": 0.8630849123001099, "step": 507, "step_time": 7.434656434998033 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 4.937085956335068, "epoch": 0.00508, "grad_norm": 0.05614837259054184, "kl": 0.6874014958739281, "learning_rate": 9.999896896110337e-06, "loss": -0.0072, "step": 508, "step_time": 3.7908533830050146 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "completions/clipped_ratio": 0.03125, "completions/max_length": 1518.0, "completions/max_terminated_length": 1518.0, "completions/mean_length": 241.40625, "completions/mean_terminated_length": 248.6774139404297, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.183966040611267, "epoch": 0.00509, "frac_reward_zero_std": 0.0, "grad_norm": 0.06075812876224518, "kl": 0.5145781561732292, "learning_rate": 9.999896458768663e-06, "loss": -0.0114, "num_tokens": 11562205.0, "reward": 1.0641486644744873, "reward_std": 0.9224786162376404, "rewards/rollout_reward_func/mean": 1.0641486644744873, "rewards/rollout_reward_func/std": 1.6326723098754883, "sampling/importance_sampling_ratio/max": 0.5602924823760986, "sampling/importance_sampling_ratio/mean": 0.2748628854751587, "sampling/importance_sampling_ratio/min": 2.800230445470283e-14, "sampling/sampling_logp_difference/max": 4.84787654876709, "sampling/sampling_logp_difference/mean": 0.9880207777023315, "step": 509, "step_time": 10.450661846996809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 5.252003610134125, "epoch": 0.0051, "grad_norm": 0.07746058702468872, "kl": 0.5037425048649311, "learning_rate": 9.999896020501416e-06, "loss": -0.0116, "step": 510, "step_time": 6.383912289002183 }, { "clip_ratio/high_max": 0.03693181835114956, "clip_ratio/high_mean": 0.01846590917557478, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01846590917557478, "completions/clipped_ratio": 0.03125, "completions/max_length": 1493.0, "completions/max_terminated_length": 1493.0, "completions/mean_length": 314.28125, "completions/mean_terminated_length": 298.7419128417969, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.035257339477539, "epoch": 0.00511, "frac_reward_zero_std": 0.25, "grad_norm": 0.01586509495973587, "kl": 0.48513131961226463, "learning_rate": 9.999895581308597e-06, "loss": 0.0003, "num_tokens": 11605535.0, "reward": 0.9459637403488159, "reward_std": 0.8420171737670898, "rewards/rollout_reward_func/mean": 0.9459637403488159, "rewards/rollout_reward_func/std": 1.5661653280258179, "sampling/importance_sampling_ratio/max": 0.5567431449890137, "sampling/importance_sampling_ratio/mean": 0.17885175347328186, "sampling/importance_sampling_ratio/min": 1.4302452278069921e-10, "sampling/sampling_logp_difference/max": 4.040516376495361, "sampling/sampling_logp_difference/mean": 1.126773715019226, "step": 511, "step_time": 10.604471478996857 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 6.063455641269684, "epoch": 0.00512, "grad_norm": 0.016570931300520897, "kl": 0.48449917137622833, "learning_rate": 9.999895141190201e-06, "loss": 0.0003, "step": 512, "step_time": 5.80664846300715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1251.0, "completions/max_terminated_length": 1251.0, "completions/mean_length": 465.53125, "completions/mean_terminated_length": 465.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.066488564014435, "epoch": 0.00513, "frac_reward_zero_std": 0.0, "grad_norm": 0.019164547324180603, "kl": 0.3569229617714882, "learning_rate": 9.999894700146234e-06, "loss": -0.0129, "num_tokens": 11655518.0, "reward": 0.8792263269424438, "reward_std": 1.7105140686035156, "rewards/rollout_reward_func/mean": 0.8792263269424438, "rewards/rollout_reward_func/std": 1.7389239072799683, "sampling/importance_sampling_ratio/max": 0.5454436540603638, "sampling/importance_sampling_ratio/mean": 0.12571008503437042, "sampling/importance_sampling_ratio/min": 4.893173387500059e-15, "sampling/sampling_logp_difference/max": 11.245293617248535, "sampling/sampling_logp_difference/mean": 1.4795067310333252, "step": 513, "step_time": 10.740244418007933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0024038462433964014, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024038462433964014, "entropy": 7.074859440326691, "epoch": 0.00514, "grad_norm": 0.01973694935441017, "kl": 0.3575850687921047, "learning_rate": 9.999894258176692e-06, "loss": -0.0129, "step": 514, "step_time": 5.524796093010082 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.03125, "completions/max_length": 1153.0, "completions/max_terminated_length": 1153.0, "completions/mean_length": 484.75, "completions/mean_terminated_length": 470.1612854003906, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.759190857410431, "epoch": 0.00515, "frac_reward_zero_std": 0.0, "grad_norm": 0.07422410696744919, "kl": 0.30489340238273144, "learning_rate": 9.999893815281578e-06, "loss": -0.0129, "num_tokens": 11708115.0, "reward": 1.1167306900024414, "reward_std": 1.6305431127548218, "rewards/rollout_reward_func/mean": 1.1167306900024414, "rewards/rollout_reward_func/std": 1.5842094421386719, "sampling/importance_sampling_ratio/max": 0.5510468482971191, "sampling/importance_sampling_ratio/mean": 0.15665262937545776, "sampling/importance_sampling_ratio/min": 3.5675595810857885e-10, "sampling/sampling_logp_difference/max": 3.9936270713806152, "sampling/sampling_logp_difference/mean": 1.106630563735962, "step": 515, "step_time": 10.10078576199885 }, { "clip_ratio/high_max": 0.020833333488553762, "clip_ratio/high_mean": 0.010416666744276881, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666744276881, "entropy": 5.769267678260803, "epoch": 0.00516, "grad_norm": 0.0441148467361927, "kl": 0.31514061242341995, "learning_rate": 9.999893371460891e-06, "loss": -0.0132, "step": 516, "step_time": 5.248467985999014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009090909268707037, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009090909268707037, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 187.09375, "completions/mean_terminated_length": 187.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.02503314614296, "epoch": 0.00517, "frac_reward_zero_std": 0.25, "grad_norm": 0.09749259054660797, "kl": 0.6185448095202446, "learning_rate": 9.99989292671463e-06, "loss": -0.0105, "num_tokens": 11748881.0, "reward": 1.4114463329315186, "reward_std": 1.018436074256897, "rewards/rollout_reward_func/mean": 1.4114463329315186, "rewards/rollout_reward_func/std": 1.2221072912216187, "sampling/importance_sampling_ratio/max": 0.5659542679786682, "sampling/importance_sampling_ratio/mean": 0.2986334562301636, "sampling/importance_sampling_ratio/min": 1.0498233501884505e-23, "sampling/sampling_logp_difference/max": 12.861411094665527, "sampling/sampling_logp_difference/mean": 1.161632776260376, "step": 517, "step_time": 7.03835033499854 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 5.03291592001915, "epoch": 0.00518, "grad_norm": 0.04526164010167122, "kl": 0.6215247642248869, "learning_rate": 9.999892481042796e-06, "loss": -0.0106, "step": 518, "step_time": 4.29506869800025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.0, "completions/max_length": 1261.0, "completions/max_terminated_length": 1261.0, "completions/mean_length": 552.59375, "completions/mean_terminated_length": 552.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.739412784576416, "epoch": 0.00519, "frac_reward_zero_std": 0.0, "grad_norm": 0.19967208802700043, "kl": 0.38197395019233227, "learning_rate": 9.99989203444539e-06, "loss": -0.0161, "num_tokens": 11803232.0, "reward": 0.8351632952690125, "reward_std": 1.5609453916549683, "rewards/rollout_reward_func/mean": 0.8351632952690125, "rewards/rollout_reward_func/std": 1.542076826095581, "sampling/importance_sampling_ratio/max": 0.5545600056648254, "sampling/importance_sampling_ratio/mean": 0.10827475041151047, "sampling/importance_sampling_ratio/min": 3.6776661999139577e-19, "sampling/sampling_logp_difference/max": 11.94085693359375, "sampling/sampling_logp_difference/mean": 1.4254319667816162, "step": 519, "step_time": 9.950048042996059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 6.747407913208008, "epoch": 0.0052, "grad_norm": 0.06921187788248062, "kl": 0.33422453328967094, "learning_rate": 9.99989158692241e-06, "loss": -0.0166, "step": 520, "step_time": 5.521460291005496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1186.0, "completions/max_terminated_length": 1186.0, "completions/mean_length": 263.53125, "completions/mean_terminated_length": 263.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.548047244548798, "epoch": 0.00521, "frac_reward_zero_std": 0.0, "grad_norm": 0.08568692952394485, "kl": 0.5539887063205242, "learning_rate": 9.999891138473859e-06, "loss": -0.0186, "num_tokens": 11845546.0, "reward": 0.33621150255203247, "reward_std": 1.3150967359542847, "rewards/rollout_reward_func/mean": 0.33621150255203247, "rewards/rollout_reward_func/std": 1.591708779335022, "sampling/importance_sampling_ratio/max": 0.555237889289856, "sampling/importance_sampling_ratio/mean": 0.25283437967300415, "sampling/importance_sampling_ratio/min": 3.148154014291139e-15, "sampling/sampling_logp_difference/max": 9.336821556091309, "sampling/sampling_logp_difference/mean": 1.1515107154846191, "step": 521, "step_time": 9.746212252000987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.533107757568359, "epoch": 0.00522, "grad_norm": 0.08941102027893066, "kl": 0.5592469312250614, "learning_rate": 9.999890689099736e-06, "loss": -0.0188, "step": 522, "step_time": 5.1878320630057715 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 1214.0, "completions/max_terminated_length": 1214.0, "completions/mean_length": 309.34375, "completions/mean_terminated_length": 309.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.38015353679657, "epoch": 0.00523, "frac_reward_zero_std": 0.0, "grad_norm": 0.047022152692079544, "kl": 0.4176870919764042, "learning_rate": 9.999890238800038e-06, "loss": -0.0051, "num_tokens": 11891293.0, "reward": 0.474479079246521, "reward_std": 1.1762750148773193, "rewards/rollout_reward_func/mean": 0.474479079246521, "rewards/rollout_reward_func/std": 1.3282150030136108, "sampling/importance_sampling_ratio/max": 0.562349259853363, "sampling/importance_sampling_ratio/mean": 0.19054627418518066, "sampling/importance_sampling_ratio/min": 1.4751499116533715e-15, "sampling/sampling_logp_difference/max": 8.630292892456055, "sampling/sampling_logp_difference/mean": 1.3847410678863525, "step": 523, "step_time": 9.355135004003387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.3504303097724915, "epoch": 0.00524, "grad_norm": 0.041544802486896515, "kl": 0.4133421294391155, "learning_rate": 9.99988978757477e-06, "loss": -0.0051, "step": 524, "step_time": 5.706385124994995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.0, "completions/max_length": 1346.0, "completions/max_terminated_length": 1346.0, "completions/mean_length": 177.75, "completions/mean_terminated_length": 177.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.689943999052048, "epoch": 0.00525, "frac_reward_zero_std": 0.25, "grad_norm": 0.02872322127223015, "kl": 0.7694494463503361, "learning_rate": 9.99988933542393e-06, "loss": -0.0128, "num_tokens": 11928687.0, "reward": 1.2134002447128296, "reward_std": 1.2734096050262451, "rewards/rollout_reward_func/mean": 1.2134002447128296, "rewards/rollout_reward_func/std": 1.531915545463562, "sampling/importance_sampling_ratio/max": 0.560696542263031, "sampling/importance_sampling_ratio/mean": 0.34064042568206787, "sampling/importance_sampling_ratio/min": 1.8007327184932365e-08, "sampling/sampling_logp_difference/max": 2.8504250049591064, "sampling/sampling_logp_difference/mean": 0.9484803676605225, "step": 525, "step_time": 9.836106667000422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.674013167619705, "epoch": 0.00526, "grad_norm": 0.024345947429537773, "kl": 0.7665489912033081, "learning_rate": 9.999888882347517e-06, "loss": -0.0129, "step": 526, "step_time": 5.579629591997218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 1467.0, "completions/max_terminated_length": 1467.0, "completions/mean_length": 411.0, "completions/mean_terminated_length": 411.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.323470294475555, "epoch": 0.00527, "frac_reward_zero_std": 0.0, "grad_norm": 0.08172091841697693, "kl": 0.4655117504298687, "learning_rate": 9.999888428345532e-06, "loss": -0.0108, "num_tokens": 11978028.0, "reward": 1.510459065437317, "reward_std": 1.2957842350006104, "rewards/rollout_reward_func/mean": 1.510459065437317, "rewards/rollout_reward_func/std": 1.462587594985962, "sampling/importance_sampling_ratio/max": 0.555077075958252, "sampling/importance_sampling_ratio/mean": 0.1976388543844223, "sampling/importance_sampling_ratio/min": 1.6124240573844872e-05, "sampling/sampling_logp_difference/max": 3.49765944480896, "sampling/sampling_logp_difference/mean": 0.8914374709129333, "step": 527, "step_time": 11.043232878000708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 5.309996843338013, "epoch": 0.00528, "grad_norm": 0.0741109699010849, "kl": 0.47044637799263, "learning_rate": 9.999887973417974e-06, "loss": -0.0109, "step": 528, "step_time": 5.897459952000645 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 1313.0, "completions/max_terminated_length": 1313.0, "completions/mean_length": 528.0, "completions/mean_terminated_length": 528.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.818665504455566, "epoch": 0.00529, "frac_reward_zero_std": 0.0, "grad_norm": 0.05136013776063919, "kl": 0.3401583470404148, "learning_rate": 9.999887517564846e-06, "loss": -0.0122, "num_tokens": 12031712.0, "reward": 1.0794494152069092, "reward_std": 1.4761320352554321, "rewards/rollout_reward_func/mean": 1.0794494152069092, "rewards/rollout_reward_func/std": 1.4756500720977783, "sampling/importance_sampling_ratio/max": 0.3289669454097748, "sampling/importance_sampling_ratio/mean": 0.08849041163921356, "sampling/importance_sampling_ratio/min": 7.412755621771794e-06, "sampling/sampling_logp_difference/max": 2.848637580871582, "sampling/sampling_logp_difference/mean": 1.2919234037399292, "step": 529, "step_time": 9.889704178996908 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 6.820170521736145, "epoch": 0.0053, "grad_norm": 0.025089414790272713, "kl": 0.34011505357921124, "learning_rate": 9.999887060786147e-06, "loss": -0.0123, "step": 530, "step_time": 6.040526671993575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 136.40625, "completions/mean_terminated_length": 136.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.950818598270416, "epoch": 0.00531, "frac_reward_zero_std": 0.25, "grad_norm": 0.06519388407468796, "kl": 0.5357001684606075, "learning_rate": 9.999886603081875e-06, "loss": -0.0108, "num_tokens": 12070654.0, "reward": 0.9649208784103394, "reward_std": 1.2721278667449951, "rewards/rollout_reward_func/mean": 0.9649208784103394, "rewards/rollout_reward_func/std": 1.5914020538330078, "sampling/importance_sampling_ratio/max": 0.5587332844734192, "sampling/importance_sampling_ratio/mean": 0.3111618161201477, "sampling/importance_sampling_ratio/min": 5.95847102127095e-13, "sampling/sampling_logp_difference/max": 10.437742233276367, "sampling/sampling_logp_difference/mean": 1.012485146522522, "step": 531, "step_time": 7.349302371003432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.95848274230957, "epoch": 0.00532, "grad_norm": 0.07212476432323456, "kl": 0.5296542048454285, "learning_rate": 9.999886144452034e-06, "loss": -0.0109, "step": 532, "step_time": 4.536934644005669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 221.65625, "completions/mean_terminated_length": 221.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.027897775173187, "epoch": 0.00533, "frac_reward_zero_std": 0.25, "grad_norm": 0.08918745815753937, "kl": 0.5755210816860199, "learning_rate": 9.999885684896619e-06, "loss": -0.0159, "num_tokens": 12110971.0, "reward": 1.1696722507476807, "reward_std": 0.9111778736114502, "rewards/rollout_reward_func/mean": 1.1696722507476807, "rewards/rollout_reward_func/std": 1.4375903606414795, "sampling/importance_sampling_ratio/max": 0.5579591989517212, "sampling/importance_sampling_ratio/mean": 0.3500863313674927, "sampling/importance_sampling_ratio/min": 6.33982472209027e-06, "sampling/sampling_logp_difference/max": 2.7367687225341797, "sampling/sampling_logp_difference/mean": 0.7225215435028076, "step": 533, "step_time": 8.522890375996212 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018229166977107525, "entropy": 4.048595607280731, "epoch": 0.00534, "grad_norm": 0.08643896132707596, "kl": 0.5773259997367859, "learning_rate": 9.999885224415634e-06, "loss": -0.0164, "step": 534, "step_time": 4.732994218993554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1295.0, "completions/max_terminated_length": 1295.0, "completions/mean_length": 300.625, "completions/mean_terminated_length": 307.0000305175781, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.494761645793915, "epoch": 0.00535, "frac_reward_zero_std": 0.0, "grad_norm": 0.023846806958317757, "kl": 0.5848655477166176, "learning_rate": 9.999884763009078e-06, "loss": -0.008, "num_tokens": 12155135.0, "reward": 0.8964548110961914, "reward_std": 1.2862744331359863, "rewards/rollout_reward_func/mean": 0.8964548110961914, "rewards/rollout_reward_func/std": 1.5621706247329712, "sampling/importance_sampling_ratio/max": 0.5570156574249268, "sampling/importance_sampling_ratio/mean": 0.2721025347709656, "sampling/importance_sampling_ratio/min": 4.648880672780686e-12, "sampling/sampling_logp_difference/max": 3.9946560859680176, "sampling/sampling_logp_difference/mean": 1.062856912612915, "step": 535, "step_time": 9.923959164996631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.51589971780777, "epoch": 0.00536, "grad_norm": 0.02517838031053543, "kl": 0.5837592631578445, "learning_rate": 9.99988430067695e-06, "loss": -0.008, "step": 536, "step_time": 5.48169692099691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1099.0, "completions/max_terminated_length": 1099.0, "completions/mean_length": 440.125, "completions/mean_terminated_length": 440.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.531893014907837, "epoch": 0.00537, "frac_reward_zero_std": 0.25, "grad_norm": 0.0674588531255722, "kl": 0.483549565076828, "learning_rate": 9.999883837419253e-06, "loss": -0.0145, "num_tokens": 12204640.0, "reward": 1.5644454956054688, "reward_std": 1.2356622219085693, "rewards/rollout_reward_func/mean": 1.5644454956054688, "rewards/rollout_reward_func/std": 1.402438998222351, "sampling/importance_sampling_ratio/max": 0.5490744709968567, "sampling/importance_sampling_ratio/mean": 0.2180551290512085, "sampling/importance_sampling_ratio/min": 6.636041161365332e-18, "sampling/sampling_logp_difference/max": 4.540686130523682, "sampling/sampling_logp_difference/mean": 1.198866367340088, "step": 537, "step_time": 9.660956349001935 }, { "clip_ratio/high_max": 0.043750000186264515, "clip_ratio/high_mean": 0.021875000093132257, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02708333358168602, "entropy": 5.547809898853302, "epoch": 0.00538, "grad_norm": 0.039239078760147095, "kl": 0.48837591893970966, "learning_rate": 9.999883373235985e-06, "loss": -0.0145, "step": 538, "step_time": 5.695445823996124 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 1454.0, "completions/max_terminated_length": 1454.0, "completions/mean_length": 523.25, "completions/mean_terminated_length": 523.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.037812232971191, "epoch": 0.00539, "frac_reward_zero_std": 0.25, "grad_norm": 0.03441119194030762, "kl": 0.4101129425689578, "learning_rate": 9.999882908127145e-06, "loss": -0.0108, "num_tokens": 12256378.0, "reward": 1.2306755781173706, "reward_std": 1.2330962419509888, "rewards/rollout_reward_func/mean": 1.2306755781173706, "rewards/rollout_reward_func/std": 1.651137351989746, "sampling/importance_sampling_ratio/max": 0.558419406414032, "sampling/importance_sampling_ratio/mean": 0.18812423944473267, "sampling/importance_sampling_ratio/min": 4.545303688985314e-09, "sampling/sampling_logp_difference/max": 2.740030288696289, "sampling/sampling_logp_difference/mean": 1.2018777132034302, "step": 539, "step_time": 10.550716514997475 }, { "clip_ratio/high_max": 0.040178571827709675, "clip_ratio/high_mean": 0.020089285913854837, "clip_ratio/low_mean": 0.010156250093132257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.030245536006987095, "entropy": 6.037966787815094, "epoch": 0.0054, "grad_norm": 0.014095176011323929, "kl": 0.41213067434728146, "learning_rate": 9.999882442092736e-06, "loss": -0.0109, "step": 540, "step_time": 5.883133458002703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 115.84375, "completions/mean_terminated_length": 115.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.612487018108368, "epoch": 0.00541, "frac_reward_zero_std": 0.25, "grad_norm": 0.1028226986527443, "kl": 0.5154131352901459, "learning_rate": 9.999881975132757e-06, "loss": -0.0106, "num_tokens": 12292648.0, "reward": 0.890675961971283, "reward_std": 1.3092992305755615, "rewards/rollout_reward_func/mean": 0.890675961971283, "rewards/rollout_reward_func/std": 1.8249057531356812, "sampling/importance_sampling_ratio/max": 0.5610595345497131, "sampling/importance_sampling_ratio/mean": 0.2592204511165619, "sampling/importance_sampling_ratio/min": 2.7097710244561313e-06, "sampling/sampling_logp_difference/max": 4.316779136657715, "sampling/sampling_logp_difference/mean": 1.3150641918182373, "step": 541, "step_time": 7.416660892002255 }, { "clip_ratio/high_max": 0.0729166679084301, "clip_ratio/high_mean": 0.03645833395421505, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03645833395421505, "entropy": 5.521456569433212, "epoch": 0.00542, "grad_norm": 0.03705768659710884, "kl": 0.5314689502120018, "learning_rate": 9.999881507247207e-06, "loss": -0.0106, "step": 542, "step_time": 3.785364581999602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1454.0, "completions/max_terminated_length": 1454.0, "completions/mean_length": 475.375, "completions/mean_terminated_length": 475.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.275467395782471, "epoch": 0.00543, "frac_reward_zero_std": 0.0, "grad_norm": 0.013009553775191307, "kl": 0.36730433255434036, "learning_rate": 9.999881038436085e-06, "loss": -0.0112, "num_tokens": 12342152.0, "reward": 1.3601200580596924, "reward_std": 1.8566858768463135, "rewards/rollout_reward_func/mean": 1.3601200580596924, "rewards/rollout_reward_func/std": 1.9755078554153442, "sampling/importance_sampling_ratio/max": 0.551436185836792, "sampling/importance_sampling_ratio/mean": 0.16020739078521729, "sampling/importance_sampling_ratio/min": 2.626461027832594e-11, "sampling/sampling_logp_difference/max": 4.013437747955322, "sampling/sampling_logp_difference/mean": 1.228442907333374, "step": 543, "step_time": 10.598281112008408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 6.250630676746368, "epoch": 0.00544, "grad_norm": 0.012910689227283001, "kl": 0.36539071798324585, "learning_rate": 9.999880568699396e-06, "loss": -0.0112, "step": 544, "step_time": 6.144365380001545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 239.3125, "completions/mean_terminated_length": 236.8064422607422, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.528842717409134, "epoch": 0.00545, "frac_reward_zero_std": 0.0, "grad_norm": 0.04850868508219719, "kl": 0.4260190296918154, "learning_rate": 9.999880098037136e-06, "loss": -0.0253, "num_tokens": 12386504.0, "reward": 0.7041530609130859, "reward_std": 1.7591133117675781, "rewards/rollout_reward_func/mean": 0.7041530609130859, "rewards/rollout_reward_func/std": 1.8088918924331665, "sampling/importance_sampling_ratio/max": 0.5086262822151184, "sampling/importance_sampling_ratio/mean": 0.13910913467407227, "sampling/importance_sampling_ratio/min": 1.851380261456793e-17, "sampling/sampling_logp_difference/max": 4.160706520080566, "sampling/sampling_logp_difference/mean": 1.3444554805755615, "step": 545, "step_time": 8.338149131002865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.5173607766628265, "epoch": 0.00546, "grad_norm": 0.04581444337964058, "kl": 0.41853712871670723, "learning_rate": 9.999879626449306e-06, "loss": -0.0255, "step": 546, "step_time": 4.564878824992775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1747.0, "completions/max_terminated_length": 1747.0, "completions/mean_length": 549.40625, "completions/mean_terminated_length": 549.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.566330075263977, "epoch": 0.00547, "frac_reward_zero_std": 0.0, "grad_norm": 0.03583892434835434, "kl": 0.43627299927175045, "learning_rate": 9.999879153935907e-06, "loss": -0.0042, "num_tokens": 12441179.0, "reward": 0.7729940414428711, "reward_std": 1.6012511253356934, "rewards/rollout_reward_func/mean": 0.7729940414428711, "rewards/rollout_reward_func/std": 1.5449013710021973, "sampling/importance_sampling_ratio/max": 0.6001700162887573, "sampling/importance_sampling_ratio/mean": 0.11010003089904785, "sampling/importance_sampling_ratio/min": 5.16403868451587e-13, "sampling/sampling_logp_difference/max": 12.926736831665039, "sampling/sampling_logp_difference/mean": 1.4152710437774658, "step": 547, "step_time": 12.025553676001437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.554310649633408, "epoch": 0.00548, "grad_norm": 0.037647850811481476, "kl": 0.43927135691046715, "learning_rate": 9.99987868049694e-06, "loss": -0.0041, "step": 548, "step_time": 6.475153663996025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1409.0, "completions/max_terminated_length": 1409.0, "completions/mean_length": 703.875, "completions/mean_terminated_length": 695.290283203125, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "entropy": 5.994349956512451, "epoch": 0.00549, "frac_reward_zero_std": 0.0, "grad_norm": 0.04155699908733368, "kl": 0.4911614414304495, "learning_rate": 9.9998782061324e-06, "loss": -0.0168, "num_tokens": 12500635.0, "reward": 0.9947093725204468, "reward_std": 1.4010803699493408, "rewards/rollout_reward_func/mean": 0.9947093725204468, "rewards/rollout_reward_func/std": 1.5818467140197754, "sampling/importance_sampling_ratio/max": 0.3278559446334839, "sampling/importance_sampling_ratio/mean": 0.07749043405056, "sampling/importance_sampling_ratio/min": 1.532706322970867e-10, "sampling/sampling_logp_difference/max": 4.807460308074951, "sampling/sampling_logp_difference/mean": 1.1068646907806396, "step": 549, "step_time": 10.981946409003285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.980656802654266, "epoch": 0.0055, "grad_norm": 0.042649149894714355, "kl": 0.4888974176719785, "learning_rate": 9.999877730842293e-06, "loss": -0.0169, "step": 550, "step_time": 6.300059092998708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1350.0, "completions/max_terminated_length": 1350.0, "completions/mean_length": 387.40625, "completions/mean_terminated_length": 387.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.883252263069153, "epoch": 0.00551, "frac_reward_zero_std": 0.0, "grad_norm": 0.04306456446647644, "kl": 0.28885788563638926, "learning_rate": 9.999877254626616e-06, "loss": -0.0201, "num_tokens": 12549466.0, "reward": 0.34505006670951843, "reward_std": 1.701661467552185, "rewards/rollout_reward_func/mean": 0.34505006670951843, "rewards/rollout_reward_func/std": 1.6827504634857178, "sampling/importance_sampling_ratio/max": 0.534356951713562, "sampling/importance_sampling_ratio/mean": 0.09479525685310364, "sampling/importance_sampling_ratio/min": 1.685858619588157e-13, "sampling/sampling_logp_difference/max": 4.517723083496094, "sampling/sampling_logp_difference/mean": 1.3577089309692383, "step": 551, "step_time": 10.235994452006707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.857841730117798, "epoch": 0.00552, "grad_norm": 0.04200361296534538, "kl": 0.28530791215598583, "learning_rate": 9.99987677748537e-06, "loss": -0.0203, "step": 552, "step_time": 5.700782797001011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 114.0, "completions/mean_terminated_length": 114.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.870336204767227, "epoch": 0.00553, "frac_reward_zero_std": 0.5, "grad_norm": 0.06765387207269669, "kl": 0.7004420608282089, "learning_rate": 9.999876299418556e-06, "loss": -0.0065, "num_tokens": 12584071.0, "reward": 1.4297139644622803, "reward_std": 0.8134871125221252, "rewards/rollout_reward_func/mean": 1.4297139644622803, "rewards/rollout_reward_func/std": 1.2651240825653076, "sampling/importance_sampling_ratio/max": 0.5608552098274231, "sampling/importance_sampling_ratio/mean": 0.4119713306427002, "sampling/importance_sampling_ratio/min": 2.0681496607721783e-05, "sampling/sampling_logp_difference/max": 3.3575541973114014, "sampling/sampling_logp_difference/mean": 0.6459577083587646, "step": 553, "step_time": 6.975314605995663 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 3.876595765352249, "epoch": 0.00554, "grad_norm": 0.04245024174451828, "kl": 0.6947055719792843, "learning_rate": 9.999875820426172e-06, "loss": -0.0063, "step": 554, "step_time": 3.609132479989057 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 1327.0, "completions/max_terminated_length": 1327.0, "completions/mean_length": 444.78125, "completions/mean_terminated_length": 444.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.8078755140304565, "epoch": 0.00555, "frac_reward_zero_std": 0.0, "grad_norm": 0.08915738761425018, "kl": 0.5061967074871063, "learning_rate": 9.999875340508221e-06, "loss": -0.0128, "num_tokens": 12634199.0, "reward": 0.514289140701294, "reward_std": 1.3619499206542969, "rewards/rollout_reward_func/mean": 0.514289140701294, "rewards/rollout_reward_func/std": 1.8325207233428955, "sampling/importance_sampling_ratio/max": 0.5526893734931946, "sampling/importance_sampling_ratio/mean": 0.2030172199010849, "sampling/importance_sampling_ratio/min": 3.2731125498348295e-15, "sampling/sampling_logp_difference/max": 4.061750411987305, "sampling/sampling_logp_difference/mean": 0.8751991987228394, "step": 555, "step_time": 10.689498063991778 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021875000093132257, "entropy": 4.812107235193253, "epoch": 0.00556, "grad_norm": 0.05233035981655121, "kl": 0.47275351360440254, "learning_rate": 9.999874859664698e-06, "loss": -0.0132, "step": 556, "step_time": 5.569053969997185 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "completions/clipped_ratio": 0.0, "completions/max_length": 1302.0, "completions/max_terminated_length": 1302.0, "completions/mean_length": 249.5625, "completions/mean_terminated_length": 249.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.369798272848129, "epoch": 0.00557, "frac_reward_zero_std": 0.25, "grad_norm": 0.08616980910301208, "kl": 0.857265293598175, "learning_rate": 9.99987437789561e-06, "loss": -0.0063, "num_tokens": 12674621.0, "reward": 1.7965830564498901, "reward_std": 0.8662173748016357, "rewards/rollout_reward_func/mean": 1.7965830564498901, "rewards/rollout_reward_func/std": 1.127169132232666, "sampling/importance_sampling_ratio/max": 0.5505087971687317, "sampling/importance_sampling_ratio/mean": 0.3187395930290222, "sampling/importance_sampling_ratio/min": 5.370530561776832e-05, "sampling/sampling_logp_difference/max": 2.778005599975586, "sampling/sampling_logp_difference/mean": 0.7000705599784851, "step": 557, "step_time": 9.94734086599783 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 4.322062522172928, "epoch": 0.00558, "grad_norm": 0.07388712465763092, "kl": 0.8638340048491955, "learning_rate": 9.999873895200953e-06, "loss": -0.0064, "step": 558, "step_time": 5.785922068000218 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 1122.0, "completions/max_terminated_length": 1122.0, "completions/mean_length": 293.375, "completions/mean_terminated_length": 293.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.758132994174957, "epoch": 0.00559, "frac_reward_zero_std": 0.25, "grad_norm": 0.07069656252861023, "kl": 0.4117635153234005, "learning_rate": 9.999873411580727e-06, "loss": -0.0048, "num_tokens": 12718233.0, "reward": 1.1362475156784058, "reward_std": 1.1962668895721436, "rewards/rollout_reward_func/mean": 1.1362475156784058, "rewards/rollout_reward_func/std": 1.6802362203598022, "sampling/importance_sampling_ratio/max": 0.5606548190116882, "sampling/importance_sampling_ratio/mean": 0.2024528980255127, "sampling/importance_sampling_ratio/min": 3.381999249540968e-07, "sampling/sampling_logp_difference/max": 4.595333099365234, "sampling/sampling_logp_difference/mean": 1.1507668495178223, "step": 559, "step_time": 9.233399962002295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 5.729071795940399, "epoch": 0.0056, "grad_norm": 0.06704051792621613, "kl": 0.4169541262090206, "learning_rate": 9.999872927034932e-06, "loss": -0.0049, "step": 560, "step_time": 5.109603147000598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1355.0, "completions/max_terminated_length": 1355.0, "completions/mean_length": 451.6875, "completions/mean_terminated_length": 453.5483703613281, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.597581714391708, "epoch": 0.00561, "frac_reward_zero_std": 0.25, "grad_norm": 0.024896971881389618, "kl": 0.5832211412489414, "learning_rate": 9.99987244156357e-06, "loss": -0.0121, "num_tokens": 12766449.0, "reward": 1.1852015256881714, "reward_std": 1.0190374851226807, "rewards/rollout_reward_func/mean": 1.1852015256881714, "rewards/rollout_reward_func/std": 1.3076648712158203, "sampling/importance_sampling_ratio/max": 0.5574406981468201, "sampling/importance_sampling_ratio/mean": 0.218153715133667, "sampling/importance_sampling_ratio/min": 1.1363762708604952e-10, "sampling/sampling_logp_difference/max": 4.082164764404297, "sampling/sampling_logp_difference/mean": 0.8637101650238037, "step": 561, "step_time": 10.461518000996875 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 4.5962012112140656, "epoch": 0.00562, "grad_norm": 0.02446560002863407, "kl": 0.5790919326245785, "learning_rate": 9.999871955166642e-06, "loss": -0.0122, "step": 562, "step_time": 5.486589507003373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0625, "completions/max_length": 1409.0, "completions/max_terminated_length": 1409.0, "completions/mean_length": 424.5, "completions/mean_terminated_length": 413.5666809082031, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.522565603256226, "epoch": 0.00563, "frac_reward_zero_std": 0.25, "grad_norm": 0.09409693628549576, "kl": 0.4957460816949606, "learning_rate": 9.999871467844145e-06, "loss": -0.0062, "num_tokens": 12815211.0, "reward": 1.1114308834075928, "reward_std": 1.0217639207839966, "rewards/rollout_reward_func/mean": 1.1114308834075928, "rewards/rollout_reward_func/std": 1.5369840860366821, "sampling/importance_sampling_ratio/max": 0.5513497591018677, "sampling/importance_sampling_ratio/mean": 0.2100670486688614, "sampling/importance_sampling_ratio/min": 7.632052135457881e-14, "sampling/sampling_logp_difference/max": 3.7781500816345215, "sampling/sampling_logp_difference/mean": 1.1113755702972412, "step": 563, "step_time": 10.6149091629959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "entropy": 5.523138880729675, "epoch": 0.00564, "grad_norm": 0.10761240124702454, "kl": 0.4954770300537348, "learning_rate": 9.999870979596079e-06, "loss": -0.0066, "step": 564, "step_time": 6.32556061199648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.0, "completions/max_length": 1622.0, "completions/max_terminated_length": 1622.0, "completions/mean_length": 223.0625, "completions/mean_terminated_length": 223.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.891437113285065, "epoch": 0.00565, "frac_reward_zero_std": 0.5, "grad_norm": 0.014258498325943947, "kl": 0.5628706216812134, "learning_rate": 9.999870490422448e-06, "loss": -0.0067, "num_tokens": 12854457.0, "reward": 1.3951621055603027, "reward_std": 0.6340405344963074, "rewards/rollout_reward_func/mean": 1.3951621055603027, "rewards/rollout_reward_func/std": 1.1405067443847656, "sampling/importance_sampling_ratio/max": 0.559709906578064, "sampling/importance_sampling_ratio/mean": 0.30743497610092163, "sampling/importance_sampling_ratio/min": 8.028892550804301e-10, "sampling/sampling_logp_difference/max": 13.857806205749512, "sampling/sampling_logp_difference/mean": 1.0546367168426514, "step": 565, "step_time": 10.687787858001684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 4.885945171117783, "epoch": 0.00566, "grad_norm": 0.014241794124245644, "kl": 0.5648528188467026, "learning_rate": 9.999870000323247e-06, "loss": -0.0067, "step": 566, "step_time": 6.118480612993153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1477.0, "completions/max_terminated_length": 1477.0, "completions/mean_length": 356.875, "completions/mean_terminated_length": 356.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.288736969232559, "epoch": 0.00567, "frac_reward_zero_std": 0.0, "grad_norm": 0.02236809767782688, "kl": 0.49311911314725876, "learning_rate": 9.99986950929848e-06, "loss": -0.0017, "num_tokens": 12902445.0, "reward": 0.29246699810028076, "reward_std": 1.010037899017334, "rewards/rollout_reward_func/mean": 0.29246699810028076, "rewards/rollout_reward_func/std": 1.4300600290298462, "sampling/importance_sampling_ratio/max": 0.5928930044174194, "sampling/importance_sampling_ratio/mean": 0.23950524628162384, "sampling/importance_sampling_ratio/min": 4.719733794900094e-07, "sampling/sampling_logp_difference/max": 3.920912265777588, "sampling/sampling_logp_difference/mean": 1.0049769878387451, "step": 567, "step_time": 10.47710529800679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.298535108566284, "epoch": 0.00568, "grad_norm": 0.022923531010746956, "kl": 0.4952336736023426, "learning_rate": 9.999869017348145e-06, "loss": -0.0019, "step": 568, "step_time": 5.819600795002771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 188.53125, "completions/mean_terminated_length": 188.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.956774532794952, "epoch": 0.00569, "frac_reward_zero_std": 0.0, "grad_norm": 0.019640212878584862, "kl": 0.7102163136005402, "learning_rate": 9.999868524472245e-06, "loss": -0.0189, "num_tokens": 12942546.0, "reward": 0.8206350803375244, "reward_std": 1.288457989692688, "rewards/rollout_reward_func/mean": 0.8206350803375244, "rewards/rollout_reward_func/std": 1.5799206495285034, "sampling/importance_sampling_ratio/max": 0.5596034526824951, "sampling/importance_sampling_ratio/mean": 0.29263854026794434, "sampling/importance_sampling_ratio/min": 5.978670742479153e-06, "sampling/sampling_logp_difference/max": 4.469498634338379, "sampling/sampling_logp_difference/mean": 0.9742938280105591, "step": 569, "step_time": 8.345315804996062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.953800857067108, "epoch": 0.0057, "grad_norm": 0.021095450967550278, "kl": 0.7125837504863739, "learning_rate": 9.999868030670776e-06, "loss": -0.019, "step": 570, "step_time": 5.028537943006086 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 132.75, "completions/mean_terminated_length": 132.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.548663675785065, "epoch": 0.00571, "frac_reward_zero_std": 0.25, "grad_norm": 0.1118892952799797, "kl": 0.528341518715024, "learning_rate": 9.99986753594374e-06, "loss": -0.0108, "num_tokens": 12980667.0, "reward": 0.6048815846443176, "reward_std": 0.978857159614563, "rewards/rollout_reward_func/mean": 0.6048815846443176, "rewards/rollout_reward_func/std": 1.5774074792861938, "sampling/importance_sampling_ratio/max": 0.5592455863952637, "sampling/importance_sampling_ratio/mean": 0.2514501214027405, "sampling/importance_sampling_ratio/min": 0.0001535549235995859, "sampling/sampling_logp_difference/max": 3.24153733253479, "sampling/sampling_logp_difference/mean": 1.0539655685424805, "step": 571, "step_time": 8.121390657997836 }, { "clip_ratio/high_max": 0.11875000037252903, "clip_ratio/high_mean": 0.059375000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.059375000186264515, "entropy": 5.342832535505295, "epoch": 0.00572, "grad_norm": 0.04898553341627121, "kl": 0.5540528688579798, "learning_rate": 9.99986704029114e-06, "loss": -0.0111, "step": 572, "step_time": 4.541812947998551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1054.0, "completions/max_terminated_length": 1054.0, "completions/mean_length": 247.1875, "completions/mean_terminated_length": 254.64515686035156, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.0958152413368225, "epoch": 0.00573, "frac_reward_zero_std": 0.25, "grad_norm": 0.03170221298933029, "kl": 0.5606250762939453, "learning_rate": 9.99986654371297e-06, "loss": -0.0068, "num_tokens": 13022829.0, "reward": 1.347717523574829, "reward_std": 1.0698028802871704, "rewards/rollout_reward_func/mean": 1.347717523574829, "rewards/rollout_reward_func/std": 1.5989601612091064, "sampling/importance_sampling_ratio/max": 0.5975310802459717, "sampling/importance_sampling_ratio/mean": 0.2752026915550232, "sampling/importance_sampling_ratio/min": 7.841590332798021e-13, "sampling/sampling_logp_difference/max": 10.898290634155273, "sampling/sampling_logp_difference/mean": 1.0027050971984863, "step": 573, "step_time": 9.419902159999765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.0801491141319275, "epoch": 0.00574, "grad_norm": 0.027452990412712097, "kl": 0.5708451792597771, "learning_rate": 9.999866046209236e-06, "loss": -0.0069, "step": 574, "step_time": 4.984512184997584 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.01846590917557478, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022930195089429617, "completions/clipped_ratio": 0.03125, "completions/max_length": 1410.0, "completions/max_terminated_length": 1410.0, "completions/mean_length": 463.125, "completions/mean_terminated_length": 477.5483703613281, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.802230656147003, "epoch": 0.00575, "frac_reward_zero_std": 0.0, "grad_norm": 0.03601468726992607, "kl": 0.30121047236025333, "learning_rate": 9.999865547779934e-06, "loss": -0.01, "num_tokens": 13074237.0, "reward": 0.8713908195495605, "reward_std": 1.4858561754226685, "rewards/rollout_reward_func/mean": 0.8713908195495605, "rewards/rollout_reward_func/std": 1.6062870025634766, "sampling/importance_sampling_ratio/max": 0.3782269060611725, "sampling/importance_sampling_ratio/mean": 0.060342323035001755, "sampling/importance_sampling_ratio/min": 9.541792675854279e-20, "sampling/sampling_logp_difference/max": 14.031211853027344, "sampling/sampling_logp_difference/mean": 1.5091753005981445, "step": 575, "step_time": 10.72294703699663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.024715909268707037, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.024715909268707037, "entropy": 6.778226971626282, "epoch": 0.00576, "grad_norm": 0.03895532712340355, "kl": 0.30758706107735634, "learning_rate": 9.999865048425068e-06, "loss": -0.0101, "step": 576, "step_time": 5.675570493003761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.03125, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 195.1875, "completions/mean_terminated_length": 200.96774291992188, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.236864864826202, "epoch": 0.00577, "frac_reward_zero_std": 0.25, "grad_norm": 0.0753299817442894, "kl": 0.61163330078125, "learning_rate": 9.999864548144636e-06, "loss": -0.0132, "num_tokens": 13115709.0, "reward": 1.043805480003357, "reward_std": 1.0766196250915527, "rewards/rollout_reward_func/mean": 1.043805480003357, "rewards/rollout_reward_func/std": 1.5800790786743164, "sampling/importance_sampling_ratio/max": 0.5938865542411804, "sampling/importance_sampling_ratio/mean": 0.2535600960254669, "sampling/importance_sampling_ratio/min": 4.325699683432188e-13, "sampling/sampling_logp_difference/max": 11.191061019897461, "sampling/sampling_logp_difference/mean": 1.2046234607696533, "step": 577, "step_time": 8.367872155005898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.2368587255477905, "epoch": 0.00578, "grad_norm": 0.07011278718709946, "kl": 0.6095119006931782, "learning_rate": 9.999864046938636e-06, "loss": -0.0134, "step": 578, "step_time": 4.8659520680012065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1575.0, "completions/max_terminated_length": 1575.0, "completions/mean_length": 288.5625, "completions/mean_terminated_length": 288.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.812282204627991, "epoch": 0.00579, "frac_reward_zero_std": 0.0, "grad_norm": 0.034015800803899765, "kl": 0.5131087526679039, "learning_rate": 9.999863544807073e-06, "loss": -0.0162, "num_tokens": 13158988.0, "reward": 1.375891923904419, "reward_std": 1.5345147848129272, "rewards/rollout_reward_func/mean": 1.375891923904419, "rewards/rollout_reward_func/std": 1.5676851272583008, "sampling/importance_sampling_ratio/max": 0.554000973701477, "sampling/importance_sampling_ratio/mean": 0.18683792650699615, "sampling/importance_sampling_ratio/min": 8.981391186324572e-10, "sampling/sampling_logp_difference/max": 12.805536270141602, "sampling/sampling_logp_difference/mean": 1.2740488052368164, "step": 579, "step_time": 10.143213518997072 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 5.814009666442871, "epoch": 0.0058, "grad_norm": 0.008293891325592995, "kl": 0.510577667504549, "learning_rate": 9.999863041749942e-06, "loss": -0.0163, "step": 580, "step_time": 5.935724196002411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 184.71875, "completions/mean_terminated_length": 184.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.7369831204414368, "epoch": 0.00581, "frac_reward_zero_std": 0.5, "grad_norm": 0.01712074689567089, "kl": 0.8277628272771835, "learning_rate": 9.999862537767247e-06, "loss": -0.0064, "num_tokens": 13198075.0, "reward": 0.6201390027999878, "reward_std": 0.6480836272239685, "rewards/rollout_reward_func/mean": 0.6201390027999878, "rewards/rollout_reward_func/std": 1.62631094455719, "sampling/importance_sampling_ratio/max": 0.5544807314872742, "sampling/importance_sampling_ratio/mean": 0.3840450048446655, "sampling/importance_sampling_ratio/min": 5.459465067388435e-20, "sampling/sampling_logp_difference/max": 2.8372721672058105, "sampling/sampling_logp_difference/mean": 0.7134142518043518, "step": 581, "step_time": 9.01977426700978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.739434778690338, "epoch": 0.00582, "grad_norm": 0.019188016653060913, "kl": 0.8284248784184456, "learning_rate": 9.999862032858985e-06, "loss": -0.0064, "step": 582, "step_time": 4.744708059002733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 1100.0, "completions/max_terminated_length": 1100.0, "completions/mean_length": 218.6875, "completions/mean_terminated_length": 218.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.455252915620804, "epoch": 0.00583, "frac_reward_zero_std": 0.25, "grad_norm": 0.08398087322711945, "kl": 0.6962315440177917, "learning_rate": 9.999861527025157e-06, "loss": -0.0122, "num_tokens": 13238584.0, "reward": 0.8963949680328369, "reward_std": 0.8369415998458862, "rewards/rollout_reward_func/mean": 0.8963949680328369, "rewards/rollout_reward_func/std": 1.291332721710205, "sampling/importance_sampling_ratio/max": 0.5612624287605286, "sampling/importance_sampling_ratio/mean": 0.3388479948043823, "sampling/importance_sampling_ratio/min": 2.6638945200829767e-05, "sampling/sampling_logp_difference/max": 2.4220962524414062, "sampling/sampling_logp_difference/mean": 0.7694323062896729, "step": 583, "step_time": 8.876391252997564 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 4.494312196969986, "epoch": 0.00584, "grad_norm": 0.022178243845701218, "kl": 0.7031536288559437, "learning_rate": 9.999861020265767e-06, "loss": -0.0122, "step": 584, "step_time": 5.551423785000225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.03125, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 207.25, "completions/mean_terminated_length": 202.32257080078125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.159406065940857, "epoch": 0.00585, "frac_reward_zero_std": 0.0, "grad_norm": 0.1569126844406128, "kl": 0.5737629570066929, "learning_rate": 9.999860512580808e-06, "loss": -0.0097, "num_tokens": 13280087.0, "reward": 0.33887213468551636, "reward_std": 1.5021281242370605, "rewards/rollout_reward_func/mean": 0.33887213468551636, "rewards/rollout_reward_func/std": 1.5179189443588257, "sampling/importance_sampling_ratio/max": 0.555300772190094, "sampling/importance_sampling_ratio/mean": 0.21311062574386597, "sampling/importance_sampling_ratio/min": 3.076303226930431e-17, "sampling/sampling_logp_difference/max": 13.23223876953125, "sampling/sampling_logp_difference/mean": 1.402632236480713, "step": 585, "step_time": 7.0548593229978 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.015625, "clip_ratio/region_mean": 0.11979166697710752, "entropy": 6.407575607299805, "epoch": 0.00586, "grad_norm": 0.05074722319841385, "kl": 0.5208184737712145, "learning_rate": 9.999860003970287e-06, "loss": -0.0102, "step": 586, "step_time": 3.834124562003126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1126.0, "completions/max_terminated_length": 1126.0, "completions/mean_length": 669.40625, "completions/mean_terminated_length": 669.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.5255197286605835, "epoch": 0.00587, "frac_reward_zero_std": 0.0, "grad_norm": 0.02637079730629921, "kl": 0.37131614703685045, "learning_rate": 9.9998594944342e-06, "loss": -0.0097, "num_tokens": 13338350.0, "reward": 1.138363003730774, "reward_std": 1.558935523033142, "rewards/rollout_reward_func/mean": 1.138363003730774, "rewards/rollout_reward_func/std": 1.5395950078964233, "sampling/importance_sampling_ratio/max": 0.3110322654247284, "sampling/importance_sampling_ratio/mean": 0.06995931267738342, "sampling/importance_sampling_ratio/min": 1.5333401393036183e-07, "sampling/sampling_logp_difference/max": 2.7925405502319336, "sampling/sampling_logp_difference/mean": 1.2440032958984375, "step": 587, "step_time": 10.255694775998563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.545074045658112, "epoch": 0.00588, "grad_norm": 0.027474088594317436, "kl": 0.3744245981797576, "learning_rate": 9.99985898397255e-06, "loss": -0.0097, "step": 588, "step_time": 5.269914603999496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1347.0, "completions/max_terminated_length": 1347.0, "completions/mean_length": 418.09375, "completions/mean_terminated_length": 418.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.47662827372551, "epoch": 0.00589, "frac_reward_zero_std": 0.25, "grad_norm": 0.02859111689031124, "kl": 0.5126664973795414, "learning_rate": 9.999858472585334e-06, "loss": -0.0011, "num_tokens": 13386167.0, "reward": 1.532842993736267, "reward_std": 1.0369417667388916, "rewards/rollout_reward_func/mean": 1.532842993736267, "rewards/rollout_reward_func/std": 1.3975237607955933, "sampling/importance_sampling_ratio/max": 0.5566324591636658, "sampling/importance_sampling_ratio/mean": 0.22153082489967346, "sampling/importance_sampling_ratio/min": 5.698910787721445e-10, "sampling/sampling_logp_difference/max": 12.151427268981934, "sampling/sampling_logp_difference/mean": 1.2001961469650269, "step": 589, "step_time": 9.66347349499847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.486259996891022, "epoch": 0.0059, "grad_norm": 0.02606315352022648, "kl": 0.5146608725190163, "learning_rate": 9.999857960272553e-06, "loss": -0.0012, "step": 590, "step_time": 5.886273621992586 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.03125, "completions/max_length": 1414.0, "completions/max_terminated_length": 1414.0, "completions/mean_length": 570.34375, "completions/mean_terminated_length": 576.1935424804688, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "entropy": 5.467749178409576, "epoch": 0.00591, "frac_reward_zero_std": 0.0, "grad_norm": 0.051621921360492706, "kl": 0.39182290248572826, "learning_rate": 9.99985744703421e-06, "loss": -0.0129, "num_tokens": 13442190.0, "reward": 1.3542439937591553, "reward_std": 1.4609732627868652, "rewards/rollout_reward_func/mean": 1.3542439937591553, "rewards/rollout_reward_func/std": 1.5042190551757812, "sampling/importance_sampling_ratio/max": 0.4129268229007721, "sampling/importance_sampling_ratio/mean": 0.1215938925743103, "sampling/importance_sampling_ratio/min": 4.0141644348790084e-12, "sampling/sampling_logp_difference/max": 4.711059093475342, "sampling/sampling_logp_difference/mean": 1.0525888204574585, "step": 591, "step_time": 10.830407207991811 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 5.4828662276268005, "epoch": 0.00592, "grad_norm": 0.050931356847286224, "kl": 0.3855936750769615, "learning_rate": 9.9998569328703e-06, "loss": -0.013, "step": 592, "step_time": 5.8881904750051035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1690.0, "completions/max_terminated_length": 1690.0, "completions/mean_length": 336.4375, "completions/mean_terminated_length": 336.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.9214684665203094, "epoch": 0.00593, "frac_reward_zero_std": 0.25, "grad_norm": 0.018013184890151024, "kl": 0.42775388434529305, "learning_rate": 9.99985641778083e-06, "loss": -0.0128, "num_tokens": 13487478.0, "reward": 1.3469395637512207, "reward_std": 1.226152777671814, "rewards/rollout_reward_func/mean": 1.3469395637512207, "rewards/rollout_reward_func/std": 1.4258391857147217, "sampling/importance_sampling_ratio/max": 0.5546507239341736, "sampling/importance_sampling_ratio/mean": 0.20452147722244263, "sampling/importance_sampling_ratio/min": 3.482858801362454e-07, "sampling/sampling_logp_difference/max": 3.7669711112976074, "sampling/sampling_logp_difference/mean": 1.1793248653411865, "step": 593, "step_time": 11.003979128992796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.917969316244125, "epoch": 0.00594, "grad_norm": 0.01778622530400753, "kl": 0.4278740268200636, "learning_rate": 9.999855901765791e-06, "loss": -0.0128, "step": 594, "step_time": 6.324269474000175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 944.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 300.71875, "completions/mean_terminated_length": 300.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.2474517822265625, "epoch": 0.00595, "frac_reward_zero_std": 0.25, "grad_norm": 0.062378499656915665, "kl": 0.5550921820104122, "learning_rate": 9.99985538482519e-06, "loss": -0.0102, "num_tokens": 13532315.0, "reward": 0.5835610032081604, "reward_std": 1.168180227279663, "rewards/rollout_reward_func/mean": 0.5835610032081604, "rewards/rollout_reward_func/std": 1.6029391288757324, "sampling/importance_sampling_ratio/max": 0.559323787689209, "sampling/importance_sampling_ratio/mean": 0.23411278426647186, "sampling/importance_sampling_ratio/min": 7.703984477268333e-13, "sampling/sampling_logp_difference/max": 10.415117263793945, "sampling/sampling_logp_difference/mean": 1.2553951740264893, "step": 595, "step_time": 9.089089707991661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 5.243448555469513, "epoch": 0.00596, "grad_norm": 0.019580984488129616, "kl": 0.5554739702492952, "learning_rate": 9.999854866959026e-06, "loss": -0.0104, "step": 596, "step_time": 5.225256321991765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 337.21875, "completions/mean_terminated_length": 337.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.10041481256485, "epoch": 0.00597, "frac_reward_zero_std": 0.0, "grad_norm": 0.025051718577742577, "kl": 0.5412896759808064, "learning_rate": 9.999854348167299e-06, "loss": -0.0135, "num_tokens": 13577108.0, "reward": -0.4786812961101532, "reward_std": 1.2640187740325928, "rewards/rollout_reward_func/mean": -0.4786812961101532, "rewards/rollout_reward_func/std": 1.5679785013198853, "sampling/importance_sampling_ratio/max": 0.5451707243919373, "sampling/importance_sampling_ratio/mean": 0.1291256695985794, "sampling/importance_sampling_ratio/min": 4.362298293605064e-16, "sampling/sampling_logp_difference/max": 11.030445098876953, "sampling/sampling_logp_difference/mean": 1.5609185695648193, "step": 597, "step_time": 9.096895082995616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.092858016490936, "epoch": 0.00598, "grad_norm": 0.03177911043167114, "kl": 0.5742238648235798, "learning_rate": 9.999853828450009e-06, "loss": -0.0134, "step": 598, "step_time": 5.301374828999542 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 1572.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 444.625, "completions/mean_terminated_length": 444.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.539399713277817, "epoch": 0.00599, "frac_reward_zero_std": 0.0, "grad_norm": 0.09357021003961563, "kl": 0.35430664382874966, "learning_rate": 9.999853307807155e-06, "loss": -0.0173, "num_tokens": 13626864.0, "reward": 0.808832049369812, "reward_std": 1.7241395711898804, "rewards/rollout_reward_func/mean": 0.808832049369812, "rewards/rollout_reward_func/std": 1.7365632057189941, "sampling/importance_sampling_ratio/max": 0.5546876192092896, "sampling/importance_sampling_ratio/mean": 0.1303933709859848, "sampling/importance_sampling_ratio/min": 2.0649254395266325e-07, "sampling/sampling_logp_difference/max": 4.254899978637695, "sampling/sampling_logp_difference/mean": 1.3367316722869873, "step": 599, "step_time": 10.816723358002491 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625000465661287, "entropy": 6.4854536056518555, "epoch": 0.006, "grad_norm": 0.049926336854696274, "kl": 0.35788850113749504, "learning_rate": 9.999852786238737e-06, "loss": -0.0177, "step": 600, "step_time": 6.027996774006169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 938.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 445.375, "completions/mean_terminated_length": 445.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.873747110366821, "epoch": 0.00601, "frac_reward_zero_std": 0.0, "grad_norm": 0.0378187857568264, "kl": 0.5426282864063978, "learning_rate": 9.999852263744758e-06, "loss": -0.0134, "num_tokens": 13678182.0, "reward": 1.5689170360565186, "reward_std": 1.6057040691375732, "rewards/rollout_reward_func/mean": 1.5689170360565186, "rewards/rollout_reward_func/std": 1.7543635368347168, "sampling/importance_sampling_ratio/max": 0.5349972248077393, "sampling/importance_sampling_ratio/mean": 0.18701264262199402, "sampling/importance_sampling_ratio/min": 4.323412213125266e-05, "sampling/sampling_logp_difference/max": 2.8541011810302734, "sampling/sampling_logp_difference/mean": 0.8906332850456238, "step": 601, "step_time": 9.203019616001257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.858705759048462, "epoch": 0.00602, "grad_norm": 0.04095827043056488, "kl": 0.5402872711420059, "learning_rate": 9.999851740325214e-06, "loss": -0.0135, "step": 602, "step_time": 4.792178142008197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1219.0, "completions/max_terminated_length": 1219.0, "completions/mean_length": 483.1875, "completions/mean_terminated_length": 487.8709411621094, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.009613335132599, "epoch": 0.00603, "frac_reward_zero_std": 0.0, "grad_norm": 0.04075102508068085, "kl": 0.2984096445143223, "learning_rate": 9.999851215980108e-06, "loss": -0.0191, "num_tokens": 13730594.0, "reward": 0.5325158834457397, "reward_std": 1.4862759113311768, "rewards/rollout_reward_func/mean": 0.5325158834457397, "rewards/rollout_reward_func/std": 1.6480350494384766, "sampling/importance_sampling_ratio/max": 0.35628604888916016, "sampling/importance_sampling_ratio/mean": 0.0751158595085144, "sampling/importance_sampling_ratio/min": 5.33163100840497e-21, "sampling/sampling_logp_difference/max": 10.007474899291992, "sampling/sampling_logp_difference/mean": 1.5041956901550293, "step": 603, "step_time": 10.079769056996156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.0089031457901, "epoch": 0.00604, "grad_norm": 0.041777003556489944, "kl": 0.3020572755485773, "learning_rate": 9.99985069070944e-06, "loss": -0.0193, "step": 604, "step_time": 5.869014607997087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1348.0, "completions/max_terminated_length": 1348.0, "completions/mean_length": 564.71875, "completions/mean_terminated_length": 564.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.2693023681640625, "epoch": 0.00605, "frac_reward_zero_std": 0.0, "grad_norm": 0.04548928886651993, "kl": 0.23449756484478712, "learning_rate": 9.999850164513208e-06, "loss": -0.0108, "num_tokens": 13785817.0, "reward": 1.231553316116333, "reward_std": 1.8567674160003662, "rewards/rollout_reward_func/mean": 1.231553316116333, "rewards/rollout_reward_func/std": 1.9017879962921143, "sampling/importance_sampling_ratio/max": 0.3111889660358429, "sampling/importance_sampling_ratio/mean": 0.08638918399810791, "sampling/importance_sampling_ratio/min": 6.281336784041969e-09, "sampling/sampling_logp_difference/max": 5.0648064613342285, "sampling/sampling_logp_difference/mean": 1.1877754926681519, "step": 605, "step_time": 10.575849234999623 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 6.265333473682404, "epoch": 0.00606, "grad_norm": 0.04931462183594704, "kl": 0.23583719041198492, "learning_rate": 9.999849637391415e-06, "loss": -0.0109, "step": 606, "step_time": 5.739113082003314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1063.0, "completions/max_terminated_length": 1063.0, "completions/mean_length": 232.65625, "completions/mean_terminated_length": 232.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.295816987752914, "epoch": 0.00607, "frac_reward_zero_std": 0.25, "grad_norm": 0.08700916171073914, "kl": 0.5613377168774605, "learning_rate": 9.99984910934406e-06, "loss": -0.0117, "num_tokens": 13827759.0, "reward": 1.138547420501709, "reward_std": 1.5033711194992065, "rewards/rollout_reward_func/mean": 1.138547420501709, "rewards/rollout_reward_func/std": 1.7917388677597046, "sampling/importance_sampling_ratio/max": 0.5593668818473816, "sampling/importance_sampling_ratio/mean": 0.21776698529720306, "sampling/importance_sampling_ratio/min": 6.658953943805557e-10, "sampling/sampling_logp_difference/max": 3.5160255432128906, "sampling/sampling_logp_difference/mean": 1.0096335411071777, "step": 607, "step_time": 9.189177902007941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 5.2896987199783325, "epoch": 0.00608, "grad_norm": 0.04871422052383423, "kl": 0.5696534775197506, "learning_rate": 9.999848580371143e-06, "loss": -0.0119, "step": 608, "step_time": 4.845867029998772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1072.0, "completions/max_terminated_length": 1072.0, "completions/mean_length": 345.40625, "completions/mean_terminated_length": 345.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.880847096443176, "epoch": 0.00609, "frac_reward_zero_std": 0.0, "grad_norm": 0.04961245134472847, "kl": 0.42266688868403435, "learning_rate": 9.999848050472662e-06, "loss": -0.0214, "num_tokens": 13873124.0, "reward": 0.5101680755615234, "reward_std": 1.5141104459762573, "rewards/rollout_reward_func/mean": 0.5101680755615234, "rewards/rollout_reward_func/std": 1.5656496286392212, "sampling/importance_sampling_ratio/max": 0.5540165901184082, "sampling/importance_sampling_ratio/mean": 0.17103977501392365, "sampling/importance_sampling_ratio/min": 6.814871511551246e-08, "sampling/sampling_logp_difference/max": 3.0935440063476562, "sampling/sampling_logp_difference/mean": 1.1470441818237305, "step": 609, "step_time": 9.377924943997641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 5.883795201778412, "epoch": 0.0061, "grad_norm": 0.04057103395462036, "kl": 0.4193890765309334, "learning_rate": 9.99984751964862e-06, "loss": -0.0215, "step": 610, "step_time": 5.429843553003593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1481.0, "completions/max_terminated_length": 1481.0, "completions/mean_length": 459.0625, "completions/mean_terminated_length": 459.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.8707298040390015, "epoch": 0.00611, "frac_reward_zero_std": 0.0, "grad_norm": 0.039746299386024475, "kl": 0.4671197757124901, "learning_rate": 9.999846987899019e-06, "loss": -0.0098, "num_tokens": 13921268.0, "reward": 0.8819388747215271, "reward_std": 1.1190464496612549, "rewards/rollout_reward_func/mean": 0.8819388747215271, "rewards/rollout_reward_func/std": 1.370529294013977, "sampling/importance_sampling_ratio/max": 0.5573278069496155, "sampling/importance_sampling_ratio/mean": 0.15450820326805115, "sampling/importance_sampling_ratio/min": 1.8555753911186912e-07, "sampling/sampling_logp_difference/max": 2.9297916889190674, "sampling/sampling_logp_difference/mean": 1.3406974077224731, "step": 611, "step_time": 10.71846527100206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 6.868020713329315, "epoch": 0.00612, "grad_norm": 0.02228696271777153, "kl": 0.4436655528843403, "learning_rate": 9.999846455223852e-06, "loss": -0.01, "step": 612, "step_time": 5.899556278993259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1524.0, "completions/max_terminated_length": 1524.0, "completions/mean_length": 265.25, "completions/mean_terminated_length": 265.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.302441835403442, "epoch": 0.00613, "frac_reward_zero_std": 0.5, "grad_norm": 0.011357209645211697, "kl": 0.6531644612550735, "learning_rate": 9.999845921623126e-06, "loss": -0.011, "num_tokens": 13962033.0, "reward": 1.3963587284088135, "reward_std": 0.9155339002609253, "rewards/rollout_reward_func/mean": 1.3963587284088135, "rewards/rollout_reward_func/std": 1.382657766342163, "sampling/importance_sampling_ratio/max": 0.5587586760520935, "sampling/importance_sampling_ratio/mean": 0.35911616683006287, "sampling/importance_sampling_ratio/min": 7.293590442714049e-07, "sampling/sampling_logp_difference/max": 3.021470546722412, "sampling/sampling_logp_difference/mean": 0.7728229761123657, "step": 613, "step_time": 10.656403456992848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.302978873252869, "epoch": 0.00614, "grad_norm": 0.00891436729580164, "kl": 0.6516547612845898, "learning_rate": 9.999845387096839e-06, "loss": -0.0111, "step": 614, "step_time": 5.810576997002499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 232.59375, "completions/mean_terminated_length": 239.5806427001953, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.284472823143005, "epoch": 0.00615, "frac_reward_zero_std": 0.0, "grad_norm": 0.17448049783706665, "kl": 0.7154290098696947, "learning_rate": 9.99984485164499e-06, "loss": -0.0118, "num_tokens": 14003181.0, "reward": 0.7263950109481812, "reward_std": 1.5976920127868652, "rewards/rollout_reward_func/mean": 0.7263950109481812, "rewards/rollout_reward_func/std": 1.7552223205566406, "sampling/importance_sampling_ratio/max": 0.7815223336219788, "sampling/importance_sampling_ratio/mean": 0.26300641894340515, "sampling/importance_sampling_ratio/min": 5.114990140562279e-15, "sampling/sampling_logp_difference/max": 4.067675590515137, "sampling/sampling_logp_difference/mean": 1.1560828685760498, "step": 615, "step_time": 8.994629325992719 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 5.27630889415741, "epoch": 0.00616, "grad_norm": 0.04174336791038513, "kl": 0.6082962285727262, "learning_rate": 9.99984431526758e-06, "loss": -0.0121, "step": 616, "step_time": 4.7687295980031195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1874.0, "completions/max_terminated_length": 1874.0, "completions/mean_length": 456.03125, "completions/mean_terminated_length": 456.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.469132721424103, "epoch": 0.00617, "frac_reward_zero_std": 0.25, "grad_norm": 0.03241639584302902, "kl": 0.49365248158574104, "learning_rate": 9.99984377796461e-06, "loss": -0.0082, "num_tokens": 14051710.0, "reward": 0.9011662006378174, "reward_std": 0.8850587010383606, "rewards/rollout_reward_func/mean": 0.9011662006378174, "rewards/rollout_reward_func/std": 1.4870102405548096, "sampling/importance_sampling_ratio/max": 0.5616759061813354, "sampling/importance_sampling_ratio/mean": 0.20113565027713776, "sampling/importance_sampling_ratio/min": 9.213165170010584e-13, "sampling/sampling_logp_difference/max": 3.562073230743408, "sampling/sampling_logp_difference/mean": 1.045985460281372, "step": 617, "step_time": 12.276974385993526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.472686350345612, "epoch": 0.00618, "grad_norm": 0.02825688198208809, "kl": 0.48402271792292595, "learning_rate": 9.999843239736079e-06, "loss": -0.0083, "step": 618, "step_time": 7.241024350998487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1171.0, "completions/max_terminated_length": 1171.0, "completions/mean_length": 419.03125, "completions/mean_terminated_length": 419.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.7358949184417725, "epoch": 0.00619, "frac_reward_zero_std": 0.0, "grad_norm": 0.03812962397933006, "kl": 0.4377317726612091, "learning_rate": 9.999842700581986e-06, "loss": -0.0138, "num_tokens": 14100655.0, "reward": 0.5292288661003113, "reward_std": 1.610231876373291, "rewards/rollout_reward_func/mean": 0.5292288661003113, "rewards/rollout_reward_func/std": 1.7989037036895752, "sampling/importance_sampling_ratio/max": 0.5471364855766296, "sampling/importance_sampling_ratio/mean": 0.15009362995624542, "sampling/importance_sampling_ratio/min": 1.6687625688175743e-20, "sampling/sampling_logp_difference/max": 13.049628257751465, "sampling/sampling_logp_difference/mean": 1.3563807010650635, "step": 619, "step_time": 9.998567099002685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 5.7284190356731415, "epoch": 0.0062, "grad_norm": 0.028562886640429497, "kl": 0.44284695759415627, "learning_rate": 9.999842160502334e-06, "loss": -0.0139, "step": 620, "step_time": 5.335164686002827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 896.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 290.28125, "completions/mean_terminated_length": 299.3000183105469, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.6668660044670105, "epoch": 0.00621, "frac_reward_zero_std": 0.25, "grad_norm": 0.03452133759856224, "kl": 0.3774310052394867, "learning_rate": 9.99984161949712e-06, "loss": -0.0094, "num_tokens": 14144541.0, "reward": 0.7129031419754028, "reward_std": 1.1629302501678467, "rewards/rollout_reward_func/mean": 0.7129031419754028, "rewards/rollout_reward_func/std": 1.430617094039917, "sampling/importance_sampling_ratio/max": 0.5560125112533569, "sampling/importance_sampling_ratio/mean": 0.19644953310489655, "sampling/importance_sampling_ratio/min": 8.73132349841349e-13, "sampling/sampling_logp_difference/max": 3.687227487564087, "sampling/sampling_logp_difference/mean": 1.0600569248199463, "step": 621, "step_time": 9.39708702900316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 5.684277236461639, "epoch": 0.00622, "grad_norm": 0.014087939634919167, "kl": 0.3735264530405402, "learning_rate": 9.999841077566347e-06, "loss": -0.0095, "step": 622, "step_time": 4.6858012869997765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 786.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 153.875, "completions/mean_terminated_length": 153.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.324724435806274, "epoch": 0.00623, "frac_reward_zero_std": 0.5, "grad_norm": 0.0074915518052875996, "kl": 0.5344025883823633, "learning_rate": 9.999840534710012e-06, "loss": -0.0045, "num_tokens": 14183234.0, "reward": 1.1117148399353027, "reward_std": 0.6655594110488892, "rewards/rollout_reward_func/mean": 1.1117148399353027, "rewards/rollout_reward_func/std": 1.2621585130691528, "sampling/importance_sampling_ratio/max": 0.5605778098106384, "sampling/importance_sampling_ratio/mean": 0.35527053475379944, "sampling/importance_sampling_ratio/min": 2.23571118215804e-12, "sampling/sampling_logp_difference/max": 4.409959316253662, "sampling/sampling_logp_difference/mean": 0.831778347492218, "step": 623, "step_time": 7.550123231008911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.315487593412399, "epoch": 0.00624, "grad_norm": 0.007705311290919781, "kl": 0.5356729179620743, "learning_rate": 9.99983999092812e-06, "loss": -0.0045, "step": 624, "step_time": 4.674625972995273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1550.0, "completions/max_terminated_length": 1550.0, "completions/mean_length": 588.9375, "completions/mean_terminated_length": 588.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.504112660884857, "epoch": 0.00625, "frac_reward_zero_std": 0.0, "grad_norm": 0.07272101193666458, "kl": 0.42937809973955154, "learning_rate": 9.999839446220667e-06, "loss": -0.0076, "num_tokens": 14240336.0, "reward": 1.3675730228424072, "reward_std": 1.523374319076538, "rewards/rollout_reward_func/mean": 1.3675730228424072, "rewards/rollout_reward_func/std": 1.493693470954895, "sampling/importance_sampling_ratio/max": 0.5547716617584229, "sampling/importance_sampling_ratio/mean": 0.16179567575454712, "sampling/importance_sampling_ratio/min": 1.701896512606993e-09, "sampling/sampling_logp_difference/max": 12.5396146774292, "sampling/sampling_logp_difference/mean": 1.0321624279022217, "step": 625, "step_time": 11.005210342998907 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 5.498110443353653, "epoch": 0.00626, "grad_norm": 0.029860185459256172, "kl": 0.41217876970767975, "learning_rate": 9.999838900587653e-06, "loss": -0.0077, "step": 626, "step_time": 6.137394220997521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1641.0, "completions/max_terminated_length": 1641.0, "completions/mean_length": 308.21875, "completions/mean_terminated_length": 308.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.727962255477905, "epoch": 0.00627, "frac_reward_zero_std": 0.5, "grad_norm": 0.03653612732887268, "kl": 0.4652413548901677, "learning_rate": 9.999838354029082e-06, "loss": 0.0001, "num_tokens": 14282720.0, "reward": 1.1653308868408203, "reward_std": 0.5036925077438354, "rewards/rollout_reward_func/mean": 1.1653308868408203, "rewards/rollout_reward_func/std": 1.057850956916809, "sampling/importance_sampling_ratio/max": 0.5577746629714966, "sampling/importance_sampling_ratio/mean": 0.3010942339897156, "sampling/importance_sampling_ratio/min": 9.727624274212366e-25, "sampling/sampling_logp_difference/max": 14.341209411621094, "sampling/sampling_logp_difference/mean": 1.465329885482788, "step": 627, "step_time": 10.823259349999717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.725306659936905, "epoch": 0.00628, "grad_norm": 0.037400078028440475, "kl": 0.46707695350050926, "learning_rate": 9.99983780654495e-06, "loss": 0.0002, "step": 628, "step_time": 5.95225175399537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 116.4375, "completions/mean_terminated_length": 116.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.516537964344025, "epoch": 0.00629, "frac_reward_zero_std": 0.25, "grad_norm": 0.06235259398818016, "kl": 0.5620662830770016, "learning_rate": 9.999837258135259e-06, "loss": -0.0153, "num_tokens": 14318620.0, "reward": 0.6591768860816956, "reward_std": 0.6991163492202759, "rewards/rollout_reward_func/mean": 0.6591768860816956, "rewards/rollout_reward_func/std": 1.4463324546813965, "sampling/importance_sampling_ratio/max": 0.5620694756507874, "sampling/importance_sampling_ratio/mean": 0.38223934173583984, "sampling/importance_sampling_ratio/min": 3.109276031088015e-11, "sampling/sampling_logp_difference/max": 3.7153992652893066, "sampling/sampling_logp_difference/mean": 0.8475480675697327, "step": 629, "step_time": 6.956850574999407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.5137088894844055, "epoch": 0.0063, "grad_norm": 0.06232432276010513, "kl": 0.5586998164653778, "learning_rate": 9.999836708800008e-06, "loss": -0.0153, "step": 630, "step_time": 4.234759978000511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1561.0, "completions/max_terminated_length": 1561.0, "completions/mean_length": 433.53125, "completions/mean_terminated_length": 431.9031982421875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.990876793861389, "epoch": 0.00631, "frac_reward_zero_std": 0.25, "grad_norm": 0.05400877073407173, "kl": 0.39673394709825516, "learning_rate": 9.999836158539198e-06, "loss": -0.0092, "num_tokens": 14368415.0, "reward": 0.9846069812774658, "reward_std": 1.074296474456787, "rewards/rollout_reward_func/mean": 0.9846069812774658, "rewards/rollout_reward_func/std": 1.5160835981369019, "sampling/importance_sampling_ratio/max": 0.5525577664375305, "sampling/importance_sampling_ratio/mean": 0.1897887885570526, "sampling/importance_sampling_ratio/min": 9.738803730466317e-13, "sampling/sampling_logp_difference/max": 3.9363837242126465, "sampling/sampling_logp_difference/mean": 1.1900813579559326, "step": 631, "step_time": 10.763583191994258 }, { "clip_ratio/high_max": 0.043750000186264515, "clip_ratio/high_mean": 0.021875000093132257, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021875000093132257, "entropy": 5.974256455898285, "epoch": 0.00632, "grad_norm": 0.027141984552145004, "kl": 0.38562004268169403, "learning_rate": 9.99983560735283e-06, "loss": -0.0093, "step": 632, "step_time": 6.026478913005121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 120.59375, "completions/mean_terminated_length": 120.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.4336742758750916, "epoch": 0.00633, "frac_reward_zero_std": 0.5, "grad_norm": 0.015533552505075932, "kl": 0.5913786850869656, "learning_rate": 9.999835055240903e-06, "loss": -0.0018, "num_tokens": 14405086.0, "reward": 1.2058321237564087, "reward_std": 0.7478220462799072, "rewards/rollout_reward_func/mean": 1.2058321237564087, "rewards/rollout_reward_func/std": 1.2895839214324951, "sampling/importance_sampling_ratio/max": 0.5705205202102661, "sampling/importance_sampling_ratio/mean": 0.35852986574172974, "sampling/importance_sampling_ratio/min": 0.00019265853916294873, "sampling/sampling_logp_difference/max": 3.156651496887207, "sampling/sampling_logp_difference/mean": 0.7987962961196899, "step": 633, "step_time": 7.338186778997624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.431467235088348, "epoch": 0.00634, "grad_norm": 0.017178520560264587, "kl": 0.5786591172218323, "learning_rate": 9.999834502203417e-06, "loss": -0.0018, "step": 634, "step_time": 3.7592005249971407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1033.0, "completions/max_terminated_length": 1033.0, "completions/mean_length": 361.28125, "completions/mean_terminated_length": 361.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.506398797035217, "epoch": 0.00635, "frac_reward_zero_std": 0.25, "grad_norm": 0.07346973568201065, "kl": 0.4346239473670721, "learning_rate": 9.999833948240373e-06, "loss": -0.0114, "num_tokens": 14452092.0, "reward": 1.0112916231155396, "reward_std": 0.8499678373336792, "rewards/rollout_reward_func/mean": 1.0112916231155396, "rewards/rollout_reward_func/std": 1.458616852760315, "sampling/importance_sampling_ratio/max": 0.5599518418312073, "sampling/importance_sampling_ratio/mean": 0.23131190240383148, "sampling/importance_sampling_ratio/min": 4.164049549615356e-09, "sampling/sampling_logp_difference/max": 11.776033401489258, "sampling/sampling_logp_difference/mean": 1.0889757871627808, "step": 635, "step_time": 9.216930889997457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.499004364013672, "epoch": 0.00636, "grad_norm": 0.09628774970769882, "kl": 0.4329771548509598, "learning_rate": 9.99983339335177e-06, "loss": -0.0117, "step": 636, "step_time": 5.249082361995534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 1126.0, "completions/max_terminated_length": 1126.0, "completions/mean_length": 474.5, "completions/mean_terminated_length": 474.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.155948102474213, "epoch": 0.00637, "frac_reward_zero_std": 0.0, "grad_norm": 0.16893190145492554, "kl": 0.20477932505309582, "learning_rate": 9.99983283753761e-06, "loss": -0.0088, "num_tokens": 14504280.0, "reward": 0.8850740194320679, "reward_std": 1.7180206775665283, "rewards/rollout_reward_func/mean": 0.8850740194320679, "rewards/rollout_reward_func/std": 1.789468765258789, "sampling/importance_sampling_ratio/max": 0.5211223363876343, "sampling/importance_sampling_ratio/mean": 0.06692801415920258, "sampling/importance_sampling_ratio/min": 5.038583344958025e-18, "sampling/sampling_logp_difference/max": 11.833430290222168, "sampling/sampling_logp_difference/mean": 1.5426719188690186, "step": 637, "step_time": 9.606990575000964 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 7.155875861644745, "epoch": 0.00638, "grad_norm": 0.01780097559094429, "kl": 0.20988573972135782, "learning_rate": 9.99983228079789e-06, "loss": -0.0092, "step": 638, "step_time": 5.667675976001192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.018229166977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018229166977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 1760.0, "completions/max_terminated_length": 1760.0, "completions/mean_length": 465.40625, "completions/mean_terminated_length": 465.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.434506744146347, "epoch": 0.00639, "frac_reward_zero_std": 0.25, "grad_norm": 0.06098285689949989, "kl": 0.4374343380331993, "learning_rate": 9.999831723132612e-06, "loss": -0.0111, "num_tokens": 14554709.0, "reward": 1.3615632057189941, "reward_std": 1.1357418298721313, "rewards/rollout_reward_func/mean": 1.3615632057189941, "rewards/rollout_reward_func/std": 1.3069230318069458, "sampling/importance_sampling_ratio/max": 0.5631871819496155, "sampling/importance_sampling_ratio/mean": 0.22367985546588898, "sampling/importance_sampling_ratio/min": 6.241694938680098e-10, "sampling/sampling_logp_difference/max": 8.898721694946289, "sampling/sampling_logp_difference/mean": 1.1255009174346924, "step": 639, "step_time": 11.52477364600054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.4390751123428345, "epoch": 0.0064, "grad_norm": 0.05371527746319771, "kl": 0.43632080778479576, "learning_rate": 9.999831164541778e-06, "loss": -0.0112, "step": 640, "step_time": 6.373686003997136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 269.96875, "completions/mean_terminated_length": 269.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.617934942245483, "epoch": 0.00641, "frac_reward_zero_std": 0.0, "grad_norm": 0.018730906769633293, "kl": 0.3519375566393137, "learning_rate": 9.999830605025384e-06, "loss": -0.0066, "num_tokens": 14598959.0, "reward": 0.6408053636550903, "reward_std": 1.5393576622009277, "rewards/rollout_reward_func/mean": 0.6408053636550903, "rewards/rollout_reward_func/std": 1.5420422554016113, "sampling/importance_sampling_ratio/max": 0.5574700832366943, "sampling/importance_sampling_ratio/mean": 0.23730811476707458, "sampling/importance_sampling_ratio/min": 5.25957511854358e-06, "sampling/sampling_logp_difference/max": 3.286466360092163, "sampling/sampling_logp_difference/mean": 1.0836137533187866, "step": 641, "step_time": 8.85174307100533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.612747490406036, "epoch": 0.00642, "grad_norm": 0.020303644239902496, "kl": 0.35167563892900944, "learning_rate": 9.999830044583436e-06, "loss": -0.0067, "step": 642, "step_time": 4.728538244009542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 233.25, "completions/mean_terminated_length": 233.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.303256064653397, "epoch": 0.00643, "frac_reward_zero_std": 0.0, "grad_norm": 0.06228810176253319, "kl": 0.763526102527976, "learning_rate": 9.999829483215928e-06, "loss": -0.0054, "num_tokens": 14641993.0, "reward": 1.0202488899230957, "reward_std": 1.508049488067627, "rewards/rollout_reward_func/mean": 1.0202488899230957, "rewards/rollout_reward_func/std": 1.7243781089782715, "sampling/importance_sampling_ratio/max": 0.5626000761985779, "sampling/importance_sampling_ratio/mean": 0.24166902899742126, "sampling/importance_sampling_ratio/min": 1.9609895651750975e-10, "sampling/sampling_logp_difference/max": 4.322971820831299, "sampling/sampling_logp_difference/mean": 1.0414931774139404, "step": 643, "step_time": 7.4891044159958255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.309182196855545, "epoch": 0.00644, "grad_norm": 0.059333667159080505, "kl": 0.7672412265092134, "learning_rate": 9.999828920922866e-06, "loss": -0.0056, "step": 644, "step_time": 4.519783104999078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 278.71875, "completions/mean_terminated_length": 278.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.177899658679962, "epoch": 0.00645, "frac_reward_zero_std": 0.0, "grad_norm": 0.021517077460885048, "kl": 0.561384379863739, "learning_rate": 9.999828357704242e-06, "loss": -0.0131, "num_tokens": 14685877.0, "reward": 0.6891920566558838, "reward_std": 1.1377370357513428, "rewards/rollout_reward_func/mean": 0.6891920566558838, "rewards/rollout_reward_func/std": 1.7120522260665894, "sampling/importance_sampling_ratio/max": 0.5614084005355835, "sampling/importance_sampling_ratio/mean": 0.16793674230575562, "sampling/importance_sampling_ratio/min": 1.314450486233909e-07, "sampling/sampling_logp_difference/max": 3.2029075622558594, "sampling/sampling_logp_difference/mean": 1.197238564491272, "step": 645, "step_time": 8.730170868999267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.18704092502594, "epoch": 0.00646, "grad_norm": 0.023204220458865166, "kl": 0.569736760109663, "learning_rate": 9.999827793560063e-06, "loss": -0.0131, "step": 646, "step_time": 4.9277441579979495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 204.4375, "completions/mean_terminated_length": 204.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.709414958953857, "epoch": 0.00647, "frac_reward_zero_std": 0.25, "grad_norm": 0.03866943344473839, "kl": 0.39509544894099236, "learning_rate": 9.999827228490327e-06, "loss": -0.0009, "num_tokens": 14727234.0, "reward": 1.3353166580200195, "reward_std": 0.9802043437957764, "rewards/rollout_reward_func/mean": 1.3353166580200195, "rewards/rollout_reward_func/std": 1.2533366680145264, "sampling/importance_sampling_ratio/max": 0.5577678680419922, "sampling/importance_sampling_ratio/mean": 0.26036450266838074, "sampling/importance_sampling_ratio/min": 2.10450616577873e-05, "sampling/sampling_logp_difference/max": 4.359549045562744, "sampling/sampling_logp_difference/mean": 1.1134769916534424, "step": 647, "step_time": 8.460396799997397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.689854025840759, "epoch": 0.00648, "grad_norm": 0.0512709878385067, "kl": 0.3969481196254492, "learning_rate": 9.999826662495036e-06, "loss": -0.0011, "step": 648, "step_time": 4.555330414998025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1154.0, "completions/max_terminated_length": 1154.0, "completions/mean_length": 274.5, "completions/mean_terminated_length": 274.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.5325687527656555, "epoch": 0.00649, "frac_reward_zero_std": 0.0, "grad_norm": 0.11776519566774368, "kl": 0.46672242134809494, "learning_rate": 9.999826095574187e-06, "loss": -0.0196, "num_tokens": 14771700.0, "reward": 0.1700505018234253, "reward_std": 1.292714238166809, "rewards/rollout_reward_func/mean": 0.1700505018234253, "rewards/rollout_reward_func/std": 1.5414783954620361, "sampling/importance_sampling_ratio/max": 0.5414243936538696, "sampling/importance_sampling_ratio/mean": 0.15991422533988953, "sampling/importance_sampling_ratio/min": 8.790019358001153e-14, "sampling/sampling_logp_difference/max": 3.9300010204315186, "sampling/sampling_logp_difference/mean": 1.3532629013061523, "step": 649, "step_time": 9.844461377000698 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 6.530853867530823, "epoch": 0.0065, "grad_norm": 0.09538628906011581, "kl": 0.4678298141807318, "learning_rate": 9.999825527727781e-06, "loss": -0.0198, "step": 650, "step_time": 5.163286544000584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1540.0, "completions/max_terminated_length": 1540.0, "completions/mean_length": 457.8125, "completions/mean_terminated_length": 457.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.54302442073822, "epoch": 0.00651, "frac_reward_zero_std": 0.25, "grad_norm": 0.027502773329615593, "kl": 0.4639660604298115, "learning_rate": 9.99982495895582e-06, "loss": -0.0078, "num_tokens": 14822445.0, "reward": 1.4741454124450684, "reward_std": 1.151193618774414, "rewards/rollout_reward_func/mean": 1.4741454124450684, "rewards/rollout_reward_func/std": 1.3719367980957031, "sampling/importance_sampling_ratio/max": 0.5629721283912659, "sampling/importance_sampling_ratio/mean": 0.1885615885257721, "sampling/importance_sampling_ratio/min": 7.742747584416065e-06, "sampling/sampling_logp_difference/max": 2.9117376804351807, "sampling/sampling_logp_difference/mean": 1.0684657096862793, "step": 651, "step_time": 10.80053201199189 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 5.54695725440979, "epoch": 0.00652, "grad_norm": 0.026301171630620956, "kl": 0.4630935303866863, "learning_rate": 9.999824389258302e-06, "loss": -0.0078, "step": 652, "step_time": 5.982943333005096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1250.0, "completions/max_terminated_length": 1250.0, "completions/mean_length": 264.25, "completions/mean_terminated_length": 264.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.630528688430786, "epoch": 0.00653, "frac_reward_zero_std": 0.0, "grad_norm": 0.16552560031414032, "kl": 0.6273646578192711, "learning_rate": 9.999823818635227e-06, "loss": -0.0119, "num_tokens": 14865475.0, "reward": 0.10562598705291748, "reward_std": 1.2118816375732422, "rewards/rollout_reward_func/mean": 0.10562598705291748, "rewards/rollout_reward_func/std": 1.7998372316360474, "sampling/importance_sampling_ratio/max": 0.553798258304596, "sampling/importance_sampling_ratio/mean": 0.20061586797237396, "sampling/importance_sampling_ratio/min": 1.1676002031890675e-05, "sampling/sampling_logp_difference/max": 2.9243626594543457, "sampling/sampling_logp_difference/mean": 0.9509320259094238, "step": 653, "step_time": 9.814942144999804 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.0182291679084301, "clip_ratio/low_mean": 0.1119791679084301, "clip_ratio/low_min": 0.05208333395421505, "clip_ratio/region_mean": 0.1302083358168602, "entropy": 5.880203664302826, "epoch": 0.00654, "grad_norm": 0.03374994173645973, "kl": 0.6290454231202602, "learning_rate": 9.9998232470866e-06, "loss": -0.013, "step": 654, "step_time": 5.3447274169921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1241.0, "completions/max_terminated_length": 1241.0, "completions/mean_length": 493.5, "completions/mean_terminated_length": 506.70001220703125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.778140008449554, "epoch": 0.00655, "frac_reward_zero_std": 0.0, "grad_norm": 0.017467761412262917, "kl": 0.3539469726383686, "learning_rate": 9.999822674612414e-06, "loss": -0.017, "num_tokens": 14918077.0, "reward": 0.858413577079773, "reward_std": 1.8964989185333252, "rewards/rollout_reward_func/mean": 0.858413577079773, "rewards/rollout_reward_func/std": 1.8615925312042236, "sampling/importance_sampling_ratio/max": 0.31525254249572754, "sampling/importance_sampling_ratio/mean": 0.07373639941215515, "sampling/importance_sampling_ratio/min": 1.7084543135230485e-19, "sampling/sampling_logp_difference/max": 4.599559783935547, "sampling/sampling_logp_difference/mean": 1.4647977352142334, "step": 655, "step_time": 10.373035454998899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.780952870845795, "epoch": 0.00656, "grad_norm": 0.017075715586543083, "kl": 0.358057601377368, "learning_rate": 9.999822101212674e-06, "loss": -0.017, "step": 656, "step_time": 5.475165460993594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1622.0, "completions/max_terminated_length": 1622.0, "completions/mean_length": 492.90625, "completions/mean_terminated_length": 492.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.146909773349762, "epoch": 0.00657, "frac_reward_zero_std": 0.0, "grad_norm": 0.06275570392608643, "kl": 0.48388853296637535, "learning_rate": 9.999821526887376e-06, "loss": -0.0088, "num_tokens": 14969539.0, "reward": 0.8116989135742188, "reward_std": 1.3659424781799316, "rewards/rollout_reward_func/mean": 0.8116989135742188, "rewards/rollout_reward_func/std": 1.313522458076477, "sampling/importance_sampling_ratio/max": 0.5586984157562256, "sampling/importance_sampling_ratio/mean": 0.15311051905155182, "sampling/importance_sampling_ratio/min": 1.5325246553097776e-18, "sampling/sampling_logp_difference/max": 4.023992538452148, "sampling/sampling_logp_difference/mean": 1.2495191097259521, "step": 657, "step_time": 11.366350113999943 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 6.159075856208801, "epoch": 0.00658, "grad_norm": 0.05599598214030266, "kl": 0.48616528883576393, "learning_rate": 9.999820951636526e-06, "loss": -0.0088, "step": 658, "step_time": 6.250479831003759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 219.21875, "completions/mean_terminated_length": 219.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.51883602142334, "epoch": 0.00659, "frac_reward_zero_std": 0.25, "grad_norm": 0.019505219534039497, "kl": 0.5023592747747898, "learning_rate": 9.99982037546012e-06, "loss": -0.0145, "num_tokens": 15011278.0, "reward": 0.9727489352226257, "reward_std": 0.991841197013855, "rewards/rollout_reward_func/mean": 0.9727489352226257, "rewards/rollout_reward_func/std": 1.5973271131515503, "sampling/importance_sampling_ratio/max": 0.5557390451431274, "sampling/importance_sampling_ratio/mean": 0.24469415843486786, "sampling/importance_sampling_ratio/min": 2.4370568063591946e-11, "sampling/sampling_logp_difference/max": 4.2054033279418945, "sampling/sampling_logp_difference/mean": 1.082698106765747, "step": 659, "step_time": 9.276107730001968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.512373924255371, "epoch": 0.0066, "grad_norm": 0.019902333617210388, "kl": 0.5040965210646391, "learning_rate": 9.999819798358157e-06, "loss": -0.0144, "step": 660, "step_time": 4.809091000992339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1036.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 234.21875, "completions/mean_terminated_length": 234.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.877407491207123, "epoch": 0.00661, "frac_reward_zero_std": 0.25, "grad_norm": 0.007844503037631512, "kl": 0.5546223297715187, "learning_rate": 9.999819220330643e-06, "loss": -0.0125, "num_tokens": 15053270.0, "reward": 1.2058944702148438, "reward_std": 0.8685800433158875, "rewards/rollout_reward_func/mean": 1.2058944702148438, "rewards/rollout_reward_func/std": 1.5956984758377075, "sampling/importance_sampling_ratio/max": 0.5597085356712341, "sampling/importance_sampling_ratio/mean": 0.202170729637146, "sampling/importance_sampling_ratio/min": 2.564033366070362e-06, "sampling/sampling_logp_difference/max": 4.614621162414551, "sampling/sampling_logp_difference/mean": 1.2160043716430664, "step": 661, "step_time": 9.090164415996696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.867081582546234, "epoch": 0.00662, "grad_norm": 0.007499001454561949, "kl": 0.5547795668244362, "learning_rate": 9.99981864137757e-06, "loss": -0.0124, "step": 662, "step_time": 4.928739184997539 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 1468.0, "completions/max_terminated_length": 1468.0, "completions/mean_length": 365.15625, "completions/mean_terminated_length": 365.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.02456396818161, "epoch": 0.00663, "frac_reward_zero_std": 0.25, "grad_norm": 0.007818282581865788, "kl": 0.5122385621070862, "learning_rate": 9.999818061498945e-06, "loss": -0.0091, "num_tokens": 15099015.0, "reward": 1.6843271255493164, "reward_std": 1.0935523509979248, "rewards/rollout_reward_func/mean": 1.6843271255493164, "rewards/rollout_reward_func/std": 1.3741456270217896, "sampling/importance_sampling_ratio/max": 0.5509166121482849, "sampling/importance_sampling_ratio/mean": 0.23690803349018097, "sampling/importance_sampling_ratio/min": 2.715561890909157e-07, "sampling/sampling_logp_difference/max": 2.829892635345459, "sampling/sampling_logp_difference/mean": 0.9976156949996948, "step": 663, "step_time": 10.098226842990698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.0160452127456665, "epoch": 0.00664, "grad_norm": 0.007657638750970364, "kl": 0.5118620339781046, "learning_rate": 9.999817480694764e-06, "loss": -0.0091, "step": 664, "step_time": 6.275632767996285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1109.0, "completions/max_terminated_length": 1109.0, "completions/mean_length": 330.28125, "completions/mean_terminated_length": 330.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.396239638328552, "epoch": 0.00665, "frac_reward_zero_std": 0.25, "grad_norm": 0.03345051780343056, "kl": 0.443832503631711, "learning_rate": 9.99981689896503e-06, "loss": -0.011, "num_tokens": 15144041.0, "reward": 1.1442493200302124, "reward_std": 1.174126386642456, "rewards/rollout_reward_func/mean": 1.1442493200302124, "rewards/rollout_reward_func/std": 1.3841869831085205, "sampling/importance_sampling_ratio/max": 0.5551058650016785, "sampling/importance_sampling_ratio/mean": 0.21558807790279388, "sampling/importance_sampling_ratio/min": 0.0002794322499539703, "sampling/sampling_logp_difference/max": 3.04229998588562, "sampling/sampling_logp_difference/mean": 1.0146995782852173, "step": 665, "step_time": 9.336107374001585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.390744924545288, "epoch": 0.00666, "grad_norm": 0.03293323889374733, "kl": 0.44296923093497753, "learning_rate": 9.99981631630974e-06, "loss": -0.0111, "step": 666, "step_time": 5.197757273002935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1227.0, "completions/max_terminated_length": 1227.0, "completions/mean_length": 350.34375, "completions/mean_terminated_length": 361.1290283203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.399774789810181, "epoch": 0.00667, "frac_reward_zero_std": 0.0, "grad_norm": 0.030353715643286705, "kl": 0.3503610440529883, "learning_rate": 9.999815732728897e-06, "loss": -0.0078, "num_tokens": 15190872.0, "reward": 0.07607533782720566, "reward_std": 1.6014677286148071, "rewards/rollout_reward_func/mean": 0.07607533782720566, "rewards/rollout_reward_func/std": 1.6029101610183716, "sampling/importance_sampling_ratio/max": 0.5526623725891113, "sampling/importance_sampling_ratio/mean": 0.08213867247104645, "sampling/importance_sampling_ratio/min": 7.62296367522175e-13, "sampling/sampling_logp_difference/max": 12.101319313049316, "sampling/sampling_logp_difference/mean": 1.623478651046753, "step": 667, "step_time": 10.308199159004289 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 7.382829487323761, "epoch": 0.00668, "grad_norm": 0.023239707574248314, "kl": 0.3268069280311465, "learning_rate": 9.9998151482225e-06, "loss": -0.0079, "step": 668, "step_time": 5.4062019099947065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1400.0, "completions/max_terminated_length": 1400.0, "completions/mean_length": 258.40625, "completions/mean_terminated_length": 266.2257995605469, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.075611889362335, "epoch": 0.00669, "frac_reward_zero_std": 0.25, "grad_norm": 0.020405348390340805, "kl": 0.47539790719747543, "learning_rate": 9.99981456279055e-06, "loss": -0.0068, "num_tokens": 15232887.0, "reward": 1.1542556285858154, "reward_std": 0.9877707362174988, "rewards/rollout_reward_func/mean": 1.1542556285858154, "rewards/rollout_reward_func/std": 1.6958718299865723, "sampling/importance_sampling_ratio/max": 0.560344934463501, "sampling/importance_sampling_ratio/mean": 0.2061402052640915, "sampling/importance_sampling_ratio/min": 1.6726775888073936e-18, "sampling/sampling_logp_difference/max": 12.32072639465332, "sampling/sampling_logp_difference/mean": 1.3028512001037598, "step": 669, "step_time": 9.793694370000594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.050284028053284, "epoch": 0.0067, "grad_norm": 0.019628724083304405, "kl": 0.4728831499814987, "learning_rate": 9.999813976433047e-06, "loss": -0.0068, "step": 670, "step_time": 5.740355595004075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 274.125, "completions/mean_terminated_length": 274.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.175561189651489, "epoch": 0.00671, "frac_reward_zero_std": 0.0, "grad_norm": 2.229891300201416, "kl": 3.493888095021248, "learning_rate": 9.99981338914999e-06, "loss": -0.0048, "num_tokens": 15277141.0, "reward": 1.1244105100631714, "reward_std": 1.5933701992034912, "rewards/rollout_reward_func/mean": 1.1244105100631714, "rewards/rollout_reward_func/std": 1.5317234992980957, "sampling/importance_sampling_ratio/max": 0.5509688854217529, "sampling/importance_sampling_ratio/mean": 0.17699261009693146, "sampling/importance_sampling_ratio/min": 1.8817718216990187e-11, "sampling/sampling_logp_difference/max": 11.623828887939453, "sampling/sampling_logp_difference/mean": 1.3811750411987305, "step": 671, "step_time": 8.06385398299608 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 6.232859015464783, "epoch": 0.00672, "grad_norm": 0.024308258667588234, "kl": 0.5079888179898262, "learning_rate": 9.99981280094138e-06, "loss": -0.0162, "step": 672, "step_time": 4.514473300994723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1202.0, "completions/max_terminated_length": 1202.0, "completions/mean_length": 253.625, "completions/mean_terminated_length": 253.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.6543625593185425, "epoch": 0.00673, "frac_reward_zero_std": 0.5, "grad_norm": 0.006255032494664192, "kl": 0.5609371997416019, "learning_rate": 9.999812211807216e-06, "loss": -0.0026, "num_tokens": 15318314.0, "reward": 0.9982370734214783, "reward_std": 0.6941738128662109, "rewards/rollout_reward_func/mean": 0.9982370734214783, "rewards/rollout_reward_func/std": 1.4541690349578857, "sampling/importance_sampling_ratio/max": 0.5592724084854126, "sampling/importance_sampling_ratio/mean": 0.3099770247936249, "sampling/importance_sampling_ratio/min": 9.59366660007073e-14, "sampling/sampling_logp_difference/max": 10.92684268951416, "sampling/sampling_logp_difference/mean": 1.2590910196304321, "step": 673, "step_time": 9.753681531994516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.704591065645218, "epoch": 0.00674, "grad_norm": 0.0071292719803750515, "kl": 0.5626557804644108, "learning_rate": 9.9998116217475e-06, "loss": -0.0026, "step": 674, "step_time": 5.300471314989409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1029.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 198.21875, "completions/mean_terminated_length": 198.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.854500651359558, "epoch": 0.00675, "frac_reward_zero_std": 0.75, "grad_norm": 0.011607320047914982, "kl": 0.7319341264665127, "learning_rate": 9.99981103076223e-06, "loss": -0.0004, "num_tokens": 15356033.0, "reward": 1.678581714630127, "reward_std": 0.3093111217021942, "rewards/rollout_reward_func/mean": 1.678581714630127, "rewards/rollout_reward_func/std": 0.8088850378990173, "sampling/importance_sampling_ratio/max": 0.5582253932952881, "sampling/importance_sampling_ratio/mean": 0.4189662039279938, "sampling/importance_sampling_ratio/min": 2.6445568437338807e-06, "sampling/sampling_logp_difference/max": 2.3361361026763916, "sampling/sampling_logp_difference/mean": 0.6183323860168457, "step": 675, "step_time": 8.6469719050001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.890291303396225, "epoch": 0.00676, "grad_norm": 0.011821780353784561, "kl": 0.7300436720252037, "learning_rate": 9.999810438851407e-06, "loss": -0.0004, "step": 676, "step_time": 5.0661307020054664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1475.0, "completions/max_terminated_length": 1475.0, "completions/mean_length": 415.40625, "completions/mean_terminated_length": 415.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.721070289611816, "epoch": 0.00677, "frac_reward_zero_std": 0.25, "grad_norm": 0.033724576234817505, "kl": 0.38979604467749596, "learning_rate": 9.999809846015032e-06, "loss": -0.004, "num_tokens": 15403349.0, "reward": 0.7839260101318359, "reward_std": 1.3350179195404053, "rewards/rollout_reward_func/mean": 0.7839260101318359, "rewards/rollout_reward_func/std": 1.6493858098983765, "sampling/importance_sampling_ratio/max": 0.5569285750389099, "sampling/importance_sampling_ratio/mean": 0.17355726659297943, "sampling/importance_sampling_ratio/min": 3.976083107508008e-14, "sampling/sampling_logp_difference/max": 4.651988983154297, "sampling/sampling_logp_difference/mean": 1.4645905494689941, "step": 677, "step_time": 10.622884445998352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.734566271305084, "epoch": 0.00678, "grad_norm": 0.03316821530461311, "kl": 0.38874663040041924, "learning_rate": 9.999809252253105e-06, "loss": -0.0039, "step": 678, "step_time": 6.31442034999418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 222.40625, "completions/mean_terminated_length": 222.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.254373222589493, "epoch": 0.00679, "frac_reward_zero_std": 0.25, "grad_norm": 0.019153395667672157, "kl": 0.4288297686725855, "learning_rate": 9.999808657565626e-06, "loss": -0.0046, "num_tokens": 15443776.0, "reward": 0.5710574388504028, "reward_std": 0.8554186820983887, "rewards/rollout_reward_func/mean": 0.5710574388504028, "rewards/rollout_reward_func/std": 1.5479594469070435, "sampling/importance_sampling_ratio/max": 0.5602561831474304, "sampling/importance_sampling_ratio/mean": 0.2493210732936859, "sampling/importance_sampling_ratio/min": 2.0307481918280246e-08, "sampling/sampling_logp_difference/max": 3.045928955078125, "sampling/sampling_logp_difference/mean": 1.4391887187957764, "step": 679, "step_time": 7.406683081000665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.241820603609085, "epoch": 0.0068, "grad_norm": 0.018210556358098984, "kl": 0.42704640701413155, "learning_rate": 9.999808061952593e-06, "loss": -0.0046, "step": 680, "step_time": 4.086445976005052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1475.0, "completions/max_terminated_length": 1475.0, "completions/mean_length": 311.84375, "completions/mean_terminated_length": 311.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.2566282749176025, "epoch": 0.00681, "frac_reward_zero_std": 0.0, "grad_norm": 0.02834552526473999, "kl": 0.701028760522604, "learning_rate": 9.999807465414011e-06, "loss": -0.0103, "num_tokens": 15486473.0, "reward": 1.2491767406463623, "reward_std": 1.5799775123596191, "rewards/rollout_reward_func/mean": 1.2491767406463623, "rewards/rollout_reward_func/std": 1.6174430847167969, "sampling/importance_sampling_ratio/max": 0.5587838888168335, "sampling/importance_sampling_ratio/mean": 0.23794753849506378, "sampling/importance_sampling_ratio/min": 2.6551731480140006e-06, "sampling/sampling_logp_difference/max": 4.4764084815979, "sampling/sampling_logp_difference/mean": 0.9761773943901062, "step": 681, "step_time": 10.739386261997424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.2580413818359375, "epoch": 0.00682, "grad_norm": 0.028381751850247383, "kl": 0.6996622011065483, "learning_rate": 9.999806867949875e-06, "loss": -0.0103, "step": 682, "step_time": 6.372372712001379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1300.0, "completions/max_terminated_length": 1300.0, "completions/mean_length": 396.53125, "completions/mean_terminated_length": 396.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.418716490268707, "epoch": 0.00683, "frac_reward_zero_std": 0.0, "grad_norm": 0.0497143492102623, "kl": 0.39606258273124695, "learning_rate": 9.999806269560189e-06, "loss": -0.0117, "num_tokens": 15533504.0, "reward": 0.22866004705429077, "reward_std": 1.6302701234817505, "rewards/rollout_reward_func/mean": 0.22866004705429077, "rewards/rollout_reward_func/std": 1.8287628889083862, "sampling/importance_sampling_ratio/max": 0.5579904913902283, "sampling/importance_sampling_ratio/mean": 0.1537972241640091, "sampling/importance_sampling_ratio/min": 5.682339931922797e-09, "sampling/sampling_logp_difference/max": 4.118141174316406, "sampling/sampling_logp_difference/mean": 1.273179292678833, "step": 683, "step_time": 10.042830067995965 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 6.3939924240112305, "epoch": 0.00684, "grad_norm": 0.04960935190320015, "kl": 0.39607035368680954, "learning_rate": 9.99980567024495e-06, "loss": -0.0117, "step": 684, "step_time": 5.760272399998939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1481.0, "completions/max_terminated_length": 1481.0, "completions/mean_length": 591.0, "completions/mean_terminated_length": 591.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.471676170825958, "epoch": 0.00685, "frac_reward_zero_std": 0.0, "grad_norm": 0.05152904614806175, "kl": 0.4741202052682638, "learning_rate": 9.99980507000416e-06, "loss": -0.0051, "num_tokens": 15588393.0, "reward": 0.5455388426780701, "reward_std": 1.515275239944458, "rewards/rollout_reward_func/mean": 0.5455388426780701, "rewards/rollout_reward_func/std": 1.6454240083694458, "sampling/importance_sampling_ratio/max": 0.5545340776443481, "sampling/importance_sampling_ratio/mean": 0.13203741610050201, "sampling/importance_sampling_ratio/min": 2.3602205496856376e-19, "sampling/sampling_logp_difference/max": 4.873176097869873, "sampling/sampling_logp_difference/mean": 1.260620355606079, "step": 685, "step_time": 11.183052979002241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 6.4453243017196655, "epoch": 0.00686, "grad_norm": 0.0194889884442091, "kl": 0.45367528684437275, "learning_rate": 9.999804468837818e-06, "loss": -0.0052, "step": 686, "step_time": 6.024035402002482 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 106.40625, "completions/mean_terminated_length": 106.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.595583736896515, "epoch": 0.00687, "frac_reward_zero_std": 0.5, "grad_norm": 0.006608299911022186, "kl": 0.574263721704483, "learning_rate": 9.999803866745927e-06, "loss": -0.0039, "num_tokens": 15623538.0, "reward": 1.3443982601165771, "reward_std": 0.9863226413726807, "rewards/rollout_reward_func/mean": 1.3443982601165771, "rewards/rollout_reward_func/std": 1.5236314535140991, "sampling/importance_sampling_ratio/max": 0.5578537583351135, "sampling/importance_sampling_ratio/mean": 0.40209388732910156, "sampling/importance_sampling_ratio/min": 2.4083084415593703e-09, "sampling/sampling_logp_difference/max": 2.940959930419922, "sampling/sampling_logp_difference/mean": 0.9028031229972839, "step": 687, "step_time": 7.739407283996115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.581540495157242, "epoch": 0.00688, "grad_norm": 0.0065726637840271, "kl": 0.5766596868634224, "learning_rate": 9.999803263728482e-06, "loss": -0.0039, "step": 688, "step_time": 4.542906637005217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 1349.0, "completions/max_terminated_length": 1349.0, "completions/mean_length": 360.125, "completions/mean_terminated_length": 360.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.1729496121406555, "epoch": 0.00689, "frac_reward_zero_std": 0.0, "grad_norm": 0.06801466643810272, "kl": 0.3409823700785637, "learning_rate": 9.999802659785488e-06, "loss": -0.0122, "num_tokens": 15670234.0, "reward": -0.21057918667793274, "reward_std": 1.5661436319351196, "rewards/rollout_reward_func/mean": -0.21057918667793274, "rewards/rollout_reward_func/std": 1.5595225095748901, "sampling/importance_sampling_ratio/max": 0.5922987461090088, "sampling/importance_sampling_ratio/mean": 0.16073165833950043, "sampling/importance_sampling_ratio/min": 1.1084776474490354e-07, "sampling/sampling_logp_difference/max": 3.883821487426758, "sampling/sampling_logp_difference/mean": 1.1459715366363525, "step": 689, "step_time": 10.51556462499866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "entropy": 6.154626846313477, "epoch": 0.0069, "grad_norm": 0.08165623992681503, "kl": 0.3421839587390423, "learning_rate": 9.999802054916945e-06, "loss": -0.0125, "step": 690, "step_time": 5.883754828999372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1280.0, "completions/mean_length": 420.75, "completions/mean_terminated_length": 424.8333435058594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.060508489608765, "epoch": 0.00691, "frac_reward_zero_std": 0.0, "grad_norm": 0.07874833792448044, "kl": 0.5106629319489002, "learning_rate": 9.99980144912285e-06, "loss": -0.0072, "num_tokens": 15719818.0, "reward": 0.9100795984268188, "reward_std": 1.202582597732544, "rewards/rollout_reward_func/mean": 0.9100795984268188, "rewards/rollout_reward_func/std": 1.3772863149642944, "sampling/importance_sampling_ratio/max": 0.5566491484642029, "sampling/importance_sampling_ratio/mean": 0.18108732998371124, "sampling/importance_sampling_ratio/min": 1.0694489591860437e-13, "sampling/sampling_logp_difference/max": 4.558590888977051, "sampling/sampling_logp_difference/mean": 1.1503264904022217, "step": 691, "step_time": 10.446937812004762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.029834091663361, "epoch": 0.00692, "grad_norm": 0.0753236711025238, "kl": 0.524703155271709, "learning_rate": 9.999800842403203e-06, "loss": -0.0076, "step": 692, "step_time": 5.528459961995395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1569.0, "completions/max_terminated_length": 1569.0, "completions/mean_length": 352.15625, "completions/mean_terminated_length": 352.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.972200483083725, "epoch": 0.00693, "frac_reward_zero_std": 0.25, "grad_norm": 0.028479205444455147, "kl": 0.46673058718442917, "learning_rate": 9.999800234758007e-06, "loss": -0.0056, "num_tokens": 15764003.0, "reward": 0.9284811615943909, "reward_std": 0.8938922882080078, "rewards/rollout_reward_func/mean": 0.9284811615943909, "rewards/rollout_reward_func/std": 1.315114140510559, "sampling/importance_sampling_ratio/max": 0.5605376362800598, "sampling/importance_sampling_ratio/mean": 0.2751838266849518, "sampling/importance_sampling_ratio/min": 1.305662493678028e-07, "sampling/sampling_logp_difference/max": 3.7097225189208984, "sampling/sampling_logp_difference/mean": 0.9030347466468811, "step": 693, "step_time": 10.945457427002111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.959029018878937, "epoch": 0.00694, "grad_norm": 0.02759494259953499, "kl": 0.470059335231781, "learning_rate": 9.999799626187263e-06, "loss": -0.0057, "step": 694, "step_time": 5.883698161003849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003289473708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003289473708719015, "completions/clipped_ratio": 0.03125, "completions/max_length": 1219.0, "completions/max_terminated_length": 1219.0, "completions/mean_length": 218.96875, "completions/mean_terminated_length": 225.51612854003906, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.569039821624756, "epoch": 0.00695, "frac_reward_zero_std": 0.0, "grad_norm": 0.1051386222243309, "kl": 0.4974827356636524, "learning_rate": 9.999799016690968e-06, "loss": -0.0156, "num_tokens": 15804961.0, "reward": 0.29406502842903137, "reward_std": 1.2428635358810425, "rewards/rollout_reward_func/mean": 0.29406502842903137, "rewards/rollout_reward_func/std": 1.3738200664520264, "sampling/importance_sampling_ratio/max": 0.5559452772140503, "sampling/importance_sampling_ratio/mean": 0.21754232048988342, "sampling/importance_sampling_ratio/min": 5.867198948905639e-10, "sampling/sampling_logp_difference/max": 5.001742839813232, "sampling/sampling_logp_difference/mean": 1.0879000425338745, "step": 695, "step_time": 9.798210088003543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 5.624884843826294, "epoch": 0.00696, "grad_norm": 0.08344905823469162, "kl": 0.49215418845415115, "learning_rate": 9.999798406269121e-06, "loss": -0.0159, "step": 696, "step_time": 5.2444967870033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004166666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "completions/clipped_ratio": 0.0, "completions/max_length": 1062.0, "completions/max_terminated_length": 1062.0, "completions/mean_length": 341.09375, "completions/mean_terminated_length": 341.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.930630624294281, "epoch": 0.00697, "frac_reward_zero_std": 0.0, "grad_norm": 0.02435060776770115, "kl": 0.34420883283019066, "learning_rate": 9.999797794921726e-06, "loss": -0.0053, "num_tokens": 15851340.0, "reward": 1.3735668659210205, "reward_std": 1.8495086431503296, "rewards/rollout_reward_func/mean": 1.3735668659210205, "rewards/rollout_reward_func/std": 1.8214725255966187, "sampling/importance_sampling_ratio/max": 0.5308433771133423, "sampling/importance_sampling_ratio/mean": 0.07869640737771988, "sampling/importance_sampling_ratio/min": 1.7280348263426504e-09, "sampling/sampling_logp_difference/max": 5.236002445220947, "sampling/sampling_logp_difference/mean": 1.4310535192489624, "step": 697, "step_time": 9.295707091001532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004166666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "entropy": 6.916065335273743, "epoch": 0.00698, "grad_norm": 0.022338205948472023, "kl": 0.35256815422326326, "learning_rate": 9.999797182648783e-06, "loss": -0.0054, "step": 698, "step_time": 4.993093626006157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 916.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 215.4375, "completions/mean_terminated_length": 192.8386993408203, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.062793076038361, "epoch": 0.00699, "frac_reward_zero_std": 0.5, "grad_norm": 0.05680130794644356, "kl": 0.7677697613835335, "learning_rate": 9.999796569450289e-06, "loss": -0.0032, "num_tokens": 15890811.0, "reward": 1.0781958103179932, "reward_std": 0.9168141484260559, "rewards/rollout_reward_func/mean": 1.0781958103179932, "rewards/rollout_reward_func/std": 1.565596342086792, "sampling/importance_sampling_ratio/max": 0.5509035587310791, "sampling/importance_sampling_ratio/mean": 0.28937771916389465, "sampling/importance_sampling_ratio/min": 9.890084584629477e-11, "sampling/sampling_logp_difference/max": 10.06862735748291, "sampling/sampling_logp_difference/mean": 0.8970615267753601, "step": 699, "step_time": 9.269813223996607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.075249791145325, "epoch": 0.007, "grad_norm": 0.06615661084651947, "kl": 0.7610219232738018, "learning_rate": 9.999795955326245e-06, "loss": -0.0034, "step": 700, "step_time": 4.750371329999325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1185.0, "completions/max_terminated_length": 1185.0, "completions/mean_length": 413.71875, "completions/mean_terminated_length": 413.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.0921953320503235, "epoch": 0.00701, "frac_reward_zero_std": 0.0, "grad_norm": 0.05877472460269928, "kl": 0.4882674813270569, "learning_rate": 9.999795340276655e-06, "loss": -0.0125, "num_tokens": 15941053.0, "reward": 0.8233373165130615, "reward_std": 1.5892090797424316, "rewards/rollout_reward_func/mean": 0.8233373165130615, "rewards/rollout_reward_func/std": 1.6654473543167114, "sampling/importance_sampling_ratio/max": 0.5532178282737732, "sampling/importance_sampling_ratio/mean": 0.14647597074508667, "sampling/importance_sampling_ratio/min": 4.875932587724069e-17, "sampling/sampling_logp_difference/max": 3.7897729873657227, "sampling/sampling_logp_difference/mean": 1.243513584136963, "step": 701, "step_time": 10.465277972001786 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 6.084068953990936, "epoch": 0.00702, "grad_norm": 0.04874364659190178, "kl": 0.49960071220993996, "learning_rate": 9.999794724301514e-06, "loss": -0.0128, "step": 702, "step_time": 5.388886699991417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1163.0, "completions/max_terminated_length": 1163.0, "completions/mean_length": 480.65625, "completions/mean_terminated_length": 480.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.012529373168945, "epoch": 0.00703, "frac_reward_zero_std": 0.0, "grad_norm": 0.018333252519369125, "kl": 0.2573678810149431, "learning_rate": 9.999794107400824e-06, "loss": -0.0071, "num_tokens": 15993204.0, "reward": 0.2728276550769806, "reward_std": 1.45334792137146, "rewards/rollout_reward_func/mean": 0.2728276550769806, "rewards/rollout_reward_func/std": 1.620566725730896, "sampling/importance_sampling_ratio/max": 0.302228182554245, "sampling/importance_sampling_ratio/mean": 0.07174547016620636, "sampling/importance_sampling_ratio/min": 1.401298464324817e-43, "sampling/sampling_logp_difference/max": 4.511527061462402, "sampling/sampling_logp_difference/mean": 1.3173911571502686, "step": 703, "step_time": 9.988230251001369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.999822974205017, "epoch": 0.00704, "grad_norm": 0.018792910501360893, "kl": 0.26068443432450294, "learning_rate": 9.999793489574587e-06, "loss": -0.0071, "step": 704, "step_time": 5.252924181997514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1517.0, "completions/max_terminated_length": 1517.0, "completions/mean_length": 631.28125, "completions/mean_terminated_length": 620.54833984375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.81464946269989, "epoch": 0.00705, "frac_reward_zero_std": 0.25, "grad_norm": 0.010502978228032589, "kl": 0.48945507779717445, "learning_rate": 9.999792870822801e-06, "loss": -0.001, "num_tokens": 16048752.0, "reward": 1.4551923274993896, "reward_std": 1.2561194896697998, "rewards/rollout_reward_func/mean": 1.4551923274993896, "rewards/rollout_reward_func/std": 1.4879974126815796, "sampling/importance_sampling_ratio/max": 0.5585709810256958, "sampling/importance_sampling_ratio/mean": 0.14922238886356354, "sampling/importance_sampling_ratio/min": 4.418620950435034e-24, "sampling/sampling_logp_difference/max": 11.264510154724121, "sampling/sampling_logp_difference/mean": 1.173861026763916, "step": 705, "step_time": 11.79017816400301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.813975989818573, "epoch": 0.00706, "grad_norm": 0.010136735625565052, "kl": 0.48817747458815575, "learning_rate": 9.999792251145466e-06, "loss": -0.001, "step": 706, "step_time": 6.012305637999816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1083.0, "completions/max_terminated_length": 1083.0, "completions/mean_length": 319.875, "completions/mean_terminated_length": 319.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.928636610507965, "epoch": 0.00707, "frac_reward_zero_std": 0.25, "grad_norm": 0.010359545238316059, "kl": 0.40080916695296764, "learning_rate": 9.999791630542584e-06, "loss": -0.0123, "num_tokens": 16094561.0, "reward": 1.1272495985031128, "reward_std": 1.4083166122436523, "rewards/rollout_reward_func/mean": 1.1272495985031128, "rewards/rollout_reward_func/std": 1.74083411693573, "sampling/importance_sampling_ratio/max": 0.5528454780578613, "sampling/importance_sampling_ratio/mean": 0.2010573148727417, "sampling/importance_sampling_ratio/min": 5.338807019231121e-41, "sampling/sampling_logp_difference/max": 3.958313465118408, "sampling/sampling_logp_difference/mean": 1.23238205909729, "step": 707, "step_time": 10.269836500992824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0008012820617295802, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008012820617295802, "entropy": 5.931709051132202, "epoch": 0.00708, "grad_norm": 0.01039818674325943, "kl": 0.4025597833096981, "learning_rate": 9.999791009014154e-06, "loss": -0.0124, "step": 708, "step_time": 5.07940221199533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1293.0, "completions/max_terminated_length": 1293.0, "completions/mean_length": 533.5625, "completions/mean_terminated_length": 533.1000366210938, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.850337028503418, "epoch": 0.00709, "frac_reward_zero_std": 0.25, "grad_norm": 0.02110755443572998, "kl": 0.49636097624897957, "learning_rate": 9.999790386560175e-06, "loss": -0.0118, "num_tokens": 16146167.0, "reward": 1.144986629486084, "reward_std": 1.5440635681152344, "rewards/rollout_reward_func/mean": 1.144986629486084, "rewards/rollout_reward_func/std": 1.8308578729629517, "sampling/importance_sampling_ratio/max": 0.55790776014328, "sampling/importance_sampling_ratio/mean": 0.21980050206184387, "sampling/importance_sampling_ratio/min": 5.605193857299268e-45, "sampling/sampling_logp_difference/max": 4.259402751922607, "sampling/sampling_logp_difference/mean": 1.0225013494491577, "step": 709, "step_time": 10.76619523300542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "entropy": 4.835175096988678, "epoch": 0.0071, "grad_norm": 0.020530568435788155, "kl": 0.4950161948800087, "learning_rate": 9.99978976318065e-06, "loss": -0.0118, "step": 710, "step_time": 5.5876703829962935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0022321429569274187, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022321429569274187, "completions/clipped_ratio": 0.0, "completions/max_length": 1570.0, "completions/max_terminated_length": 1570.0, "completions/mean_length": 528.875, "completions/mean_terminated_length": 528.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.352743625640869, "epoch": 0.00711, "frac_reward_zero_std": 0.25, "grad_norm": 0.09518986940383911, "kl": 0.5990308783948421, "learning_rate": 9.999789138875577e-06, "loss": -0.0057, "num_tokens": 16198167.0, "reward": 1.2203199863433838, "reward_std": 1.2487035989761353, "rewards/rollout_reward_func/mean": 1.2203199863433838, "rewards/rollout_reward_func/std": 1.5820025205612183, "sampling/importance_sampling_ratio/max": 0.5565988421440125, "sampling/importance_sampling_ratio/mean": 0.18458712100982666, "sampling/importance_sampling_ratio/min": 7.040654324607992e-17, "sampling/sampling_logp_difference/max": 12.147419929504395, "sampling/sampling_logp_difference/mean": 1.0294430255889893, "step": 711, "step_time": 12.174985988003755 }, { "clip_ratio/high_max": 0.004464285913854837, "clip_ratio/high_mean": 0.0022321429569274187, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022321429569274187, "entropy": 5.365568161010742, "epoch": 0.00712, "grad_norm": 0.03286381810903549, "kl": 0.5683500915765762, "learning_rate": 9.999788513644958e-06, "loss": -0.006, "step": 712, "step_time": 6.402360085994587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1186.0, "completions/max_terminated_length": 1186.0, "completions/mean_length": 308.40625, "completions/mean_terminated_length": 308.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.782921314239502, "epoch": 0.00713, "frac_reward_zero_std": 0.25, "grad_norm": 0.054727766662836075, "kl": 0.47021846752613783, "learning_rate": 9.999787887488789e-06, "loss": -0.0111, "num_tokens": 16242968.0, "reward": 1.0925992727279663, "reward_std": 1.0131022930145264, "rewards/rollout_reward_func/mean": 1.0925992727279663, "rewards/rollout_reward_func/std": 1.3202331066131592, "sampling/importance_sampling_ratio/max": 0.5613992214202881, "sampling/importance_sampling_ratio/mean": 0.19614891707897186, "sampling/importance_sampling_ratio/min": 0.0002857378567568958, "sampling/sampling_logp_difference/max": 2.7229785919189453, "sampling/sampling_logp_difference/mean": 1.0787100791931152, "step": 713, "step_time": 9.093515627002489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625000465661287, "entropy": 5.783425450325012, "epoch": 0.00714, "grad_norm": 0.019324947148561478, "kl": 0.46397110633552074, "learning_rate": 9.999787260407074e-06, "loss": -0.0112, "step": 714, "step_time": 5.211055109004519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1538.0, "completions/max_terminated_length": 1538.0, "completions/mean_length": 363.1875, "completions/mean_terminated_length": 363.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.715027272701263, "epoch": 0.00715, "frac_reward_zero_std": 0.25, "grad_norm": 0.024548087269067764, "kl": 0.5932558812201023, "learning_rate": 9.999786632399813e-06, "loss": -0.0093, "num_tokens": 16288998.0, "reward": 0.8044400215148926, "reward_std": 1.1714661121368408, "rewards/rollout_reward_func/mean": 0.8044400215148926, "rewards/rollout_reward_func/std": 1.4924389123916626, "sampling/importance_sampling_ratio/max": 0.5504814982414246, "sampling/importance_sampling_ratio/mean": 0.19556696712970734, "sampling/importance_sampling_ratio/min": 2.0196383047732525e-05, "sampling/sampling_logp_difference/max": 3.3294777870178223, "sampling/sampling_logp_difference/mean": 1.0144387483596802, "step": 715, "step_time": 10.405213679001463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.72651880979538, "epoch": 0.00716, "grad_norm": 0.023931045085191727, "kl": 0.5921632423996925, "learning_rate": 9.999786003467005e-06, "loss": -0.0094, "step": 716, "step_time": 6.245636629002547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 281.40625, "completions/mean_terminated_length": 281.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.55021259188652, "epoch": 0.00717, "frac_reward_zero_std": 0.25, "grad_norm": 0.006578054279088974, "kl": 0.46449605468660593, "learning_rate": 9.99978537360865e-06, "loss": -0.0059, "num_tokens": 16332278.0, "reward": 1.073265790939331, "reward_std": 1.2011113166809082, "rewards/rollout_reward_func/mean": 1.073265790939331, "rewards/rollout_reward_func/std": 1.5148231983184814, "sampling/importance_sampling_ratio/max": 0.5487592220306396, "sampling/importance_sampling_ratio/mean": 0.18519897758960724, "sampling/importance_sampling_ratio/min": 2.2059641935356922e-07, "sampling/sampling_logp_difference/max": 3.631439447402954, "sampling/sampling_logp_difference/mean": 1.314673662185669, "step": 717, "step_time": 8.03115327800333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.552995324134827, "epoch": 0.00718, "grad_norm": 0.00667697936296463, "kl": 0.4651959342882037, "learning_rate": 9.99978474282475e-06, "loss": -0.0059, "step": 718, "step_time": 4.8378227460016205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1468.0, "completions/max_terminated_length": 1468.0, "completions/mean_length": 255.78125, "completions/mean_terminated_length": 263.51611328125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.843188405036926, "epoch": 0.00719, "frac_reward_zero_std": 0.0, "grad_norm": 0.014971107244491577, "kl": 0.52851976826787, "learning_rate": 9.999784111115302e-06, "loss": -0.0187, "num_tokens": 16371786.0, "reward": 1.0824146270751953, "reward_std": 1.5661665201187134, "rewards/rollout_reward_func/mean": 1.0824146270751953, "rewards/rollout_reward_func/std": 1.5802305936813354, "sampling/importance_sampling_ratio/max": 0.5574051141738892, "sampling/importance_sampling_ratio/mean": 0.2839767336845398, "sampling/importance_sampling_ratio/min": 4.7277861625616424e-08, "sampling/sampling_logp_difference/max": 5.0772600173950195, "sampling/sampling_logp_difference/mean": 0.9365389347076416, "step": 719, "step_time": 10.467455469006381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.830079883337021, "epoch": 0.0072, "grad_norm": 0.012826910242438316, "kl": 0.5296193920075893, "learning_rate": 9.99978347848031e-06, "loss": -0.0188, "step": 720, "step_time": 5.839479105990904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1605.0, "completions/max_terminated_length": 1605.0, "completions/mean_length": 552.28125, "completions/mean_terminated_length": 540.6451416015625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.940600752830505, "epoch": 0.00721, "frac_reward_zero_std": 0.0, "grad_norm": 0.05781747028231621, "kl": 0.2630590908229351, "learning_rate": 9.99978284491977e-06, "loss": -0.0107, "num_tokens": 16425523.0, "reward": 0.4775323271751404, "reward_std": 1.4104171991348267, "rewards/rollout_reward_func/mean": 0.4775323271751404, "rewards/rollout_reward_func/std": 1.4815810918807983, "sampling/importance_sampling_ratio/max": 0.5477714538574219, "sampling/importance_sampling_ratio/mean": 0.09023408591747284, "sampling/importance_sampling_ratio/min": 1.6315719095392968e-15, "sampling/sampling_logp_difference/max": 8.820098876953125, "sampling/sampling_logp_difference/mean": 1.397096037864685, "step": 721, "step_time": 11.316610677000426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 6.922614872455597, "epoch": 0.00722, "grad_norm": 0.05889948084950447, "kl": 0.2590419966727495, "learning_rate": 9.999782210433683e-06, "loss": -0.011, "step": 722, "step_time": 6.385607060994516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1063.0, "completions/max_terminated_length": 1063.0, "completions/mean_length": 420.25, "completions/mean_terminated_length": 420.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.282242089509964, "epoch": 0.00723, "frac_reward_zero_std": 0.25, "grad_norm": 0.016428593546152115, "kl": 0.28778629191219807, "learning_rate": 9.999781575022053e-06, "loss": -0.0067, "num_tokens": 16473348.0, "reward": 1.3238818645477295, "reward_std": 1.2036527395248413, "rewards/rollout_reward_func/mean": 1.3238818645477295, "rewards/rollout_reward_func/std": 1.4723718166351318, "sampling/importance_sampling_ratio/max": 0.5550784468650818, "sampling/importance_sampling_ratio/mean": 0.1690264344215393, "sampling/importance_sampling_ratio/min": 7.66658792006325e-10, "sampling/sampling_logp_difference/max": 3.8970251083374023, "sampling/sampling_logp_difference/mean": 1.213907241821289, "step": 723, "step_time": 9.822411039000144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.275585949420929, "epoch": 0.00724, "grad_norm": 0.0158408060669899, "kl": 0.28590389527380466, "learning_rate": 9.999780938684877e-06, "loss": -0.0067, "step": 724, "step_time": 5.138187218999519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1401.0, "completions/max_terminated_length": 1401.0, "completions/mean_length": 344.5625, "completions/mean_terminated_length": 355.1612854003906, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.819717735052109, "epoch": 0.00725, "frac_reward_zero_std": 0.0, "grad_norm": 0.05512489378452301, "kl": 0.5377298817038536, "learning_rate": 9.999780301422157e-06, "loss": -0.0192, "num_tokens": 16518690.0, "reward": 0.8530784249305725, "reward_std": 1.5520100593566895, "rewards/rollout_reward_func/mean": 0.8530784249305725, "rewards/rollout_reward_func/std": 1.5652341842651367, "sampling/importance_sampling_ratio/max": 0.5511311292648315, "sampling/importance_sampling_ratio/mean": 0.21934789419174194, "sampling/importance_sampling_ratio/min": 2.67091252226237e-07, "sampling/sampling_logp_difference/max": 3.162621259689331, "sampling/sampling_logp_difference/mean": 0.7914282083511353, "step": 725, "step_time": 10.067892783994466 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 4.8237215876579285, "epoch": 0.00726, "grad_norm": 0.05466919392347336, "kl": 0.5210190042853355, "learning_rate": 9.99977966323389e-06, "loss": -0.0193, "step": 726, "step_time": 5.703163378992031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1435.0, "completions/max_terminated_length": 1435.0, "completions/mean_length": 444.875, "completions/mean_terminated_length": 458.70965576171875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.8204465508461, "epoch": 0.00727, "frac_reward_zero_std": 0.0, "grad_norm": 0.04971720278263092, "kl": 0.29156025499105453, "learning_rate": 9.99977902412008e-06, "loss": -0.0096, "num_tokens": 16568830.0, "reward": 0.720226526260376, "reward_std": 1.202056884765625, "rewards/rollout_reward_func/mean": 0.720226526260376, "rewards/rollout_reward_func/std": 1.6404314041137695, "sampling/importance_sampling_ratio/max": 0.5367441177368164, "sampling/importance_sampling_ratio/mean": 0.09118979424238205, "sampling/importance_sampling_ratio/min": 2.335330008800679e-13, "sampling/sampling_logp_difference/max": 4.72615909576416, "sampling/sampling_logp_difference/mean": 1.3242747783660889, "step": 727, "step_time": 10.282626684991556 }, { "clip_ratio/high_max": 0.004807692486792803, "clip_ratio/high_mean": 0.0024038462433964014, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024038462433964014, "entropy": 6.80884724855423, "epoch": 0.00728, "grad_norm": 0.04859205707907677, "kl": 0.2924814932048321, "learning_rate": 9.999778384080722e-06, "loss": -0.0096, "step": 728, "step_time": 6.18644273800237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 222.90625, "completions/mean_terminated_length": 222.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.677052795886993, "epoch": 0.00729, "frac_reward_zero_std": 0.25, "grad_norm": 0.08810516446828842, "kl": 0.5605696709826589, "learning_rate": 9.999777743115822e-06, "loss": -0.0182, "num_tokens": 16608963.0, "reward": 0.8930941820144653, "reward_std": 1.3116999864578247, "rewards/rollout_reward_func/mean": 0.8930941820144653, "rewards/rollout_reward_func/std": 1.6286178827285767, "sampling/importance_sampling_ratio/max": 0.5544478297233582, "sampling/importance_sampling_ratio/mean": 0.24048829078674316, "sampling/importance_sampling_ratio/min": 1.742708555241279e-08, "sampling/sampling_logp_difference/max": 3.972442865371704, "sampling/sampling_logp_difference/mean": 1.1449846029281616, "step": 729, "step_time": 8.420452239988663 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.004166666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.025000001303851604, "entropy": 5.66945007443428, "epoch": 0.0073, "grad_norm": 0.0732661783695221, "kl": 0.5583825428038836, "learning_rate": 9.999777101225378e-06, "loss": -0.0184, "step": 730, "step_time": 4.420467649004422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "completions/clipped_ratio": 0.0, "completions/max_length": 1298.0, "completions/max_terminated_length": 1298.0, "completions/mean_length": 419.5625, "completions/mean_terminated_length": 419.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.70118921995163, "epoch": 0.00731, "frac_reward_zero_std": 0.0, "grad_norm": 0.02638891153037548, "kl": 0.25725139770656824, "learning_rate": 9.999776458409387e-06, "loss": -0.0045, "num_tokens": 16659025.0, "reward": 0.19443246722221375, "reward_std": 1.5462838411331177, "rewards/rollout_reward_func/mean": 0.19443246722221375, "rewards/rollout_reward_func/std": 1.7192109823226929, "sampling/importance_sampling_ratio/max": 0.5478160977363586, "sampling/importance_sampling_ratio/mean": 0.12275257706642151, "sampling/importance_sampling_ratio/min": 1.9428126756352208e-19, "sampling/sampling_logp_difference/max": 10.060467720031738, "sampling/sampling_logp_difference/mean": 1.5291041135787964, "step": 731, "step_time": 10.401141305999772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008522727526724339, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008522727526724339, "entropy": 6.686111748218536, "epoch": 0.00732, "grad_norm": 0.025363191962242126, "kl": 0.2540538068860769, "learning_rate": 9.999775814667854e-06, "loss": -0.0046, "step": 732, "step_time": 5.481836792998365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1635.0, "completions/max_terminated_length": 1635.0, "completions/mean_length": 576.9375, "completions/mean_terminated_length": 576.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.499359607696533, "epoch": 0.00733, "frac_reward_zero_std": 0.0, "grad_norm": 0.1131906807422638, "kl": 0.47800880670547485, "learning_rate": 9.999775170000777e-06, "loss": -0.0131, "num_tokens": 16713599.0, "reward": 0.8664865493774414, "reward_std": 1.301206350326538, "rewards/rollout_reward_func/mean": 0.8664865493774414, "rewards/rollout_reward_func/std": 1.4110556840896606, "sampling/importance_sampling_ratio/max": 0.4321137070655823, "sampling/importance_sampling_ratio/mean": 0.1577906310558319, "sampling/importance_sampling_ratio/min": 2.0389524319910322e-16, "sampling/sampling_logp_difference/max": 11.264769554138184, "sampling/sampling_logp_difference/mean": 1.1084330081939697, "step": 733, "step_time": 11.350280099999509 }, { "clip_ratio/high_max": 0.08380681835114956, "clip_ratio/high_mean": 0.04190340917557478, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04190340917557478, "entropy": 5.338380664587021, "epoch": 0.00734, "grad_norm": 0.04250817000865936, "kl": 0.49062924832105637, "learning_rate": 9.999774524408155e-06, "loss": -0.0136, "step": 734, "step_time": 6.724595803985721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1456.0, "completions/max_terminated_length": 1456.0, "completions/mean_length": 607.53125, "completions/mean_terminated_length": 607.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.349926292896271, "epoch": 0.00735, "frac_reward_zero_std": 0.0, "grad_norm": 0.11584233492612839, "kl": 0.4382718876004219, "learning_rate": 9.99977387788999e-06, "loss": -0.0062, "num_tokens": 16770736.0, "reward": 1.2419533729553223, "reward_std": 1.6848053932189941, "rewards/rollout_reward_func/mean": 1.2419533729553223, "rewards/rollout_reward_func/std": 1.631235122680664, "sampling/importance_sampling_ratio/max": 0.6374194025993347, "sampling/importance_sampling_ratio/mean": 0.18338212370872498, "sampling/importance_sampling_ratio/min": 4.6983973334135953e-07, "sampling/sampling_logp_difference/max": 3.537966012954712, "sampling/sampling_logp_difference/mean": 0.6636948585510254, "step": 735, "step_time": 11.408889367001393 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033333334140479565, "entropy": 4.3272839188575745, "epoch": 0.00736, "grad_norm": 0.03146413341164589, "kl": 0.45181431621313095, "learning_rate": 9.99977323044628e-06, "loss": -0.0065, "step": 736, "step_time": 5.922764864000783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1586.0, "completions/max_terminated_length": 1586.0, "completions/mean_length": 630.46875, "completions/mean_terminated_length": 630.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.452367275953293, "epoch": 0.00737, "frac_reward_zero_std": 0.0, "grad_norm": 0.0854644626379013, "kl": 0.32492589578032494, "learning_rate": 9.99977258207703e-06, "loss": -0.0058, "num_tokens": 16828071.0, "reward": 1.1897318363189697, "reward_std": 1.4362249374389648, "rewards/rollout_reward_func/mean": 1.1897318363189697, "rewards/rollout_reward_func/std": 1.4229001998901367, "sampling/importance_sampling_ratio/max": 0.6252310872077942, "sampling/importance_sampling_ratio/mean": 0.11759937554597855, "sampling/importance_sampling_ratio/min": 4.5185459540686423e-11, "sampling/sampling_logp_difference/max": 9.698439598083496, "sampling/sampling_logp_difference/mean": 1.0492420196533203, "step": 737, "step_time": 11.451465808997455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.016666667070239782, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016666667070239782, "entropy": 5.441973656415939, "epoch": 0.00738, "grad_norm": 0.0196114182472229, "kl": 0.33275438472628593, "learning_rate": 9.999771932782234e-06, "loss": -0.0061, "step": 738, "step_time": 6.181127589996322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 284.625, "completions/mean_terminated_length": 284.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.490635991096497, "epoch": 0.00739, "frac_reward_zero_std": 0.25, "grad_norm": 0.04870762303471565, "kl": 0.6051317863166332, "learning_rate": 9.999771282561895e-06, "loss": -0.0083, "num_tokens": 16872837.0, "reward": 0.6226310729980469, "reward_std": 1.135196566581726, "rewards/rollout_reward_func/mean": 0.6226310729980469, "rewards/rollout_reward_func/std": 1.6166706085205078, "sampling/importance_sampling_ratio/max": 0.5491927266120911, "sampling/importance_sampling_ratio/mean": 0.2681816518306732, "sampling/importance_sampling_ratio/min": 4.380841023277071e-09, "sampling/sampling_logp_difference/max": 11.416885375976562, "sampling/sampling_logp_difference/mean": 0.8853177428245544, "step": 739, "step_time": 8.42891403599424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.481172412633896, "epoch": 0.0074, "grad_norm": 0.05560614913702011, "kl": 0.6025199890136719, "learning_rate": 9.999770631416015e-06, "loss": -0.0083, "step": 740, "step_time": 4.411059766996914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 1145.0, "completions/max_terminated_length": 1145.0, "completions/mean_length": 320.9375, "completions/mean_terminated_length": 320.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.547651559114456, "epoch": 0.00741, "frac_reward_zero_std": 0.25, "grad_norm": 0.012282593175768852, "kl": 0.40375559497624636, "learning_rate": 9.99976997934459e-06, "loss": -0.0107, "num_tokens": 16917624.0, "reward": 0.8785643577575684, "reward_std": 0.9428077936172485, "rewards/rollout_reward_func/mean": 0.8785643577575684, "rewards/rollout_reward_func/std": 1.3916940689086914, "sampling/importance_sampling_ratio/max": 0.5545879602432251, "sampling/importance_sampling_ratio/mean": 0.22693884372711182, "sampling/importance_sampling_ratio/min": 2.0049554223078303e-05, "sampling/sampling_logp_difference/max": 2.4866890907287598, "sampling/sampling_logp_difference/mean": 0.9840067625045776, "step": 741, "step_time": 10.090874073997838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.548500955104828, "epoch": 0.00742, "grad_norm": 0.014945538714528084, "kl": 0.4047854170203209, "learning_rate": 9.999769326347624e-06, "loss": -0.0107, "step": 742, "step_time": 5.307178874994861 }, { "clip_ratio/high_max": 0.003289473708719015, "clip_ratio/high_mean": 0.0016447368543595076, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016447368543595076, "completions/clipped_ratio": 0.09375, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 250.53125, "completions/mean_terminated_length": 236.0689697265625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.364381343126297, "epoch": 0.00743, "frac_reward_zero_std": 0.0, "grad_norm": 0.06708469241857529, "kl": 0.36966467183083296, "learning_rate": 9.999768672425116e-06, "loss": -0.0149, "num_tokens": 16961861.0, "reward": 0.4745810925960541, "reward_std": 1.2339882850646973, "rewards/rollout_reward_func/mean": 0.4745810925960541, "rewards/rollout_reward_func/std": 1.6538277864456177, "sampling/importance_sampling_ratio/max": 0.6459321975708008, "sampling/importance_sampling_ratio/mean": 0.20714028179645538, "sampling/importance_sampling_ratio/min": 2.5881554183344348e-14, "sampling/sampling_logp_difference/max": 4.8671064376831055, "sampling/sampling_logp_difference/mean": 1.0460864305496216, "step": 743, "step_time": 8.644163553006365 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01065340917557478, "entropy": 5.377569019794464, "epoch": 0.00744, "grad_norm": 0.05340445041656494, "kl": 0.36642452888190746, "learning_rate": 9.999768017577065e-06, "loss": -0.0151, "step": 744, "step_time": 4.702252154991584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 1310.0, "completions/max_terminated_length": 1310.0, "completions/mean_length": 532.1875, "completions/mean_terminated_length": 532.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.752238512039185, "epoch": 0.00745, "frac_reward_zero_std": 0.0, "grad_norm": 0.026666833087801933, "kl": 0.4163013622164726, "learning_rate": 9.999767361803471e-06, "loss": -0.0223, "num_tokens": 17015925.0, "reward": 0.5920022130012512, "reward_std": 1.3275200128555298, "rewards/rollout_reward_func/mean": 0.5920022130012512, "rewards/rollout_reward_func/std": 1.5046073198318481, "sampling/importance_sampling_ratio/max": 0.3277439475059509, "sampling/importance_sampling_ratio/mean": 0.11053667217493057, "sampling/importance_sampling_ratio/min": 1.104584759039992e-20, "sampling/sampling_logp_difference/max": 13.490654945373535, "sampling/sampling_logp_difference/mean": 1.2810571193695068, "step": 745, "step_time": 10.764528711999446 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 5.776028752326965, "epoch": 0.00746, "grad_norm": 0.026282131671905518, "kl": 0.4213589318096638, "learning_rate": 9.999766705104336e-06, "loss": -0.0224, "step": 746, "step_time": 5.552706374000991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1036.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 376.46875, "completions/mean_terminated_length": 376.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.3331382274627686, "epoch": 0.00747, "frac_reward_zero_std": 0.25, "grad_norm": 0.032012201845645905, "kl": 0.3756320867687464, "learning_rate": 9.999766047479658e-06, "loss": -0.005, "num_tokens": 17061755.0, "reward": 1.3868427276611328, "reward_std": 1.1664371490478516, "rewards/rollout_reward_func/mean": 1.3868427276611328, "rewards/rollout_reward_func/std": 1.3811391592025757, "sampling/importance_sampling_ratio/max": 0.5559043884277344, "sampling/importance_sampling_ratio/mean": 0.2243882715702057, "sampling/importance_sampling_ratio/min": 9.293415132560767e-06, "sampling/sampling_logp_difference/max": 2.601003885269165, "sampling/sampling_logp_difference/mean": 0.989558756351471, "step": 747, "step_time": 9.408571062998817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.3454954624176025, "epoch": 0.00748, "grad_norm": 0.03362347558140755, "kl": 0.3777758963406086, "learning_rate": 9.99976538892944e-06, "loss": -0.0051, "step": 748, "step_time": 4.979120154006523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1045.0, "completions/max_terminated_length": 1045.0, "completions/mean_length": 377.40625, "completions/mean_terminated_length": 377.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.634487092494965, "epoch": 0.00749, "frac_reward_zero_std": 0.25, "grad_norm": 0.05620869994163513, "kl": 0.47020887583494186, "learning_rate": 9.99976472945368e-06, "loss": -0.0104, "num_tokens": 17108537.0, "reward": 1.490056037902832, "reward_std": 1.180042028427124, "rewards/rollout_reward_func/mean": 1.490056037902832, "rewards/rollout_reward_func/std": 1.4739887714385986, "sampling/importance_sampling_ratio/max": 0.5562160611152649, "sampling/importance_sampling_ratio/mean": 0.20296847820281982, "sampling/importance_sampling_ratio/min": 3.5060097186567774e-11, "sampling/sampling_logp_difference/max": 5.096293926239014, "sampling/sampling_logp_difference/mean": 1.0258835554122925, "step": 749, "step_time": 9.213720686002489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.018229166977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018229166977107525, "entropy": 5.655919671058655, "epoch": 0.0075, "grad_norm": 0.016738997772336006, "kl": 0.46769271325320005, "learning_rate": 9.999764069052378e-06, "loss": -0.0105, "step": 750, "step_time": 5.05311055799757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 211.84375, "completions/mean_terminated_length": 211.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.934899091720581, "epoch": 0.00751, "frac_reward_zero_std": 0.25, "grad_norm": 0.09253121167421341, "kl": 0.7259721867740154, "learning_rate": 9.999763407725536e-06, "loss": -0.0083, "num_tokens": 17149696.0, "reward": 1.4824676513671875, "reward_std": 1.0882136821746826, "rewards/rollout_reward_func/mean": 1.4824676513671875, "rewards/rollout_reward_func/std": 1.3299038410186768, "sampling/importance_sampling_ratio/max": 0.5585011839866638, "sampling/importance_sampling_ratio/mean": 0.29383134841918945, "sampling/importance_sampling_ratio/min": 0.0004024530644528568, "sampling/sampling_logp_difference/max": 2.4179985523223877, "sampling/sampling_logp_difference/mean": 0.8336355090141296, "step": 751, "step_time": 7.6823254249975435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.9213783740997314, "epoch": 0.00752, "grad_norm": 0.08793601393699646, "kl": 0.7352516651153564, "learning_rate": 9.999762745473153e-06, "loss": -0.0083, "step": 752, "step_time": 3.9104702710028505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1475.0, "completions/max_terminated_length": 1475.0, "completions/mean_length": 462.4375, "completions/mean_terminated_length": 462.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.575632244348526, "epoch": 0.00753, "frac_reward_zero_std": 0.0, "grad_norm": 0.023579591885209084, "kl": 0.5184198133647442, "learning_rate": 9.999762082295227e-06, "loss": -0.0066, "num_tokens": 17200249.0, "reward": 0.6151323318481445, "reward_std": 1.6501471996307373, "rewards/rollout_reward_func/mean": 0.6151323318481445, "rewards/rollout_reward_func/std": 1.7069603204727173, "sampling/importance_sampling_ratio/max": 0.5422062277793884, "sampling/importance_sampling_ratio/mean": 0.16804806888103485, "sampling/importance_sampling_ratio/min": 1.8083127347088425e-33, "sampling/sampling_logp_difference/max": 4.937162399291992, "sampling/sampling_logp_difference/mean": 1.0433917045593262, "step": 753, "step_time": 11.12292745800005 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 5.575984179973602, "epoch": 0.00754, "grad_norm": 0.023603329434990883, "kl": 0.5200556702911854, "learning_rate": 9.999761418191762e-06, "loss": -0.0067, "step": 754, "step_time": 5.849827272002585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1604.0, "completions/max_terminated_length": 1604.0, "completions/mean_length": 616.40625, "completions/mean_terminated_length": 622.1612548828125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.909896612167358, "epoch": 0.00755, "frac_reward_zero_std": 0.0, "grad_norm": 0.026348376646637917, "kl": 0.2918439363129437, "learning_rate": 9.999760753162758e-06, "loss": -0.0127, "num_tokens": 17256968.0, "reward": 0.6324486136436462, "reward_std": 1.4565963745117188, "rewards/rollout_reward_func/mean": 0.6324486136436462, "rewards/rollout_reward_func/std": 1.6143876314163208, "sampling/importance_sampling_ratio/max": 0.3205808997154236, "sampling/importance_sampling_ratio/mean": 0.046445585787296295, "sampling/importance_sampling_ratio/min": 5.102025864553188e-17, "sampling/sampling_logp_difference/max": 4.492278099060059, "sampling/sampling_logp_difference/mean": 1.2819854021072388, "step": 755, "step_time": 11.24851971899625 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01065340917557478, "entropy": 6.90344762802124, "epoch": 0.00756, "grad_norm": 0.026462063193321228, "kl": 0.2898443997837603, "learning_rate": 9.999760087208213e-06, "loss": -0.0128, "step": 756, "step_time": 6.547971217005397 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 1096.0, "completions/max_terminated_length": 1096.0, "completions/mean_length": 435.75, "completions/mean_terminated_length": 435.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.910741209983826, "epoch": 0.00757, "frac_reward_zero_std": 0.0, "grad_norm": 0.03617434203624725, "kl": 0.33797006495296955, "learning_rate": 9.999759420328126e-06, "loss": -0.0199, "num_tokens": 17307742.0, "reward": 1.3101301193237305, "reward_std": 1.6363023519515991, "rewards/rollout_reward_func/mean": 1.3101301193237305, "rewards/rollout_reward_func/std": 1.594202995300293, "sampling/importance_sampling_ratio/max": 0.32840174436569214, "sampling/importance_sampling_ratio/mean": 0.08269453048706055, "sampling/importance_sampling_ratio/min": 5.27039485391096e-23, "sampling/sampling_logp_difference/max": 11.495254516601562, "sampling/sampling_logp_difference/mean": 1.4666507244110107, "step": 757, "step_time": 9.395647186003771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.917143702507019, "epoch": 0.00758, "grad_norm": 0.05352688953280449, "kl": 0.33655720949172974, "learning_rate": 9.999758752522502e-06, "loss": -0.0198, "step": 758, "step_time": 5.592795707001642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 241.59375, "completions/mean_terminated_length": 241.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.078848421573639, "epoch": 0.00759, "frac_reward_zero_std": 0.0, "grad_norm": 0.026525791734457016, "kl": 0.7781529761850834, "learning_rate": 9.999758083791337e-06, "loss": -0.0117, "num_tokens": 17349718.0, "reward": 0.2243773490190506, "reward_std": 1.1315746307373047, "rewards/rollout_reward_func/mean": 0.2243773490190506, "rewards/rollout_reward_func/std": 1.5225774049758911, "sampling/importance_sampling_ratio/max": 0.5426336526870728, "sampling/importance_sampling_ratio/mean": 0.25547125935554504, "sampling/importance_sampling_ratio/min": 2.22670504257394e-09, "sampling/sampling_logp_difference/max": 2.989426851272583, "sampling/sampling_logp_difference/mean": 0.9927998781204224, "step": 759, "step_time": 7.25532362800368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.082860946655273, "epoch": 0.0076, "grad_norm": 0.023983974009752274, "kl": 0.7841481082141399, "learning_rate": 9.999757414134631e-06, "loss": -0.0117, "step": 760, "step_time": 3.994886967000639 }, { "clip_ratio/high_max": 0.00657894741743803, "clip_ratio/high_mean": 0.003289473708719015, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009539473801851273, "completions/clipped_ratio": 0.03125, "completions/max_length": 1201.0, "completions/max_terminated_length": 1201.0, "completions/mean_length": 536.15625, "completions/mean_terminated_length": 552.9354858398438, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.588240265846252, "epoch": 0.00761, "frac_reward_zero_std": 0.0, "grad_norm": 0.018939951434731483, "kl": 0.40575031377375126, "learning_rate": 9.999756743552387e-06, "loss": -0.0187, "num_tokens": 17402940.0, "reward": 1.277022123336792, "reward_std": 1.5545145273208618, "rewards/rollout_reward_func/mean": 1.277022123336792, "rewards/rollout_reward_func/std": 1.6377042531967163, "sampling/importance_sampling_ratio/max": 0.5533004403114319, "sampling/importance_sampling_ratio/mean": 0.17111057043075562, "sampling/importance_sampling_ratio/min": 6.98455959735611e-09, "sampling/sampling_logp_difference/max": 4.78085994720459, "sampling/sampling_logp_difference/mean": 1.0223751068115234, "step": 761, "step_time": 9.989716784006305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009539473801851273, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009539473801851273, "entropy": 5.612524479627609, "epoch": 0.00762, "grad_norm": 0.020610690116882324, "kl": 0.40202651359140873, "learning_rate": 9.999756072044602e-06, "loss": -0.0187, "step": 762, "step_time": 5.829031949000637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 976.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 279.84375, "completions/mean_terminated_length": 278.2257995605469, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.697990268468857, "epoch": 0.00763, "frac_reward_zero_std": 0.25, "grad_norm": 0.040917012840509415, "kl": 0.6448713392019272, "learning_rate": 9.99975539961128e-06, "loss": -0.0087, "num_tokens": 17447372.0, "reward": 1.2393696308135986, "reward_std": 0.7434722185134888, "rewards/rollout_reward_func/mean": 1.2393696308135986, "rewards/rollout_reward_func/std": 1.368448257446289, "sampling/importance_sampling_ratio/max": 0.5511399507522583, "sampling/importance_sampling_ratio/mean": 0.25340527296066284, "sampling/importance_sampling_ratio/min": 5.812996750620414e-10, "sampling/sampling_logp_difference/max": 3.5506749153137207, "sampling/sampling_logp_difference/mean": 0.8788272738456726, "step": 763, "step_time": 8.941345933995763 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 4.695697337388992, "epoch": 0.00764, "grad_norm": 0.03205671161413193, "kl": 0.6446069180965424, "learning_rate": 9.999754726252418e-06, "loss": -0.0088, "step": 764, "step_time": 5.266124393001519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 174.34375, "completions/mean_terminated_length": 174.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.044273793697357, "epoch": 0.00765, "frac_reward_zero_std": 0.25, "grad_norm": 0.039656102657318115, "kl": 0.5370845757424831, "learning_rate": 9.999754051968017e-06, "loss": -0.0079, "num_tokens": 17486973.0, "reward": 1.3180228471755981, "reward_std": 1.1697583198547363, "rewards/rollout_reward_func/mean": 1.3180228471755981, "rewards/rollout_reward_func/std": 1.514497995376587, "sampling/importance_sampling_ratio/max": 0.5570627450942993, "sampling/importance_sampling_ratio/mean": 0.291839063167572, "sampling/importance_sampling_ratio/min": 0.00034834592952392995, "sampling/sampling_logp_difference/max": 2.827042818069458, "sampling/sampling_logp_difference/mean": 0.8985918760299683, "step": 765, "step_time": 7.0855432439930155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 5.031380593776703, "epoch": 0.00766, "grad_norm": 0.03139194846153259, "kl": 0.5402332805097103, "learning_rate": 9.999753376758078e-06, "loss": -0.008, "step": 766, "step_time": 3.888866520992451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 206.625, "completions/mean_terminated_length": 212.77418518066406, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.529975593090057, "epoch": 0.00767, "frac_reward_zero_std": 0.25, "grad_norm": 0.00711762486025691, "kl": 0.7611786164343357, "learning_rate": 9.9997527006226e-06, "loss": -0.0145, "num_tokens": 17526453.0, "reward": 1.173091173171997, "reward_std": 0.7719895839691162, "rewards/rollout_reward_func/mean": 1.173091173171997, "rewards/rollout_reward_func/std": 1.17051100730896, "sampling/importance_sampling_ratio/max": 0.558090329170227, "sampling/importance_sampling_ratio/mean": 0.31916719675064087, "sampling/importance_sampling_ratio/min": 3.29084144823355e-07, "sampling/sampling_logp_difference/max": 5.101071357727051, "sampling/sampling_logp_difference/mean": 0.8929594159126282, "step": 767, "step_time": 7.103158813992195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004166666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "entropy": 4.519268870353699, "epoch": 0.00768, "grad_norm": 0.006663104984909296, "kl": 0.7620422057807446, "learning_rate": 9.999752023561584e-06, "loss": -0.0145, "step": 768, "step_time": 4.460855605000688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 748.0, "completions/max_terminated_length": 748.0, "completions/mean_length": 268.625, "completions/mean_terminated_length": 264.7742004394531, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.164897263050079, "epoch": 0.00769, "frac_reward_zero_std": 0.25, "grad_norm": 0.030413055792450905, "kl": 0.5918042212724686, "learning_rate": 9.999751345575029e-06, "loss": -0.0154, "num_tokens": 17570875.0, "reward": 0.41086918115615845, "reward_std": 1.1066244840621948, "rewards/rollout_reward_func/mean": 0.41086918115615845, "rewards/rollout_reward_func/std": 1.4596521854400635, "sampling/importance_sampling_ratio/max": 0.5794389843940735, "sampling/importance_sampling_ratio/mean": 0.21135905385017395, "sampling/importance_sampling_ratio/min": 1.3789672029102776e-15, "sampling/sampling_logp_difference/max": 3.955777168273926, "sampling/sampling_logp_difference/mean": 1.2578108310699463, "step": 769, "step_time": 8.326857875996211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.161556035280228, "epoch": 0.0077, "grad_norm": 0.030549656599760056, "kl": 0.5938818082213402, "learning_rate": 9.999750666662938e-06, "loss": -0.0154, "step": 770, "step_time": 4.854843978002464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 267.46875, "completions/mean_terminated_length": 267.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.331241965293884, "epoch": 0.00771, "frac_reward_zero_std": 0.0, "grad_norm": 0.029157863929867744, "kl": 0.7495228722691536, "learning_rate": 9.999749986825307e-06, "loss": -0.0167, "num_tokens": 17613863.0, "reward": 1.0042529106140137, "reward_std": 1.3105833530426025, "rewards/rollout_reward_func/mean": 1.0042529106140137, "rewards/rollout_reward_func/std": 1.691234827041626, "sampling/importance_sampling_ratio/max": 0.5603994727134705, "sampling/importance_sampling_ratio/mean": 0.30306878685951233, "sampling/importance_sampling_ratio/min": 3.572447155875125e-07, "sampling/sampling_logp_difference/max": 5.252004623413086, "sampling/sampling_logp_difference/mean": 0.7703447341918945, "step": 771, "step_time": 8.774070241001027 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 4.290276765823364, "epoch": 0.00772, "grad_norm": 0.014897690154612064, "kl": 0.7585486769676208, "learning_rate": 9.999749306062141e-06, "loss": -0.0168, "step": 772, "step_time": 4.910475027998473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 237.09375, "completions/mean_terminated_length": 237.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.747214823961258, "epoch": 0.00773, "frac_reward_zero_std": 0.0, "grad_norm": 0.03810819983482361, "kl": 0.7301519438624382, "learning_rate": 9.999748624373435e-06, "loss": -0.0146, "num_tokens": 17654681.0, "reward": 1.3355209827423096, "reward_std": 1.4799165725708008, "rewards/rollout_reward_func/mean": 1.3355209827423096, "rewards/rollout_reward_func/std": 1.445932149887085, "sampling/importance_sampling_ratio/max": 0.5563107132911682, "sampling/importance_sampling_ratio/mean": 0.28612789511680603, "sampling/importance_sampling_ratio/min": 1.3284660482781874e-08, "sampling/sampling_logp_difference/max": 4.878093719482422, "sampling/sampling_logp_difference/mean": 0.9206214547157288, "step": 773, "step_time": 8.90247534300579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 4.744492799043655, "epoch": 0.00774, "grad_norm": 0.039089832454919815, "kl": 0.7296681143343449, "learning_rate": 9.999747941759192e-06, "loss": -0.0146, "step": 774, "step_time": 5.204999350000435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 106.28125, "completions/mean_terminated_length": 106.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.2078332006931305, "epoch": 0.00775, "frac_reward_zero_std": 0.25, "grad_norm": 0.07999315857887268, "kl": 0.8435688093304634, "learning_rate": 9.999747258219414e-06, "loss": -0.0195, "num_tokens": 17690634.0, "reward": 0.721333920955658, "reward_std": 0.9274333715438843, "rewards/rollout_reward_func/mean": 0.721333920955658, "rewards/rollout_reward_func/std": 1.6238698959350586, "sampling/importance_sampling_ratio/max": 0.7511751055717468, "sampling/importance_sampling_ratio/mean": 0.37947380542755127, "sampling/importance_sampling_ratio/min": 2.6935504138236865e-07, "sampling/sampling_logp_difference/max": 4.024227142333984, "sampling/sampling_logp_difference/mean": 0.76801598072052, "step": 775, "step_time": 6.886903683993296 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026041666977107525, "entropy": 4.209187030792236, "epoch": 0.00776, "grad_norm": 0.035422373563051224, "kl": 0.8483177796006203, "learning_rate": 9.999746573754097e-06, "loss": -0.02, "step": 776, "step_time": 4.31634176100124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 117.84375, "completions/mean_terminated_length": 117.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.743181824684143, "epoch": 0.00777, "frac_reward_zero_std": 0.5, "grad_norm": 0.04321707412600517, "kl": 0.6118834614753723, "learning_rate": 9.999745888363244e-06, "loss": -0.0068, "num_tokens": 17728004.0, "reward": 1.606997013092041, "reward_std": 0.6785095930099487, "rewards/rollout_reward_func/mean": 1.606997013092041, "rewards/rollout_reward_func/std": 0.9840222597122192, "sampling/importance_sampling_ratio/max": 0.5604978799819946, "sampling/importance_sampling_ratio/mean": 0.41263991594314575, "sampling/importance_sampling_ratio/min": 0.0003459248982835561, "sampling/sampling_logp_difference/max": 3.006181240081787, "sampling/sampling_logp_difference/mean": 0.6178587079048157, "step": 777, "step_time": 6.6834774340022705 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 3.7547148168087006, "epoch": 0.00778, "grad_norm": 0.022584987804293633, "kl": 0.614430271089077, "learning_rate": 9.999745202046853e-06, "loss": -0.0069, "step": 778, "step_time": 3.7560465740061773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1108.0, "completions/max_terminated_length": 1108.0, "completions/mean_length": 378.875, "completions/mean_terminated_length": 378.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.088224291801453, "epoch": 0.00779, "frac_reward_zero_std": 0.0, "grad_norm": 0.08513360470533371, "kl": 0.6235489994287491, "learning_rate": 9.999744514804925e-06, "loss": -0.0073, "num_tokens": 17776862.0, "reward": 0.8276459574699402, "reward_std": 1.766157865524292, "rewards/rollout_reward_func/mean": 0.8276459574699402, "rewards/rollout_reward_func/std": 1.8020845651626587, "sampling/importance_sampling_ratio/max": 0.5392987132072449, "sampling/importance_sampling_ratio/mean": 0.17483018338680267, "sampling/importance_sampling_ratio/min": 6.877268333482789e-06, "sampling/sampling_logp_difference/max": 2.8763630390167236, "sampling/sampling_logp_difference/mean": 0.8727230429649353, "step": 779, "step_time": 9.816498252006568 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 5.131264925003052, "epoch": 0.0078, "grad_norm": 0.10892699658870697, "kl": 0.6250488944351673, "learning_rate": 9.999743826637464e-06, "loss": -0.0077, "step": 780, "step_time": 5.175737560995913 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.0, "completions/max_length": 1103.0, "completions/max_terminated_length": 1103.0, "completions/mean_length": 307.75, "completions/mean_terminated_length": 307.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.233185142278671, "epoch": 0.00781, "frac_reward_zero_std": 0.25, "grad_norm": 0.005553718190640211, "kl": 0.5123361237347126, "learning_rate": 9.999743137544465e-06, "loss": -0.0122, "num_tokens": 17822803.0, "reward": 1.3505842685699463, "reward_std": 1.1903436183929443, "rewards/rollout_reward_func/mean": 1.3505842685699463, "rewards/rollout_reward_func/std": 1.6066209077835083, "sampling/importance_sampling_ratio/max": 0.5551131963729858, "sampling/importance_sampling_ratio/mean": 0.2236855924129486, "sampling/importance_sampling_ratio/min": 1.698728374321945e-05, "sampling/sampling_logp_difference/max": 4.168777942657471, "sampling/sampling_logp_difference/mean": 0.9214679002761841, "step": 781, "step_time": 9.516636459997244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.2738178968429565, "epoch": 0.00782, "grad_norm": 0.005654075648635626, "kl": 0.5157980658113956, "learning_rate": 9.999742447525931e-06, "loss": -0.0121, "step": 782, "step_time": 5.13541822899424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1118.0, "completions/max_terminated_length": 1118.0, "completions/mean_length": 255.65625, "completions/mean_terminated_length": 255.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.988871186971664, "epoch": 0.00783, "frac_reward_zero_std": 0.5, "grad_norm": 0.008372110314667225, "kl": 0.5354340504854918, "learning_rate": 9.99974175658186e-06, "loss": -0.0039, "num_tokens": 17864858.0, "reward": 1.0860227346420288, "reward_std": 0.8936514854431152, "rewards/rollout_reward_func/mean": 1.0860227346420288, "rewards/rollout_reward_func/std": 1.6339399814605713, "sampling/importance_sampling_ratio/max": 0.5578740239143372, "sampling/importance_sampling_ratio/mean": 0.3038545846939087, "sampling/importance_sampling_ratio/min": 7.010476110735908e-06, "sampling/sampling_logp_difference/max": 3.1507515907287598, "sampling/sampling_logp_difference/mean": 0.9047021269798279, "step": 783, "step_time": 9.067674606001674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.0063716173172, "epoch": 0.00784, "grad_norm": 0.008126320317387581, "kl": 0.5363357421010733, "learning_rate": 9.999741064712254e-06, "loss": -0.0039, "step": 784, "step_time": 5.090304327008198 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013257576152682304, "completions/clipped_ratio": 0.0625, "completions/max_length": 1198.0, "completions/max_terminated_length": 1198.0, "completions/mean_length": 326.875, "completions/mean_terminated_length": 321.73333740234375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.694684207439423, "epoch": 0.00785, "frac_reward_zero_std": 0.0, "grad_norm": 0.05406257137656212, "kl": 0.3522528298199177, "learning_rate": 9.999740371917113e-06, "loss": -0.0142, "num_tokens": 17911934.0, "reward": 0.47740092873573303, "reward_std": 1.3048415184020996, "rewards/rollout_reward_func/mean": 0.47740092873573303, "rewards/rollout_reward_func/std": 1.3944610357284546, "sampling/importance_sampling_ratio/max": 0.49954915046691895, "sampling/importance_sampling_ratio/mean": 0.11643362045288086, "sampling/importance_sampling_ratio/min": 2.806228201066496e-15, "sampling/sampling_logp_difference/max": 4.009269714355469, "sampling/sampling_logp_difference/mean": 1.3549714088439941, "step": 785, "step_time": 9.821180439997988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.708090126514435, "epoch": 0.00786, "grad_norm": 0.09246014058589935, "kl": 0.35282826237380505, "learning_rate": 9.999739678196437e-06, "loss": -0.0142, "step": 786, "step_time": 5.257623068999237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 183.84375, "completions/mean_terminated_length": 176.5806427001953, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.3928718864917755, "epoch": 0.00787, "frac_reward_zero_std": 0.5, "grad_norm": 0.009686038829386234, "kl": 0.6981099508702755, "learning_rate": 9.999738983550224e-06, "loss": -0.0108, "num_tokens": 17950885.0, "reward": 1.663719892501831, "reward_std": 0.7475528717041016, "rewards/rollout_reward_func/mean": 1.663719892501831, "rewards/rollout_reward_func/std": 1.1636477708816528, "sampling/importance_sampling_ratio/max": 0.55673748254776, "sampling/importance_sampling_ratio/mean": 0.33758825063705444, "sampling/importance_sampling_ratio/min": 1.1256601119974396e-12, "sampling/sampling_logp_difference/max": 3.1495165824890137, "sampling/sampling_logp_difference/mean": 0.8009759187698364, "step": 787, "step_time": 9.012830818996008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.3807573318481445, "epoch": 0.00788, "grad_norm": 0.010370921343564987, "kl": 0.6933284439146519, "learning_rate": 9.999738287978477e-06, "loss": -0.0108, "step": 788, "step_time": 4.810867237003549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 1694.0, "completions/max_terminated_length": 1694.0, "completions/mean_length": 426.21875, "completions/mean_terminated_length": 426.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.420791447162628, "epoch": 0.00789, "frac_reward_zero_std": 0.0, "grad_norm": 0.07609885185956955, "kl": 0.5146136824041605, "learning_rate": 9.999737591481196e-06, "loss": -0.0132, "num_tokens": 18001416.0, "reward": 0.810278058052063, "reward_std": 1.6930299997329712, "rewards/rollout_reward_func/mean": 0.810278058052063, "rewards/rollout_reward_func/std": 1.7924693822860718, "sampling/importance_sampling_ratio/max": 0.5457202196121216, "sampling/importance_sampling_ratio/mean": 0.1313910335302353, "sampling/importance_sampling_ratio/min": 1.5343860626515493e-09, "sampling/sampling_logp_difference/max": 4.15908145904541, "sampling/sampling_logp_difference/mean": 1.3393747806549072, "step": 789, "step_time": 11.270620226001483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 6.430220544338226, "epoch": 0.0079, "grad_norm": 0.07341763377189636, "kl": 0.5065960586071014, "learning_rate": 9.999736894058379e-06, "loss": -0.0133, "step": 790, "step_time": 6.3467642340074235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1373.0, "completions/max_terminated_length": 1373.0, "completions/mean_length": 498.5, "completions/mean_terminated_length": 498.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.838984042406082, "epoch": 0.00791, "frac_reward_zero_std": 0.25, "grad_norm": 0.009681105613708496, "kl": 0.6359118148684502, "learning_rate": 9.999736195710027e-06, "loss": -0.0082, "num_tokens": 18052132.0, "reward": 1.290679931640625, "reward_std": 1.3756345510482788, "rewards/rollout_reward_func/mean": 1.290679931640625, "rewards/rollout_reward_func/std": 1.652610182762146, "sampling/importance_sampling_ratio/max": 0.558882474899292, "sampling/importance_sampling_ratio/mean": 0.2001921832561493, "sampling/importance_sampling_ratio/min": 9.249243081502212e-13, "sampling/sampling_logp_difference/max": 9.85413932800293, "sampling/sampling_logp_difference/mean": 1.2396645545959473, "step": 791, "step_time": 10.973378315000446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.833165884017944, "epoch": 0.00792, "grad_norm": 0.00962514616549015, "kl": 0.634971622377634, "learning_rate": 9.999735496436145e-06, "loss": -0.0082, "step": 792, "step_time": 6.2227880010032095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 1062.0, "completions/max_terminated_length": 1062.0, "completions/mean_length": 381.9375, "completions/mean_terminated_length": 381.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.913483202457428, "epoch": 0.00793, "frac_reward_zero_std": 0.0, "grad_norm": 0.0075057074427604675, "kl": 0.663780840113759, "learning_rate": 9.999734796236725e-06, "loss": -0.0107, "num_tokens": 18098042.0, "reward": 1.4541244506835938, "reward_std": 1.3934955596923828, "rewards/rollout_reward_func/mean": 1.4541244506835938, "rewards/rollout_reward_func/std": 1.5593856573104858, "sampling/importance_sampling_ratio/max": 0.5597532391548157, "sampling/importance_sampling_ratio/mean": 0.2932572364807129, "sampling/importance_sampling_ratio/min": 7.896797799000765e-12, "sampling/sampling_logp_difference/max": 4.097633361816406, "sampling/sampling_logp_difference/mean": 1.0161813497543335, "step": 793, "step_time": 9.05111920899435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 4.904108643531799, "epoch": 0.00794, "grad_norm": 0.006967233028262854, "kl": 0.6610964853316545, "learning_rate": 9.999734095111773e-06, "loss": -0.0107, "step": 794, "step_time": 5.0618663919995015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1369.0, "completions/max_terminated_length": 1369.0, "completions/mean_length": 452.40625, "completions/mean_terminated_length": 452.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.5919064581394196, "epoch": 0.00795, "frac_reward_zero_std": 0.0, "grad_norm": 0.012883273884654045, "kl": 0.6226501911878586, "learning_rate": 9.999733393061286e-06, "loss": -0.014, "num_tokens": 18148250.0, "reward": 0.9828009605407715, "reward_std": 1.1741504669189453, "rewards/rollout_reward_func/mean": 0.9828009605407715, "rewards/rollout_reward_func/std": 1.2959059476852417, "sampling/importance_sampling_ratio/max": 0.5556159615516663, "sampling/importance_sampling_ratio/mean": 0.20153363049030304, "sampling/importance_sampling_ratio/min": 2.915875596531947e-10, "sampling/sampling_logp_difference/max": 4.406022548675537, "sampling/sampling_logp_difference/mean": 1.1646894216537476, "step": 795, "step_time": 10.41498899299404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.583464682102203, "epoch": 0.00796, "grad_norm": 0.012277374975383282, "kl": 0.6208663545548916, "learning_rate": 9.999732690085267e-06, "loss": -0.0139, "step": 796, "step_time": 5.724274896001589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1090.0, "completions/max_terminated_length": 1090.0, "completions/mean_length": 202.8125, "completions/mean_terminated_length": 202.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.0661289393901825, "epoch": 0.00797, "frac_reward_zero_std": 0.5, "grad_norm": 0.010095727629959583, "kl": 0.8844674080610275, "learning_rate": 9.999731986183711e-06, "loss": -0.0041, "num_tokens": 18185240.0, "reward": 1.8338711261749268, "reward_std": 0.6992128491401672, "rewards/rollout_reward_func/mean": 1.8338711261749268, "rewards/rollout_reward_func/std": 0.9769031405448914, "sampling/importance_sampling_ratio/max": 0.5613255500793457, "sampling/importance_sampling_ratio/mean": 0.4395048916339874, "sampling/importance_sampling_ratio/min": 1.563217483635526e-05, "sampling/sampling_logp_difference/max": 3.5432419776916504, "sampling/sampling_logp_difference/mean": 0.48559796810150146, "step": 797, "step_time": 9.051400657997874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.0544377863407135, "epoch": 0.00798, "grad_norm": 0.009150011464953423, "kl": 0.8840001150965691, "learning_rate": 9.999731281356627e-06, "loss": -0.0041, "step": 798, "step_time": 5.387432087994966 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 1522.0, "completions/max_terminated_length": 1522.0, "completions/mean_length": 437.15625, "completions/mean_terminated_length": 437.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.4258162677288055, "epoch": 0.00799, "frac_reward_zero_std": 0.25, "grad_norm": 0.15777434408664703, "kl": 0.5216842694208026, "learning_rate": 9.999730575604006e-06, "loss": -0.0086, "num_tokens": 18234503.0, "reward": 1.5973279476165771, "reward_std": 1.0302233695983887, "rewards/rollout_reward_func/mean": 1.5973279476165771, "rewards/rollout_reward_func/std": 1.1867154836654663, "sampling/importance_sampling_ratio/max": 0.5514029860496521, "sampling/importance_sampling_ratio/mean": 0.22605104744434357, "sampling/importance_sampling_ratio/min": 6.098043741076253e-06, "sampling/sampling_logp_difference/max": 2.580040693283081, "sampling/sampling_logp_difference/mean": 1.0357426404953003, "step": 799, "step_time": 10.569576592992235 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 5.422855347394943, "epoch": 0.008, "grad_norm": 0.04687910154461861, "kl": 0.5463248025625944, "learning_rate": 9.999729868925855e-06, "loss": -0.0091, "step": 800, "step_time": 6.009816909001529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 242.5625, "completions/mean_terminated_length": 243.1333465576172, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.4905965924263, "epoch": 0.00801, "frac_reward_zero_std": 0.0, "grad_norm": 0.09515002369880676, "kl": 0.4684169590473175, "learning_rate": 9.99972916132217e-06, "loss": -0.0137, "num_tokens": 18278048.0, "reward": 0.7307156324386597, "reward_std": 1.450479507446289, "rewards/rollout_reward_func/mean": 0.7307156324386597, "rewards/rollout_reward_func/std": 1.4636036157608032, "sampling/importance_sampling_ratio/max": 0.5470131635665894, "sampling/importance_sampling_ratio/mean": 0.16158750653266907, "sampling/importance_sampling_ratio/min": 7.794129835860095e-16, "sampling/sampling_logp_difference/max": 4.249660015106201, "sampling/sampling_logp_difference/mean": 1.3952391147613525, "step": 801, "step_time": 7.923123172997293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.488843530416489, "epoch": 0.00802, "grad_norm": 0.09429824352264404, "kl": 0.4632692113518715, "learning_rate": 9.999728452792951e-06, "loss": -0.0138, "step": 802, "step_time": 4.2057292940044135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1045.0, "completions/max_terminated_length": 1045.0, "completions/mean_length": 407.5, "completions/mean_terminated_length": 410.3333435058594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.42820143699646, "epoch": 0.00803, "frac_reward_zero_std": 0.0, "grad_norm": 0.11700236052274704, "kl": 0.5735816359519958, "learning_rate": 9.999727743338202e-06, "loss": -0.0123, "num_tokens": 18327485.0, "reward": 1.0778735876083374, "reward_std": 1.4202070236206055, "rewards/rollout_reward_func/mean": 1.0778735876083374, "rewards/rollout_reward_func/std": 1.4080612659454346, "sampling/importance_sampling_ratio/max": 0.5566246509552002, "sampling/importance_sampling_ratio/mean": 0.19442830979824066, "sampling/importance_sampling_ratio/min": 1.6356474085650109e-15, "sampling/sampling_logp_difference/max": 12.735309600830078, "sampling/sampling_logp_difference/mean": 1.1502233743667603, "step": 803, "step_time": 9.859601370000746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666744276881, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666744276881, "entropy": 5.428371906280518, "epoch": 0.00804, "grad_norm": 0.062026772648096085, "kl": 0.543616471812129, "learning_rate": 9.99972703295792e-06, "loss": -0.0126, "step": 804, "step_time": 5.530142164996505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1425.0, "completions/max_terminated_length": 1425.0, "completions/mean_length": 183.34375, "completions/mean_terminated_length": 183.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.4341573417186737, "epoch": 0.00805, "frac_reward_zero_std": 0.75, "grad_norm": 0.00782847311347723, "kl": 0.9497392363846302, "learning_rate": 9.999726321652106e-06, "loss": -0.0035, "num_tokens": 18364304.0, "reward": 1.9153661727905273, "reward_std": 0.5423017740249634, "rewards/rollout_reward_func/mean": 1.9153661727905273, "rewards/rollout_reward_func/std": 1.0392372608184814, "sampling/importance_sampling_ratio/max": 0.5606573820114136, "sampling/importance_sampling_ratio/mean": 0.42390328645706177, "sampling/importance_sampling_ratio/min": 5.2208393858110824e-12, "sampling/sampling_logp_difference/max": 3.8280487060546875, "sampling/sampling_logp_difference/mean": 0.6264277696609497, "step": 805, "step_time": 10.093419090997486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0024999999441206455, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024999999441206455, "entropy": 3.436723083257675, "epoch": 0.00806, "grad_norm": 0.007447425276041031, "kl": 0.9492172561585903, "learning_rate": 9.999725609420761e-06, "loss": -0.0035, "step": 806, "step_time": 5.733761728995887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1135.0, "completions/max_terminated_length": 1135.0, "completions/mean_length": 348.21875, "completions/mean_terminated_length": 348.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.621841758489609, "epoch": 0.00807, "frac_reward_zero_std": 0.25, "grad_norm": 0.03207160159945488, "kl": 0.5516029726713896, "learning_rate": 9.999724896263882e-06, "loss": -0.0128, "num_tokens": 18411127.0, "reward": 0.8837496638298035, "reward_std": 0.9133739471435547, "rewards/rollout_reward_func/mean": 0.8837496638298035, "rewards/rollout_reward_func/std": 1.287269115447998, "sampling/importance_sampling_ratio/max": 0.5540860891342163, "sampling/importance_sampling_ratio/mean": 0.21528474986553192, "sampling/importance_sampling_ratio/min": 4.365537733974634e-06, "sampling/sampling_logp_difference/max": 2.766023635864258, "sampling/sampling_logp_difference/mean": 1.0746804475784302, "step": 807, "step_time": 9.513878928006307 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020833333488553762, "entropy": 5.618584483861923, "epoch": 0.00808, "grad_norm": 0.01370350830256939, "kl": 0.550208686850965, "learning_rate": 9.999724182181473e-06, "loss": -0.0129, "step": 808, "step_time": 5.15336468500027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1349.0, "completions/max_terminated_length": 1349.0, "completions/mean_length": 376.21875, "completions/mean_terminated_length": 373.58062744140625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.578514873981476, "epoch": 0.00809, "frac_reward_zero_std": 0.25, "grad_norm": 0.041900794953107834, "kl": 0.6161585114896297, "learning_rate": 9.999723467173534e-06, "loss": -0.0101, "num_tokens": 18457168.0, "reward": 0.9911272525787354, "reward_std": 1.0877071619033813, "rewards/rollout_reward_func/mean": 0.9911272525787354, "rewards/rollout_reward_func/std": 1.4722671508789062, "sampling/importance_sampling_ratio/max": 0.5583702325820923, "sampling/importance_sampling_ratio/mean": 0.19086956977844238, "sampling/importance_sampling_ratio/min": 2.643238546876786e-13, "sampling/sampling_logp_difference/max": 12.696361541748047, "sampling/sampling_logp_difference/mean": 1.1764492988586426, "step": 809, "step_time": 10.892886572000862 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 5.575417459011078, "epoch": 0.0081, "grad_norm": 0.021556977182626724, "kl": 0.6072578728199005, "learning_rate": 9.999722751240062e-06, "loss": -0.0102, "step": 810, "step_time": 5.891864250999788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.6829024255275726, "epoch": 0.00811, "frac_reward_zero_std": 0.5, "grad_norm": 0.07428892701864243, "kl": 0.7869374081492424, "learning_rate": 9.999722034381061e-06, "loss": -0.0044, "num_tokens": 18490273.0, "reward": 1.268718957901001, "reward_std": 0.4867534339427948, "rewards/rollout_reward_func/mean": 1.268718957901001, "rewards/rollout_reward_func/std": 1.239928960800171, "sampling/importance_sampling_ratio/max": 0.5618445873260498, "sampling/importance_sampling_ratio/mean": 0.4456453323364258, "sampling/importance_sampling_ratio/min": 3.891029791702749e-06, "sampling/sampling_logp_difference/max": 2.4323973655700684, "sampling/sampling_logp_difference/mean": 0.5640382766723633, "step": 811, "step_time": 6.988483047996851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.057291666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.057291666977107525, "entropy": 3.799774795770645, "epoch": 0.00812, "grad_norm": 0.01821761764585972, "kl": 0.7656202986836433, "learning_rate": 9.999721316596529e-06, "loss": -0.0045, "step": 812, "step_time": 3.8982586590027495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1041.0, "completions/max_terminated_length": 1041.0, "completions/mean_length": 194.6875, "completions/mean_terminated_length": 194.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.0832600593566895, "epoch": 0.00813, "frac_reward_zero_std": 0.25, "grad_norm": 0.09899754077196121, "kl": 0.510361798107624, "learning_rate": 9.999720597886464e-06, "loss": -0.0166, "num_tokens": 18530765.0, "reward": 0.686419665813446, "reward_std": 1.191809058189392, "rewards/rollout_reward_func/mean": 0.686419665813446, "rewards/rollout_reward_func/std": 1.5740201473236084, "sampling/importance_sampling_ratio/max": 0.5619470477104187, "sampling/importance_sampling_ratio/mean": 0.2733971178531647, "sampling/importance_sampling_ratio/min": 0.00011973419896094128, "sampling/sampling_logp_difference/max": 2.9965567588806152, "sampling/sampling_logp_difference/mean": 0.9042349457740784, "step": 813, "step_time": 8.803800609999598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 5.167067050933838, "epoch": 0.00814, "grad_norm": 0.09165303409099579, "kl": 0.4959615971893072, "learning_rate": 9.99971987825087e-06, "loss": -0.0171, "step": 814, "step_time": 4.921329856006196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 375.59375, "completions/mean_terminated_length": 375.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.426855683326721, "epoch": 0.00815, "frac_reward_zero_std": 0.0, "grad_norm": 0.11542637646198273, "kl": 0.4690941460430622, "learning_rate": 9.999719157689747e-06, "loss": -0.0064, "num_tokens": 18578684.0, "reward": 0.5937157273292542, "reward_std": 1.2138688564300537, "rewards/rollout_reward_func/mean": 0.5937157273292542, "rewards/rollout_reward_func/std": 1.6747989654541016, "sampling/importance_sampling_ratio/max": 0.6809327602386475, "sampling/importance_sampling_ratio/mean": 0.15220028162002563, "sampling/importance_sampling_ratio/min": 3.340563125675544e-06, "sampling/sampling_logp_difference/max": 4.266809463500977, "sampling/sampling_logp_difference/mean": 1.2701810598373413, "step": 815, "step_time": 9.789973366005142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.07812500093132257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.07812500093132257, "entropy": 6.7073909640312195, "epoch": 0.00816, "grad_norm": 0.05142078176140785, "kl": 0.43942729383707047, "learning_rate": 9.999718436203094e-06, "loss": -0.0069, "step": 816, "step_time": 4.930110349003371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 93.84375, "completions/mean_terminated_length": 85.7741928100586, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.069375395774841, "epoch": 0.00817, "frac_reward_zero_std": 0.25, "grad_norm": 0.1364990621805191, "kl": 0.7652542367577553, "learning_rate": 9.999717713790909e-06, "loss": -0.0115, "num_tokens": 18615047.0, "reward": 1.1253883838653564, "reward_std": 0.9428339004516602, "rewards/rollout_reward_func/mean": 1.1253883838653564, "rewards/rollout_reward_func/std": 1.1202540397644043, "sampling/importance_sampling_ratio/max": 0.5588148832321167, "sampling/importance_sampling_ratio/mean": 0.3006037473678589, "sampling/importance_sampling_ratio/min": 9.574323278638275e-11, "sampling/sampling_logp_difference/max": 4.775975227355957, "sampling/sampling_logp_difference/mean": 0.9599727392196655, "step": 817, "step_time": 6.864891659995919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 5.199218511581421, "epoch": 0.00818, "grad_norm": 0.1108751967549324, "kl": 0.7399784848093987, "learning_rate": 9.999716990453195e-06, "loss": -0.0113, "step": 818, "step_time": 3.6931515249998483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1045.0, "completions/max_terminated_length": 1045.0, "completions/mean_length": 220.71875, "completions/mean_terminated_length": 220.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.605605930089951, "epoch": 0.00819, "frac_reward_zero_std": 0.5, "grad_norm": 0.0030042477883398533, "kl": 0.5537140993401408, "learning_rate": 9.999716266189952e-06, "loss": -0.0057, "num_tokens": 18654443.0, "reward": 1.2131903171539307, "reward_std": 0.6529244184494019, "rewards/rollout_reward_func/mean": 1.2131903171539307, "rewards/rollout_reward_func/std": 1.3638769388198853, "sampling/importance_sampling_ratio/max": 0.557607114315033, "sampling/importance_sampling_ratio/mean": 0.3024512231349945, "sampling/importance_sampling_ratio/min": 2.3407196181324252e-07, "sampling/sampling_logp_difference/max": 4.102196216583252, "sampling/sampling_logp_difference/mean": 1.2488386631011963, "step": 819, "step_time": 8.364941805000853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.595553934574127, "epoch": 0.0082, "grad_norm": 0.0028942753560841084, "kl": 0.5539277708157897, "learning_rate": 9.99971554100118e-06, "loss": -0.0056, "step": 820, "step_time": 5.220845918007399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 204.59375, "completions/mean_terminated_length": 204.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.354553043842316, "epoch": 0.00821, "frac_reward_zero_std": 0.5, "grad_norm": 0.013868918642401695, "kl": 0.6002988442778587, "learning_rate": 9.99971481488688e-06, "loss": -0.0115, "num_tokens": 18692476.0, "reward": 1.473341703414917, "reward_std": 0.8925998210906982, "rewards/rollout_reward_func/mean": 1.473341703414917, "rewards/rollout_reward_func/std": 1.314660668373108, "sampling/importance_sampling_ratio/max": 0.5607547163963318, "sampling/importance_sampling_ratio/mean": 0.32197171449661255, "sampling/importance_sampling_ratio/min": 4.2527090045041405e-06, "sampling/sampling_logp_difference/max": 2.754023551940918, "sampling/sampling_logp_difference/mean": 1.0502216815948486, "step": 821, "step_time": 8.920222114989883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.351597368717194, "epoch": 0.00822, "grad_norm": 0.013546761125326157, "kl": 0.5939770825207233, "learning_rate": 9.99971408784705e-06, "loss": -0.0115, "step": 822, "step_time": 4.6819207050029945 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 216.375, "completions/mean_terminated_length": 216.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.585466146469116, "epoch": 0.00823, "frac_reward_zero_std": 0.0, "grad_norm": 0.04089023917913437, "kl": 0.5116410255432129, "learning_rate": 9.99971335988169e-06, "loss": -0.0111, "num_tokens": 18734487.0, "reward": 0.7158352136611938, "reward_std": 1.1850640773773193, "rewards/rollout_reward_func/mean": 0.7158352136611938, "rewards/rollout_reward_func/std": 1.3964422941207886, "sampling/importance_sampling_ratio/max": 0.5588886141777039, "sampling/importance_sampling_ratio/mean": 0.1729010045528412, "sampling/importance_sampling_ratio/min": 2.763925294857472e-06, "sampling/sampling_logp_difference/max": 2.9094648361206055, "sampling/sampling_logp_difference/mean": 1.2671958208084106, "step": 823, "step_time": 7.750661966001644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.573865473270416, "epoch": 0.00824, "grad_norm": 0.08943275362253189, "kl": 0.4993261285126209, "learning_rate": 9.999712630990802e-06, "loss": -0.011, "step": 824, "step_time": 4.077047034003044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1249.0, "completions/max_terminated_length": 1249.0, "completions/mean_length": 401.59375, "completions/mean_terminated_length": 401.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.422319054603577, "epoch": 0.00825, "frac_reward_zero_std": 0.5, "grad_norm": 0.012001791968941689, "kl": 0.5308416169136763, "learning_rate": 9.999711901174385e-06, "loss": -0.0071, "num_tokens": 18780719.0, "reward": 0.5720277428627014, "reward_std": 0.9599268436431885, "rewards/rollout_reward_func/mean": 0.5720277428627014, "rewards/rollout_reward_func/std": 1.6362261772155762, "sampling/importance_sampling_ratio/max": 0.5601397156715393, "sampling/importance_sampling_ratio/mean": 0.29733753204345703, "sampling/importance_sampling_ratio/min": 3.839034558364757e-18, "sampling/sampling_logp_difference/max": 4.6985554695129395, "sampling/sampling_logp_difference/mean": 1.1269580125808716, "step": 825, "step_time": 9.865171665998787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.412641853094101, "epoch": 0.00826, "grad_norm": 0.011401299387216568, "kl": 0.530197967775166, "learning_rate": 9.999711170432441e-06, "loss": -0.0071, "step": 826, "step_time": 5.728185355994356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1288.0, "completions/max_terminated_length": 1288.0, "completions/mean_length": 618.375, "completions/mean_terminated_length": 599.1935424804688, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.1818206906318665, "epoch": 0.00827, "frac_reward_zero_std": 0.0, "grad_norm": 0.02461504004895687, "kl": 0.5500675067305565, "learning_rate": 9.999710438764968e-06, "loss": -0.0102, "num_tokens": 18835893.0, "reward": 0.9207163453102112, "reward_std": 1.2974469661712646, "rewards/rollout_reward_func/mean": 0.9207163453102112, "rewards/rollout_reward_func/std": 1.7292238473892212, "sampling/importance_sampling_ratio/max": 0.5649543404579163, "sampling/importance_sampling_ratio/mean": 0.1722087562084198, "sampling/importance_sampling_ratio/min": 1.2403379514580593e-05, "sampling/sampling_logp_difference/max": 3.2723357677459717, "sampling/sampling_logp_difference/mean": 0.931765615940094, "step": 827, "step_time": 10.74336524100363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "entropy": 5.166671931743622, "epoch": 0.00828, "grad_norm": 0.02417115867137909, "kl": 0.548377301543951, "learning_rate": 9.999709706171968e-06, "loss": -0.0103, "step": 828, "step_time": 5.604643925995333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1389.0, "completions/max_terminated_length": 1389.0, "completions/mean_length": 549.75, "completions/mean_terminated_length": 549.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.31521999835968, "epoch": 0.00829, "frac_reward_zero_std": 0.0, "grad_norm": 0.013268968090415001, "kl": 0.3651489792391658, "learning_rate": 9.999708972653441e-06, "loss": -0.0132, "num_tokens": 18890153.0, "reward": 0.9783883094787598, "reward_std": 1.6241247653961182, "rewards/rollout_reward_func/mean": 0.9783883094787598, "rewards/rollout_reward_func/std": 1.8394323587417603, "sampling/importance_sampling_ratio/max": 0.3107578754425049, "sampling/importance_sampling_ratio/mean": 0.1110948696732521, "sampling/importance_sampling_ratio/min": 5.500684814325335e-14, "sampling/sampling_logp_difference/max": 4.839944839477539, "sampling/sampling_logp_difference/mean": 1.2904177904129028, "step": 829, "step_time": 10.348116549001134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0036764706019312143, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036764706019312143, "entropy": 6.329380631446838, "epoch": 0.0083, "grad_norm": 0.013406427577137947, "kl": 0.3640747591853142, "learning_rate": 9.999708238209385e-06, "loss": -0.0132, "step": 830, "step_time": 5.717081003997009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1027.0, "completions/max_terminated_length": 1027.0, "completions/mean_length": 114.09375, "completions/mean_terminated_length": 114.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.030858010053635, "epoch": 0.00831, "frac_reward_zero_std": 0.75, "grad_norm": 0.0628335177898407, "kl": 0.7256956994533539, "learning_rate": 9.999707502839802e-06, "loss": -0.0025, "num_tokens": 18924850.0, "reward": 1.332452654838562, "reward_std": 0.4308134615421295, "rewards/rollout_reward_func/mean": 1.332452654838562, "rewards/rollout_reward_func/std": 1.4272743463516235, "sampling/importance_sampling_ratio/max": 0.5609004497528076, "sampling/importance_sampling_ratio/mean": 0.4197853207588196, "sampling/importance_sampling_ratio/min": 3.2364028449381743e-13, "sampling/sampling_logp_difference/max": 10.173107147216797, "sampling/sampling_logp_difference/mean": 0.8100380897521973, "step": 831, "step_time": 8.324001323999255 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 4.028167933225632, "epoch": 0.00832, "grad_norm": 0.003128102282062173, "kl": 0.7256807759404182, "learning_rate": 9.999706766544692e-06, "loss": -0.0027, "step": 832, "step_time": 5.366748094995273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1255.0, "completions/max_terminated_length": 1255.0, "completions/mean_length": 231.28125, "completions/mean_terminated_length": 238.22579956054688, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.199437618255615, "epoch": 0.00833, "frac_reward_zero_std": 0.25, "grad_norm": 0.029787367209792137, "kl": 0.5159009657800198, "learning_rate": 9.999706029324055e-06, "loss": -0.0081, "num_tokens": 18966655.0, "reward": 0.2604735791683197, "reward_std": 1.0081332921981812, "rewards/rollout_reward_func/mean": 0.2604735791683197, "rewards/rollout_reward_func/std": 1.6427421569824219, "sampling/importance_sampling_ratio/max": 0.5569727420806885, "sampling/importance_sampling_ratio/mean": 0.1835363358259201, "sampling/importance_sampling_ratio/min": 3.3039849949381174e-15, "sampling/sampling_logp_difference/max": 9.997044563293457, "sampling/sampling_logp_difference/mean": 1.4758656024932861, "step": 833, "step_time": 10.062128057994414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.193919003009796, "epoch": 0.00834, "grad_norm": 0.03286369517445564, "kl": 0.5097744911909103, "learning_rate": 9.999705291177891e-06, "loss": -0.008, "step": 834, "step_time": 5.518312560998311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 100.90625, "completions/mean_terminated_length": 100.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.504168808460236, "epoch": 0.00835, "frac_reward_zero_std": 0.25, "grad_norm": 0.12461061775684357, "kl": 0.6587738655507565, "learning_rate": 9.999704552106202e-06, "loss": -0.0068, "num_tokens": 19001462.0, "reward": 0.9962348937988281, "reward_std": 1.0542640686035156, "rewards/rollout_reward_func/mean": 0.9962348937988281, "rewards/rollout_reward_func/std": 1.4702318906784058, "sampling/importance_sampling_ratio/max": 0.7328256964683533, "sampling/importance_sampling_ratio/mean": 0.34976157546043396, "sampling/importance_sampling_ratio/min": 4.940187864121981e-06, "sampling/sampling_logp_difference/max": 4.023098945617676, "sampling/sampling_logp_difference/mean": 0.8479019403457642, "step": 835, "step_time": 7.048374996000348 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 4.521932780742645, "epoch": 0.00836, "grad_norm": 0.05408518388867378, "kl": 0.6476293653249741, "learning_rate": 9.999703812108984e-06, "loss": -0.0071, "step": 836, "step_time": 3.89560043900201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1217.0, "completions/max_terminated_length": 1217.0, "completions/mean_length": 508.6875, "completions/mean_terminated_length": 508.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.091561019420624, "epoch": 0.00837, "frac_reward_zero_std": 0.0, "grad_norm": 0.04456321522593498, "kl": 0.3511710804887116, "learning_rate": 9.999703071186241e-06, "loss": -0.0195, "num_tokens": 19054756.0, "reward": 0.45298314094543457, "reward_std": 1.590667724609375, "rewards/rollout_reward_func/mean": 0.45298314094543457, "rewards/rollout_reward_func/std": 1.6451629400253296, "sampling/importance_sampling_ratio/max": 0.3142670691013336, "sampling/importance_sampling_ratio/mean": 0.09680217504501343, "sampling/importance_sampling_ratio/min": 2.9406890117433804e-11, "sampling/sampling_logp_difference/max": 4.136491775512695, "sampling/sampling_logp_difference/mean": 1.1050076484680176, "step": 837, "step_time": 10.031002610001451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.096462249755859, "epoch": 0.00838, "grad_norm": 0.04754398390650749, "kl": 0.3498899517580867, "learning_rate": 9.999702329337973e-06, "loss": -0.0195, "step": 838, "step_time": 6.25943328300491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 210.125, "completions/mean_terminated_length": 210.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.811141759157181, "epoch": 0.00839, "frac_reward_zero_std": 0.25, "grad_norm": 0.01333694253116846, "kl": 0.6424208432435989, "learning_rate": 9.999701586564176e-06, "loss": -0.0197, "num_tokens": 19097027.0, "reward": 1.8990983963012695, "reward_std": 1.0743646621704102, "rewards/rollout_reward_func/mean": 1.8990983963012695, "rewards/rollout_reward_func/std": 1.2691134214401245, "sampling/importance_sampling_ratio/max": 0.55917888879776, "sampling/importance_sampling_ratio/mean": 0.27298611402511597, "sampling/importance_sampling_ratio/min": 1.3357097259358852e-06, "sampling/sampling_logp_difference/max": 4.703125476837158, "sampling/sampling_logp_difference/mean": 0.9169949889183044, "step": 839, "step_time": 7.476377410999703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.80833375453949, "epoch": 0.0084, "grad_norm": 0.014015255495905876, "kl": 0.6468724012374878, "learning_rate": 9.999700842864858e-06, "loss": -0.0197, "step": 840, "step_time": 4.044980416991166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1066.0, "completions/max_terminated_length": 1066.0, "completions/mean_length": 440.5625, "completions/mean_terminated_length": 440.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.206878453493118, "epoch": 0.00841, "frac_reward_zero_std": 0.25, "grad_norm": 0.006954957731068134, "kl": 0.3573467442765832, "learning_rate": 9.999700098240011e-06, "loss": -0.0174, "num_tokens": 19146632.0, "reward": 1.1238574981689453, "reward_std": 1.2749414443969727, "rewards/rollout_reward_func/mean": 1.1238574981689453, "rewards/rollout_reward_func/std": 1.5470935106277466, "sampling/importance_sampling_ratio/max": 0.5615661144256592, "sampling/importance_sampling_ratio/mean": 0.1826063096523285, "sampling/importance_sampling_ratio/min": 3.6252051359042525e-05, "sampling/sampling_logp_difference/max": 2.752593994140625, "sampling/sampling_logp_difference/mean": 1.1357097625732422, "step": 841, "step_time": 9.285324991004018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 6.22381266951561, "epoch": 0.00842, "grad_norm": 0.005349314771592617, "kl": 0.3531578406691551, "learning_rate": 9.999699352689638e-06, "loss": -0.0174, "step": 842, "step_time": 5.080666127989389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 781.0, "completions/max_terminated_length": 781.0, "completions/mean_length": 219.34375, "completions/mean_terminated_length": 210.5483856201172, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.246934026479721, "epoch": 0.00843, "frac_reward_zero_std": 0.25, "grad_norm": 0.03671419247984886, "kl": 0.4097129814326763, "learning_rate": 9.999698606213743e-06, "loss": -0.0066, "num_tokens": 19187645.0, "reward": 0.6314017176628113, "reward_std": 1.2819956541061401, "rewards/rollout_reward_func/mean": 0.6314017176628113, "rewards/rollout_reward_func/std": 1.7155719995498657, "sampling/importance_sampling_ratio/max": 0.5511430501937866, "sampling/importance_sampling_ratio/mean": 0.2166445553302765, "sampling/importance_sampling_ratio/min": 6.992513122347499e-14, "sampling/sampling_logp_difference/max": 4.269595623016357, "sampling/sampling_logp_difference/mean": 1.236667275428772, "step": 843, "step_time": 8.612406847001694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.240929841995239, "epoch": 0.00844, "grad_norm": 0.035103246569633484, "kl": 0.4084679186344147, "learning_rate": 9.999697858812321e-06, "loss": -0.0066, "step": 844, "step_time": 4.916497928003082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1689.0, "completions/max_terminated_length": 1689.0, "completions/mean_length": 204.125, "completions/mean_terminated_length": 204.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.531455636024475, "epoch": 0.00845, "frac_reward_zero_std": 0.25, "grad_norm": 0.03902106359601021, "kl": 0.5723611116409302, "learning_rate": 9.999697110485375e-06, "loss": -0.0135, "num_tokens": 19228891.0, "reward": 0.7270225286483765, "reward_std": 1.0286163091659546, "rewards/rollout_reward_func/mean": 0.7270225286483765, "rewards/rollout_reward_func/std": 1.568471074104309, "sampling/importance_sampling_ratio/max": 0.5592520833015442, "sampling/importance_sampling_ratio/mean": 0.30795788764953613, "sampling/importance_sampling_ratio/min": 0.0004422069760039449, "sampling/sampling_logp_difference/max": 2.4104576110839844, "sampling/sampling_logp_difference/mean": 0.7622987031936646, "step": 845, "step_time": 10.894853175992466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.535184025764465, "epoch": 0.00846, "grad_norm": 0.039999160915613174, "kl": 0.5756605379283428, "learning_rate": 9.999696361232904e-06, "loss": -0.0134, "step": 846, "step_time": 6.286525186998915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 947.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 297.96875, "completions/mean_terminated_length": 297.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.726775944232941, "epoch": 0.00847, "frac_reward_zero_std": 0.25, "grad_norm": 0.017249956727027893, "kl": 0.4575106711126864, "learning_rate": 9.999695611054908e-06, "loss": -0.0102, "num_tokens": 19273584.0, "reward": 1.0024816989898682, "reward_std": 1.1709589958190918, "rewards/rollout_reward_func/mean": 1.0024816989898682, "rewards/rollout_reward_func/std": 1.4511189460754395, "sampling/importance_sampling_ratio/max": 0.5525084733963013, "sampling/importance_sampling_ratio/mean": 0.22497601807117462, "sampling/importance_sampling_ratio/min": 0.0001966767740668729, "sampling/sampling_logp_difference/max": 2.5887842178344727, "sampling/sampling_logp_difference/mean": 1.038000226020813, "step": 847, "step_time": 8.356800537996605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.730008155107498, "epoch": 0.00848, "grad_norm": 0.016970129683613777, "kl": 0.45691065047867596, "learning_rate": 9.99969485995139e-06, "loss": -0.0102, "step": 848, "step_time": 4.728324373001669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 167.96875, "completions/mean_terminated_length": 167.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.713880926370621, "epoch": 0.00849, "frac_reward_zero_std": 0.25, "grad_norm": 0.09871052950620651, "kl": 0.34324789233505726, "learning_rate": 9.999694107922345e-06, "loss": 0.0007, "num_tokens": 19314032.0, "reward": 0.7817789316177368, "reward_std": 1.172999620437622, "rewards/rollout_reward_func/mean": 0.7817789316177368, "rewards/rollout_reward_func/std": 1.5013391971588135, "sampling/importance_sampling_ratio/max": 0.7065298557281494, "sampling/importance_sampling_ratio/mean": 0.26158607006073, "sampling/importance_sampling_ratio/min": 5.993164904793957e-06, "sampling/sampling_logp_difference/max": 2.8890886306762695, "sampling/sampling_logp_difference/mean": 1.0567706823349, "step": 849, "step_time": 8.553827251995244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03645833395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03645833395421505, "entropy": 5.711234152317047, "epoch": 0.0085, "grad_norm": 0.05713628977537155, "kl": 0.342058252543211, "learning_rate": 9.999693354967777e-06, "loss": 0.0001, "step": 850, "step_time": 4.862321680993773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1090.0, "completions/max_terminated_length": 1090.0, "completions/mean_length": 323.75, "completions/mean_terminated_length": 322.0967712402344, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.352403402328491, "epoch": 0.00851, "frac_reward_zero_std": 0.0, "grad_norm": 0.14638318121433258, "kl": 0.3286386476829648, "learning_rate": 9.999692601087686e-06, "loss": -0.0169, "num_tokens": 19359253.0, "reward": 0.6363406181335449, "reward_std": 1.2912418842315674, "rewards/rollout_reward_func/mean": 0.6363406181335449, "rewards/rollout_reward_func/std": 1.6443235874176025, "sampling/importance_sampling_ratio/max": 0.5639177560806274, "sampling/importance_sampling_ratio/mean": 0.15568622946739197, "sampling/importance_sampling_ratio/min": 1.0034345227075874e-24, "sampling/sampling_logp_difference/max": 4.603053092956543, "sampling/sampling_logp_difference/mean": 1.199256181716919, "step": 851, "step_time": 9.489391767005145 }, { "clip_ratio/high_max": 0.08333333395421505, "clip_ratio/high_mean": 0.041666666977107525, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.057291666977107525, "entropy": 6.281131982803345, "epoch": 0.00852, "grad_norm": 0.032634105533361435, "kl": 0.3502370538190007, "learning_rate": 9.999691846282073e-06, "loss": -0.017, "step": 852, "step_time": 5.109813557002781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2286.0, "completions/max_terminated_length": 2286.0, "completions/mean_length": 713.53125, "completions/mean_terminated_length": 713.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.415631055831909, "epoch": 0.00853, "frac_reward_zero_std": 0.0, "grad_norm": 0.02878219448029995, "kl": 0.2907652398571372, "learning_rate": 9.999691090550936e-06, "loss": -0.014, "num_tokens": 19418742.0, "reward": 1.6085903644561768, "reward_std": 1.5081437826156616, "rewards/rollout_reward_func/mean": 1.6085903644561768, "rewards/rollout_reward_func/std": 1.6055474281311035, "sampling/importance_sampling_ratio/max": 0.31335756182670593, "sampling/importance_sampling_ratio/mean": 0.08401114493608475, "sampling/importance_sampling_ratio/min": 3.647085691227403e-07, "sampling/sampling_logp_difference/max": 3.7861180305480957, "sampling/sampling_logp_difference/mean": 1.1478580236434937, "step": 853, "step_time": 13.34865877399352 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 6.407030016183853, "epoch": 0.00854, "grad_norm": 0.027826717123389244, "kl": 0.29204037971794605, "learning_rate": 9.999690333894273e-06, "loss": -0.014, "step": 854, "step_time": 7.577863583006547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 225.03125, "completions/mean_terminated_length": 231.77418518066406, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.976026803255081, "epoch": 0.00855, "frac_reward_zero_std": 0.25, "grad_norm": 0.048985738307237625, "kl": 0.5222478657960892, "learning_rate": 9.99968957631209e-06, "loss": -0.0082, "num_tokens": 19458297.0, "reward": 1.2827180624008179, "reward_std": 1.1104629039764404, "rewards/rollout_reward_func/mean": 1.2827180624008179, "rewards/rollout_reward_func/std": 1.510080337524414, "sampling/importance_sampling_ratio/max": 0.560012936592102, "sampling/importance_sampling_ratio/mean": 0.28831946849823, "sampling/importance_sampling_ratio/min": 3.203782239502573e-10, "sampling/sampling_logp_difference/max": 3.6952157020568848, "sampling/sampling_logp_difference/mean": 1.2304168939590454, "step": 855, "step_time": 9.025277330998506 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 5.959501326084137, "epoch": 0.00856, "grad_norm": 0.019781991839408875, "kl": 0.5266291759908199, "learning_rate": 9.999688817804385e-06, "loss": -0.0084, "step": 856, "step_time": 4.496739802994853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1385.0, "completions/max_terminated_length": 1385.0, "completions/mean_length": 383.78125, "completions/mean_terminated_length": 383.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.617192983627319, "epoch": 0.00857, "frac_reward_zero_std": 0.25, "grad_norm": 0.01072264276444912, "kl": 0.6767667979001999, "learning_rate": 9.999688058371155e-06, "loss": -0.0024, "num_tokens": 19503930.0, "reward": 1.1464736461639404, "reward_std": 0.8498347997665405, "rewards/rollout_reward_func/mean": 1.1464736461639404, "rewards/rollout_reward_func/std": 1.1673145294189453, "sampling/importance_sampling_ratio/max": 0.5562025904655457, "sampling/importance_sampling_ratio/mean": 0.294924795627594, "sampling/importance_sampling_ratio/min": 5.755850927411028e-16, "sampling/sampling_logp_difference/max": 4.901000022888184, "sampling/sampling_logp_difference/mean": 0.86452317237854, "step": 857, "step_time": 10.316764293002052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.605598837137222, "epoch": 0.00858, "grad_norm": 0.010690941475331783, "kl": 0.6767762545496225, "learning_rate": 9.999687298012404e-06, "loss": -0.0024, "step": 858, "step_time": 5.661049551006727 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 242.71875, "completions/mean_terminated_length": 242.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.471343398094177, "epoch": 0.00859, "frac_reward_zero_std": 0.25, "grad_norm": 0.1608061045408249, "kl": 0.503684040158987, "learning_rate": 9.999686536728131e-06, "loss": -0.015, "num_tokens": 19545382.0, "reward": 0.5940237641334534, "reward_std": 0.6788533329963684, "rewards/rollout_reward_func/mean": 0.5940237641334534, "rewards/rollout_reward_func/std": 1.463939905166626, "sampling/importance_sampling_ratio/max": 0.5873011946678162, "sampling/importance_sampling_ratio/mean": 0.24358874559402466, "sampling/importance_sampling_ratio/min": 0.0003598741313908249, "sampling/sampling_logp_difference/max": 2.53385329246521, "sampling/sampling_logp_difference/mean": 1.0672597885131836, "step": 859, "step_time": 8.419511881002109 }, { "clip_ratio/high_max": 0.07812500186264515, "clip_ratio/high_mean": 0.039062500931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.039062500931322575, "entropy": 5.469371676445007, "epoch": 0.0086, "grad_norm": 0.009655952453613281, "kl": 0.5069572012871504, "learning_rate": 9.999685774518335e-06, "loss": -0.0156, "step": 860, "step_time": 4.645379481007694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1418.0, "completions/max_terminated_length": 1418.0, "completions/mean_length": 289.96875, "completions/mean_terminated_length": 289.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.1020694971084595, "epoch": 0.00861, "frac_reward_zero_std": 0.25, "grad_norm": 0.10250399261713028, "kl": 0.43445974960923195, "learning_rate": 9.999685011383017e-06, "loss": -0.0057, "num_tokens": 19588947.0, "reward": 1.0094074010849, "reward_std": 1.0179818868637085, "rewards/rollout_reward_func/mean": 1.0094074010849, "rewards/rollout_reward_func/std": 1.6970643997192383, "sampling/importance_sampling_ratio/max": 0.724372148513794, "sampling/importance_sampling_ratio/mean": 0.19652965664863586, "sampling/importance_sampling_ratio/min": 2.254552100566798e-07, "sampling/sampling_logp_difference/max": 4.482903480529785, "sampling/sampling_logp_difference/mean": 1.1838427782058716, "step": 861, "step_time": 10.641913688003115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 6.12245774269104, "epoch": 0.00862, "grad_norm": 0.04489557445049286, "kl": 0.42837718315422535, "learning_rate": 9.999684247322179e-06, "loss": -0.0062, "step": 862, "step_time": 5.569121196997003 }, { "clip_ratio/high_max": 0.0059523810632526875, "clip_ratio/high_mean": 0.0029761905316263437, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013392857508733869, "completions/clipped_ratio": 0.0625, "completions/max_length": 1846.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 575.40625, "completions/mean_terminated_length": 603.300048828125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.345836639404297, "epoch": 0.00863, "frac_reward_zero_std": 0.0, "grad_norm": 0.10118368268013, "kl": 0.8075088150799274, "learning_rate": 9.999683482335817e-06, "loss": -0.005, "num_tokens": 19642925.0, "reward": 0.46718695759773254, "reward_std": 1.4501774311065674, "rewards/rollout_reward_func/mean": 0.46718695759773254, "rewards/rollout_reward_func/std": 1.478354811668396, "sampling/importance_sampling_ratio/max": 0.5541170239448547, "sampling/importance_sampling_ratio/mean": 0.09531361609697342, "sampling/importance_sampling_ratio/min": 2.0236876410728577e-12, "sampling/sampling_logp_difference/max": 4.480251312255859, "sampling/sampling_logp_difference/mean": 1.2713687419891357, "step": 863, "step_time": 12.53924810299577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 6.337746798992157, "epoch": 0.00864, "grad_norm": 0.08754467964172363, "kl": 0.6271044109016657, "learning_rate": 9.999682716423937e-06, "loss": -0.0053, "step": 864, "step_time": 6.88335229000586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 172.34375, "completions/mean_terminated_length": 172.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.155012845993042, "epoch": 0.00865, "frac_reward_zero_std": 0.25, "grad_norm": 0.04062714800238609, "kl": 0.9125541225075722, "learning_rate": 9.999681949586533e-06, "loss": -0.0063, "num_tokens": 19681437.0, "reward": 1.323612093925476, "reward_std": 0.8614369630813599, "rewards/rollout_reward_func/mean": 1.323612093925476, "rewards/rollout_reward_func/std": 1.4495079517364502, "sampling/importance_sampling_ratio/max": 0.5589532852172852, "sampling/importance_sampling_ratio/mean": 0.35496312379837036, "sampling/importance_sampling_ratio/min": 3.1677856545580044e-14, "sampling/sampling_logp_difference/max": 4.325991630554199, "sampling/sampling_logp_difference/mean": 0.8462364673614502, "step": 865, "step_time": 7.079612808996899 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02367424312978983, "entropy": 4.154504418373108, "epoch": 0.00866, "grad_norm": 0.006691935937851667, "kl": 0.913747064769268, "learning_rate": 9.999681181823611e-06, "loss": -0.0064, "step": 866, "step_time": 3.791085054999712 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.004166666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011979166883975267, "completions/clipped_ratio": 0.0, "completions/max_length": 1576.0, "completions/max_terminated_length": 1576.0, "completions/mean_length": 556.21875, "completions/mean_terminated_length": 556.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.7275959849357605, "epoch": 0.00867, "frac_reward_zero_std": 0.0, "grad_norm": 0.03775383532047272, "kl": 0.2754063317552209, "learning_rate": 9.999680413135167e-06, "loss": 0.0016, "num_tokens": 19736188.0, "reward": 0.6176159381866455, "reward_std": 1.2271144390106201, "rewards/rollout_reward_func/mean": 0.6176159381866455, "rewards/rollout_reward_func/std": 1.4079228639602661, "sampling/importance_sampling_ratio/max": 0.5463756322860718, "sampling/importance_sampling_ratio/mean": 0.09952112287282944, "sampling/importance_sampling_ratio/min": 5.000711400526825e-09, "sampling/sampling_logp_difference/max": 4.3815107345581055, "sampling/sampling_logp_difference/mean": 1.324137568473816, "step": 867, "step_time": 11.255757031998655 }, { "clip_ratio/high_max": 0.029761905781924725, "clip_ratio/high_mean": 0.014880952890962362, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014880952890962362, "entropy": 6.703474819660187, "epoch": 0.00868, "grad_norm": 0.03278752416372299, "kl": 0.2852907218039036, "learning_rate": 9.9996796435212e-06, "loss": 0.0016, "step": 868, "step_time": 6.123072728005354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1299.0, "completions/max_terminated_length": 1299.0, "completions/mean_length": 469.28125, "completions/mean_terminated_length": 470.774169921875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.98077005147934, "epoch": 0.00869, "frac_reward_zero_std": 0.25, "grad_norm": 0.021428074687719345, "kl": 0.6286480724811554, "learning_rate": 9.999678872981717e-06, "loss": -0.0052, "num_tokens": 19785716.0, "reward": 1.4797799587249756, "reward_std": 1.1677173376083374, "rewards/rollout_reward_func/mean": 1.4797799587249756, "rewards/rollout_reward_func/std": 1.3569440841674805, "sampling/importance_sampling_ratio/max": 0.5525619983673096, "sampling/importance_sampling_ratio/mean": 0.23313766717910767, "sampling/importance_sampling_ratio/min": 1.0643607595284266e-07, "sampling/sampling_logp_difference/max": 3.559433698654175, "sampling/sampling_logp_difference/mean": 0.8771290183067322, "step": 869, "step_time": 9.96078566900178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.98908668756485, "epoch": 0.0087, "grad_norm": 0.022535743191838264, "kl": 0.6268547847867012, "learning_rate": 9.999678101516712e-06, "loss": -0.0052, "step": 870, "step_time": 5.564331200996094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1029.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 376.78125, "completions/mean_terminated_length": 380.32257080078125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.919117748737335, "epoch": 0.00871, "frac_reward_zero_std": 0.25, "grad_norm": 0.03068990632891655, "kl": 0.4807072691619396, "learning_rate": 9.999677329126187e-06, "loss": -0.0089, "num_tokens": 19832544.0, "reward": 1.648889422416687, "reward_std": 1.6352500915527344, "rewards/rollout_reward_func/mean": 1.648889422416687, "rewards/rollout_reward_func/std": 1.8585665225982666, "sampling/importance_sampling_ratio/max": 0.5588000416755676, "sampling/importance_sampling_ratio/mean": 0.23119154572486877, "sampling/importance_sampling_ratio/min": 5.692631312366968e-25, "sampling/sampling_logp_difference/max": 6.254266738891602, "sampling/sampling_logp_difference/mean": 1.1105072498321533, "step": 871, "step_time": 9.262538785005745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.919314831495285, "epoch": 0.00872, "grad_norm": 0.03059012070298195, "kl": 0.4832749590277672, "learning_rate": 9.999676555810143e-06, "loss": -0.009, "step": 872, "step_time": 5.450303668996639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005965909222140908, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005965909222140908, "completions/clipped_ratio": 0.0625, "completions/max_length": 1397.0, "completions/max_terminated_length": 1397.0, "completions/mean_length": 528.78125, "completions/mean_terminated_length": 545.9000244140625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.08882737159729, "epoch": 0.00873, "frac_reward_zero_std": 0.0, "grad_norm": 0.017587624490261078, "kl": 0.5044682919979095, "learning_rate": 9.999675781568578e-06, "loss": -0.0125, "num_tokens": 19885505.0, "reward": 0.3201920986175537, "reward_std": 1.5326073169708252, "rewards/rollout_reward_func/mean": 0.3201920986175537, "rewards/rollout_reward_func/std": 1.8311327695846558, "sampling/importance_sampling_ratio/max": 0.5534799098968506, "sampling/importance_sampling_ratio/mean": 0.14489606022834778, "sampling/importance_sampling_ratio/min": 7.632476906599062e-17, "sampling/sampling_logp_difference/max": 4.522765159606934, "sampling/sampling_logp_difference/mean": 1.2945594787597656, "step": 873, "step_time": 10.902827240992337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "entropy": 6.089991509914398, "epoch": 0.00874, "grad_norm": 0.01725727692246437, "kl": 0.5020488984882832, "learning_rate": 9.999675006401496e-06, "loss": -0.0125, "step": 874, "step_time": 5.806873818000895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 1486.0, "completions/max_terminated_length": 1486.0, "completions/mean_length": 343.4375, "completions/mean_terminated_length": 343.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.301589548587799, "epoch": 0.00875, "frac_reward_zero_std": 0.0, "grad_norm": 0.047858066856861115, "kl": 0.4275720380246639, "learning_rate": 9.999674230308893e-06, "loss": -0.0054, "num_tokens": 19931587.0, "reward": 0.7896682024002075, "reward_std": 1.408063292503357, "rewards/rollout_reward_func/mean": 0.7896682024002075, "rewards/rollout_reward_func/std": 1.4614943265914917, "sampling/importance_sampling_ratio/max": 0.5574002861976624, "sampling/importance_sampling_ratio/mean": 0.17363665997982025, "sampling/importance_sampling_ratio/min": 4.002716195827816e-06, "sampling/sampling_logp_difference/max": 4.1122846603393555, "sampling/sampling_logp_difference/mean": 1.194441318511963, "step": 875, "step_time": 10.484725573001924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.318237543106079, "epoch": 0.00876, "grad_norm": 0.0427342988550663, "kl": 0.4118029698729515, "learning_rate": 9.999673453290772e-06, "loss": -0.0054, "step": 876, "step_time": 5.947483381005441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 166.5, "completions/mean_terminated_length": 166.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.269352197647095, "epoch": 0.00877, "frac_reward_zero_std": 0.5, "grad_norm": 0.008487203158438206, "kl": 0.6327805742621422, "learning_rate": 9.999672675347131e-06, "loss": -0.0077, "num_tokens": 19969885.0, "reward": 1.629843831062317, "reward_std": 1.08498215675354, "rewards/rollout_reward_func/mean": 1.629843831062317, "rewards/rollout_reward_func/std": 1.5375268459320068, "sampling/importance_sampling_ratio/max": 0.5607469081878662, "sampling/importance_sampling_ratio/mean": 0.34267646074295044, "sampling/importance_sampling_ratio/min": 1.7444855870962783e-07, "sampling/sampling_logp_difference/max": 2.5729002952575684, "sampling/sampling_logp_difference/mean": 0.7102192044258118, "step": 877, "step_time": 7.72126420499626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.274931102991104, "epoch": 0.00878, "grad_norm": 0.008198447525501251, "kl": 0.6338988281786442, "learning_rate": 9.999671896477973e-06, "loss": -0.0077, "step": 878, "step_time": 4.5148104279978725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 285.0625, "completions/mean_terminated_length": 285.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.688471615314484, "epoch": 0.00879, "frac_reward_zero_std": 0.25, "grad_norm": 0.07047116756439209, "kl": 0.5738187059760094, "learning_rate": 9.999671116683296e-06, "loss": -0.0068, "num_tokens": 20013367.0, "reward": 0.681964099407196, "reward_std": 0.8124816417694092, "rewards/rollout_reward_func/mean": 0.681964099407196, "rewards/rollout_reward_func/std": 1.3735384941101074, "sampling/importance_sampling_ratio/max": 0.5607972741127014, "sampling/importance_sampling_ratio/mean": 0.2434389740228653, "sampling/importance_sampling_ratio/min": 9.465249604545534e-06, "sampling/sampling_logp_difference/max": 3.9350569248199463, "sampling/sampling_logp_difference/mean": 1.1522154808044434, "step": 879, "step_time": 9.112565565003024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 5.748844414949417, "epoch": 0.0088, "grad_norm": 0.06525468081235886, "kl": 0.5669769160449505, "learning_rate": 9.9996703359631e-06, "loss": -0.007, "step": 880, "step_time": 4.858974066002702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1463.0, "completions/max_terminated_length": 1463.0, "completions/mean_length": 428.8125, "completions/mean_terminated_length": 428.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.140826314687729, "epoch": 0.00881, "frac_reward_zero_std": 0.25, "grad_norm": 0.08119796961545944, "kl": 0.43575059063732624, "learning_rate": 9.999669554317389e-06, "loss": -0.0056, "num_tokens": 20062820.0, "reward": 0.9715731143951416, "reward_std": 1.1263158321380615, "rewards/rollout_reward_func/mean": 0.9715731143951416, "rewards/rollout_reward_func/std": 1.4252091646194458, "sampling/importance_sampling_ratio/max": 0.560675859451294, "sampling/importance_sampling_ratio/mean": 0.1949610859155655, "sampling/importance_sampling_ratio/min": 1.124737547542054e-08, "sampling/sampling_logp_difference/max": 4.8820390701293945, "sampling/sampling_logp_difference/mean": 1.3175673484802246, "step": 881, "step_time": 9.999521947003814 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 6.161438047885895, "epoch": 0.00882, "grad_norm": 0.010495537891983986, "kl": 0.42955803498625755, "learning_rate": 9.999668771746158e-06, "loss": -0.0057, "step": 882, "step_time": 5.7121544430010545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1105.0, "completions/max_terminated_length": 1105.0, "completions/mean_length": 458.0, "completions/mean_terminated_length": 458.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.732534170150757, "epoch": 0.00883, "frac_reward_zero_std": 0.0, "grad_norm": 0.05484206974506378, "kl": 0.3319398248568177, "learning_rate": 9.99966798824941e-06, "loss": -0.0142, "num_tokens": 20114326.0, "reward": 0.7003356218338013, "reward_std": 1.450186848640442, "rewards/rollout_reward_func/mean": 0.7003356218338013, "rewards/rollout_reward_func/std": 1.494462490081787, "sampling/importance_sampling_ratio/max": 0.5694385766983032, "sampling/importance_sampling_ratio/mean": 0.11461608111858368, "sampling/importance_sampling_ratio/min": 4.708424967247993e-06, "sampling/sampling_logp_difference/max": 3.9248979091644287, "sampling/sampling_logp_difference/mean": 1.3176023960113525, "step": 883, "step_time": 9.518424282003252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 6.796636521816254, "epoch": 0.00884, "grad_norm": 0.04373004287481308, "kl": 0.3304069098085165, "learning_rate": 9.999667203827144e-06, "loss": -0.0144, "step": 884, "step_time": 5.602263409007719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 125.3125, "completions/mean_terminated_length": 128.8386993408203, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.1098368763923645, "epoch": 0.00885, "frac_reward_zero_std": 0.5, "grad_norm": 0.036446020007133484, "kl": 0.8117614462971687, "learning_rate": 9.999666418479359e-06, "loss": -0.0108, "num_tokens": 20147486.0, "reward": 1.5680052042007446, "reward_std": 0.69411301612854, "rewards/rollout_reward_func/mean": 1.5680052042007446, "rewards/rollout_reward_func/std": 1.1098153591156006, "sampling/importance_sampling_ratio/max": 0.5569218993186951, "sampling/importance_sampling_ratio/mean": 0.4168953597545624, "sampling/importance_sampling_ratio/min": 4.1322829602563615e-09, "sampling/sampling_logp_difference/max": 4.260263919830322, "sampling/sampling_logp_difference/mean": 0.8089938163757324, "step": 885, "step_time": 8.670325310999033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 4.140138119459152, "epoch": 0.00886, "grad_norm": 0.03878350928425789, "kl": 0.8101013749837875, "learning_rate": 9.999665632206059e-06, "loss": -0.0108, "step": 886, "step_time": 4.687577276996308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1392.0, "completions/max_terminated_length": 1392.0, "completions/mean_length": 270.59375, "completions/mean_terminated_length": 278.80645751953125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.481232166290283, "epoch": 0.00887, "frac_reward_zero_std": 0.25, "grad_norm": 0.011207429692149162, "kl": 0.4895089380443096, "learning_rate": 9.999664845007243e-06, "loss": -0.0039, "num_tokens": 20188669.0, "reward": 0.8796762824058533, "reward_std": 0.8131213784217834, "rewards/rollout_reward_func/mean": 0.8796762824058533, "rewards/rollout_reward_func/std": 1.226043701171875, "sampling/importance_sampling_ratio/max": 0.5583827495574951, "sampling/importance_sampling_ratio/mean": 0.27236032485961914, "sampling/importance_sampling_ratio/min": 4.3965024221041815e-13, "sampling/sampling_logp_difference/max": 4.75656270980835, "sampling/sampling_logp_difference/mean": 1.1672470569610596, "step": 887, "step_time": 9.78639423799541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 5.487031102180481, "epoch": 0.00888, "grad_norm": 0.010347486473619938, "kl": 0.4909365326166153, "learning_rate": 9.99966405688291e-06, "loss": -0.0039, "step": 888, "step_time": 5.669884737995744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1357.0, "completions/max_terminated_length": 1357.0, "completions/mean_length": 415.0625, "completions/mean_terminated_length": 415.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.004184663295746, "epoch": 0.00889, "frac_reward_zero_std": 0.25, "grad_norm": 0.06603889167308807, "kl": 0.391216516494751, "learning_rate": 9.99966326783306e-06, "loss": -0.0058, "num_tokens": 20236494.0, "reward": 0.7775735259056091, "reward_std": 0.8134607672691345, "rewards/rollout_reward_func/mean": 0.7775735259056091, "rewards/rollout_reward_func/std": 1.281998634338379, "sampling/importance_sampling_ratio/max": 0.5554279685020447, "sampling/importance_sampling_ratio/mean": 0.21501407027244568, "sampling/importance_sampling_ratio/min": 1.2194095733164545e-10, "sampling/sampling_logp_difference/max": 3.9480602741241455, "sampling/sampling_logp_difference/mean": 1.2795906066894531, "step": 889, "step_time": 10.33380617700459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.007346868515015, "epoch": 0.0089, "grad_norm": 0.06640011072158813, "kl": 0.3958117365837097, "learning_rate": 9.999662477857692e-06, "loss": -0.0059, "step": 890, "step_time": 6.619852381001692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1075.0, "completions/max_terminated_length": 1075.0, "completions/mean_length": 258.875, "completions/mean_terminated_length": 258.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.45432060956955, "epoch": 0.00891, "frac_reward_zero_std": 0.0, "grad_norm": 0.07040227204561234, "kl": 0.5861578322947025, "learning_rate": 9.99966168695681e-06, "loss": -0.0182, "num_tokens": 20279491.0, "reward": 0.32439684867858887, "reward_std": 1.1610280275344849, "rewards/rollout_reward_func/mean": 0.32439684867858887, "rewards/rollout_reward_func/std": 1.620774269104004, "sampling/importance_sampling_ratio/max": 0.5550425052642822, "sampling/importance_sampling_ratio/mean": 0.22571402788162231, "sampling/importance_sampling_ratio/min": 1.7179457856286717e-09, "sampling/sampling_logp_difference/max": 3.259596586227417, "sampling/sampling_logp_difference/mean": 1.0750048160552979, "step": 891, "step_time": 8.856096040999546 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 5.4647470116615295, "epoch": 0.00892, "grad_norm": 0.045618146657943726, "kl": 0.5866867788136005, "learning_rate": 9.999660895130413e-06, "loss": -0.0182, "step": 892, "step_time": 5.02120555700094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1088.0, "completions/max_terminated_length": 1088.0, "completions/mean_length": 169.65625, "completions/mean_terminated_length": 169.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.873561412096024, "epoch": 0.00893, "frac_reward_zero_std": 0.5, "grad_norm": 0.02226884290575981, "kl": 0.5732529424130917, "learning_rate": 9.9996601023785e-06, "loss": -0.005, "num_tokens": 20317971.0, "reward": 0.9996055364608765, "reward_std": 0.7448421716690063, "rewards/rollout_reward_func/mean": 0.9996055364608765, "rewards/rollout_reward_func/std": 1.3192939758300781, "sampling/importance_sampling_ratio/max": 0.5618467330932617, "sampling/importance_sampling_ratio/mean": 0.3493072986602783, "sampling/importance_sampling_ratio/min": 9.002195100471511e-10, "sampling/sampling_logp_difference/max": 3.197343587875366, "sampling/sampling_logp_difference/mean": 1.0804153680801392, "step": 893, "step_time": 8.649136111005646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.876284569501877, "epoch": 0.00894, "grad_norm": 0.022267045453190804, "kl": 0.5753142610192299, "learning_rate": 9.999659308701071e-06, "loss": -0.0049, "step": 894, "step_time": 5.005333766999684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 117.0, "completions/mean_terminated_length": 117.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.8575020730495453, "epoch": 0.00895, "frac_reward_zero_std": 0.5, "grad_norm": 0.0818496122956276, "kl": 0.7838678285479546, "learning_rate": 9.999658514098125e-06, "loss": -0.0101, "num_tokens": 20352701.0, "reward": 0.7939373850822449, "reward_std": 0.6101529598236084, "rewards/rollout_reward_func/mean": 0.7939373850822449, "rewards/rollout_reward_func/std": 1.6686006784439087, "sampling/importance_sampling_ratio/max": 0.5547815561294556, "sampling/importance_sampling_ratio/mean": 0.39956700801849365, "sampling/importance_sampling_ratio/min": 3.9696333026251196e-11, "sampling/sampling_logp_difference/max": 4.308183670043945, "sampling/sampling_logp_difference/mean": 0.685309648513794, "step": 895, "step_time": 7.748911177004629 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 3.7810073792934418, "epoch": 0.00896, "grad_norm": 0.04085128754377365, "kl": 0.7936591729521751, "learning_rate": 9.999657718569665e-06, "loss": -0.0103, "step": 896, "step_time": 4.9130038779949246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1401.0, "completions/max_terminated_length": 1401.0, "completions/mean_length": 393.53125, "completions/mean_terminated_length": 385.06451416015625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.245501697063446, "epoch": 0.00897, "frac_reward_zero_std": 0.0, "grad_norm": 0.14148084819316864, "kl": 0.5495753474533558, "learning_rate": 9.99965692211569e-06, "loss": -0.0164, "num_tokens": 20401465.0, "reward": 1.0293302536010742, "reward_std": 1.8889446258544922, "rewards/rollout_reward_func/mean": 1.0293302536010742, "rewards/rollout_reward_func/std": 1.8943392038345337, "sampling/importance_sampling_ratio/max": 0.6094484925270081, "sampling/importance_sampling_ratio/mean": 0.22069860994815826, "sampling/importance_sampling_ratio/min": 1.8163148647906551e-37, "sampling/sampling_logp_difference/max": 12.518390655517578, "sampling/sampling_logp_difference/mean": 1.1260055303573608, "step": 897, "step_time": 10.175758122994012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 5.220665842294693, "epoch": 0.00898, "grad_norm": 0.022981803864240646, "kl": 0.5547199249267578, "learning_rate": 9.999656124736203e-06, "loss": -0.017, "step": 898, "step_time": 5.686070384999766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1453.0, "completions/max_terminated_length": 1453.0, "completions/mean_length": 345.21875, "completions/mean_terminated_length": 345.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.861665785312653, "epoch": 0.00899, "frac_reward_zero_std": 0.0, "grad_norm": 0.05869443342089653, "kl": 0.31116514815948904, "learning_rate": 9.9996553264312e-06, "loss": -0.0178, "num_tokens": 20449436.0, "reward": 0.5081309676170349, "reward_std": 1.3115684986114502, "rewards/rollout_reward_func/mean": 0.5081309676170349, "rewards/rollout_reward_func/std": 1.6286370754241943, "sampling/importance_sampling_ratio/max": 0.5428517460823059, "sampling/importance_sampling_ratio/mean": 0.11011413484811783, "sampling/importance_sampling_ratio/min": 2.0477094949455932e-05, "sampling/sampling_logp_difference/max": 3.2532238960266113, "sampling/sampling_logp_difference/mean": 1.3197574615478516, "step": 899, "step_time": 10.046407915007876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 6.84659868478775, "epoch": 0.009, "grad_norm": 0.044321488589048386, "kl": 0.3074897206388414, "learning_rate": 9.999654527200682e-06, "loss": -0.0181, "step": 900, "step_time": 5.797895243002131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1177.0, "completions/max_terminated_length": 1177.0, "completions/mean_length": 334.15625, "completions/mean_terminated_length": 330.3870849609375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.951647102832794, "epoch": 0.00901, "frac_reward_zero_std": 0.25, "grad_norm": 0.009973004460334778, "kl": 0.5186472460627556, "learning_rate": 9.999653727044649e-06, "loss": -0.0032, "num_tokens": 20494865.0, "reward": 1.041465163230896, "reward_std": 1.0522278547286987, "rewards/rollout_reward_func/mean": 1.041465163230896, "rewards/rollout_reward_func/std": 1.3904699087142944, "sampling/importance_sampling_ratio/max": 0.5519910454750061, "sampling/importance_sampling_ratio/mean": 0.20524059236049652, "sampling/importance_sampling_ratio/min": 4.051707656935816e-17, "sampling/sampling_logp_difference/max": 4.1419782638549805, "sampling/sampling_logp_difference/mean": 1.2518198490142822, "step": 901, "step_time": 9.61083363200305 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013541667023673654, "entropy": 5.926978707313538, "epoch": 0.00902, "grad_norm": 0.013194001279771328, "kl": 0.5273267142474651, "learning_rate": 9.999652925963103e-06, "loss": -0.0032, "step": 902, "step_time": 5.532596760996967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 314.84375, "completions/mean_terminated_length": 314.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.292068362236023, "epoch": 0.00903, "frac_reward_zero_std": 0.25, "grad_norm": 0.014490664005279541, "kl": 0.5663878656923771, "learning_rate": 9.999652123956045e-06, "loss": -0.0105, "num_tokens": 20541112.0, "reward": 1.4132513999938965, "reward_std": 0.9849671721458435, "rewards/rollout_reward_func/mean": 1.4132513999938965, "rewards/rollout_reward_func/std": 1.2779535055160522, "sampling/importance_sampling_ratio/max": 0.5595520734786987, "sampling/importance_sampling_ratio/mean": 0.21724234521389008, "sampling/importance_sampling_ratio/min": 1.1762375962121041e-08, "sampling/sampling_logp_difference/max": 10.236433029174805, "sampling/sampling_logp_difference/mean": 1.097205400466919, "step": 903, "step_time": 8.863293688002159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 5.286942005157471, "epoch": 0.00904, "grad_norm": 0.014021433889865875, "kl": 0.5704684387892485, "learning_rate": 9.99965132102347e-06, "loss": -0.0105, "step": 904, "step_time": 4.931313710996619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.007068452658131719, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007068452658131719, "completions/clipped_ratio": 0.03125, "completions/max_length": 1363.0, "completions/max_terminated_length": 1363.0, "completions/mean_length": 402.90625, "completions/mean_terminated_length": 382.5483703613281, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.118847489356995, "epoch": 0.00905, "frac_reward_zero_std": 0.25, "grad_norm": 0.06167778745293617, "kl": 0.39563291892409325, "learning_rate": 9.999650517165385e-06, "loss": -0.0022, "num_tokens": 20587949.0, "reward": 0.6822129487991333, "reward_std": 0.7969176173210144, "rewards/rollout_reward_func/mean": 0.6822129487991333, "rewards/rollout_reward_func/std": 1.4003556966781616, "sampling/importance_sampling_ratio/max": 0.5576717853546143, "sampling/importance_sampling_ratio/mean": 0.19322314858436584, "sampling/importance_sampling_ratio/min": 7.059003381826301e-11, "sampling/sampling_logp_difference/max": 3.92435884475708, "sampling/sampling_logp_difference/mean": 1.1617404222488403, "step": 905, "step_time": 9.618445449996216 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007812500232830644, "entropy": 6.09725022315979, "epoch": 0.00906, "grad_norm": 0.04940762370824814, "kl": 0.4025121219456196, "learning_rate": 9.999649712381786e-06, "loss": -0.0023, "step": 906, "step_time": 5.399897060000512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 165.75, "completions/mean_terminated_length": 170.5806427001953, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.105226933956146, "epoch": 0.00907, "frac_reward_zero_std": 0.25, "grad_norm": 0.006375835742801428, "kl": 0.5696069840341806, "learning_rate": 9.999648906672674e-06, "loss": -0.0063, "num_tokens": 20625344.0, "reward": 1.1611266136169434, "reward_std": 1.0769461393356323, "rewards/rollout_reward_func/mean": 1.1611266136169434, "rewards/rollout_reward_func/std": 1.3931117057800293, "sampling/importance_sampling_ratio/max": 0.5517442226409912, "sampling/importance_sampling_ratio/mean": 0.2712441682815552, "sampling/importance_sampling_ratio/min": 3.571636042476456e-11, "sampling/sampling_logp_difference/max": 8.717081069946289, "sampling/sampling_logp_difference/mean": 1.3530046939849854, "step": 907, "step_time": 7.54224997100755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.095210909843445, "epoch": 0.00908, "grad_norm": 0.006110777612775564, "kl": 0.5724486298859119, "learning_rate": 9.999648100038048e-06, "loss": -0.0063, "step": 908, "step_time": 4.355550814001617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1747.0, "completions/max_terminated_length": 1747.0, "completions/mean_length": 400.21875, "completions/mean_terminated_length": 400.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.961645305156708, "epoch": 0.00909, "frac_reward_zero_std": 0.25, "grad_norm": 0.03900923207402229, "kl": 0.4735736846923828, "learning_rate": 9.999647292477912e-06, "loss": -0.0092, "num_tokens": 20672085.0, "reward": 1.1127777099609375, "reward_std": 1.264657735824585, "rewards/rollout_reward_func/mean": 1.1127777099609375, "rewards/rollout_reward_func/std": 1.5176262855529785, "sampling/importance_sampling_ratio/max": 0.5544652938842773, "sampling/importance_sampling_ratio/mean": 0.19541077315807343, "sampling/importance_sampling_ratio/min": 2.1913280211390256e-15, "sampling/sampling_logp_difference/max": 4.644211769104004, "sampling/sampling_logp_difference/mean": 1.1488418579101562, "step": 909, "step_time": 11.340245536997827 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 5.9425128698349, "epoch": 0.0091, "grad_norm": 0.031083332374691963, "kl": 0.48045632615685463, "learning_rate": 9.999646483992262e-06, "loss": -0.0093, "step": 910, "step_time": 6.422168948003673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010714286006987095, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010714286006987095, "completions/clipped_ratio": 0.0, "completions/max_length": 1196.0, "completions/max_terminated_length": 1196.0, "completions/mean_length": 388.15625, "completions/mean_terminated_length": 388.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.069531857967377, "epoch": 0.00911, "frac_reward_zero_std": 0.0, "grad_norm": 0.022868536412715912, "kl": 0.3483350230380893, "learning_rate": 9.9996456745811e-06, "loss": -0.0116, "num_tokens": 20720545.0, "reward": -0.21965889632701874, "reward_std": 1.2933692932128906, "rewards/rollout_reward_func/mean": -0.21965889632701874, "rewards/rollout_reward_func/std": 1.3205783367156982, "sampling/importance_sampling_ratio/max": 0.5591624975204468, "sampling/importance_sampling_ratio/mean": 0.11450715363025665, "sampling/importance_sampling_ratio/min": 5.455037192358264e-14, "sampling/sampling_logp_difference/max": 4.311592102050781, "sampling/sampling_logp_difference/mean": 1.3851815462112427, "step": 911, "step_time": 9.824591881002561 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012276785913854837, "entropy": 7.038115382194519, "epoch": 0.00912, "grad_norm": 0.021605433896183968, "kl": 0.3508697859942913, "learning_rate": 9.999644864244428e-06, "loss": -0.0116, "step": 912, "step_time": 5.219041943004413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 1207.0, "completions/max_terminated_length": 1207.0, "completions/mean_length": 549.78125, "completions/mean_terminated_length": 549.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.680750250816345, "epoch": 0.00913, "frac_reward_zero_std": 0.0, "grad_norm": 0.014675665646791458, "kl": 0.3934870380908251, "learning_rate": 9.999644052982243e-06, "loss": -0.0145, "num_tokens": 20775874.0, "reward": 1.1373502016067505, "reward_std": 1.7779262065887451, "rewards/rollout_reward_func/mean": 1.1373502016067505, "rewards/rollout_reward_func/std": 1.7858173847198486, "sampling/importance_sampling_ratio/max": 0.31129616498947144, "sampling/importance_sampling_ratio/mean": 0.07660922408103943, "sampling/importance_sampling_ratio/min": 4.038890610846835e-10, "sampling/sampling_logp_difference/max": 11.048521995544434, "sampling/sampling_logp_difference/mean": 1.4070844650268555, "step": 913, "step_time": 10.66969486799644 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 6.688018023967743, "epoch": 0.00914, "grad_norm": 0.014960413798689842, "kl": 0.3957317369058728, "learning_rate": 9.999643240794546e-06, "loss": -0.0145, "step": 914, "step_time": 5.47493928500262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 356.5, "completions/mean_terminated_length": 356.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.610504746437073, "epoch": 0.00915, "frac_reward_zero_std": 0.0, "grad_norm": 0.04443702474236488, "kl": 0.4611635096371174, "learning_rate": 9.999642427681338e-06, "loss": -0.0143, "num_tokens": 20823153.0, "reward": 0.9251590371131897, "reward_std": 1.2510714530944824, "rewards/rollout_reward_func/mean": 0.9251590371131897, "rewards/rollout_reward_func/std": 1.4995485544204712, "sampling/importance_sampling_ratio/max": 0.5602256655693054, "sampling/importance_sampling_ratio/mean": 0.1480305790901184, "sampling/importance_sampling_ratio/min": 6.264004070999363e-08, "sampling/sampling_logp_difference/max": 4.528769493103027, "sampling/sampling_logp_difference/mean": 1.2594709396362305, "step": 915, "step_time": 9.001377607000904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.5855565667152405, "epoch": 0.00916, "grad_norm": 0.03513192757964134, "kl": 0.4647272862493992, "learning_rate": 9.99964161364262e-06, "loss": -0.0145, "step": 916, "step_time": 4.942626565003593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 275.9375, "completions/mean_terminated_length": 268.8709716796875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.451284766197205, "epoch": 0.00917, "frac_reward_zero_std": 0.25, "grad_norm": 0.057768139988183975, "kl": 0.5536523573100567, "learning_rate": 9.999640798678389e-06, "loss": -0.0058, "num_tokens": 20867927.0, "reward": 0.7144629955291748, "reward_std": 1.1906793117523193, "rewards/rollout_reward_func/mean": 0.7144629955291748, "rewards/rollout_reward_func/std": 1.5902519226074219, "sampling/importance_sampling_ratio/max": 0.5547922849655151, "sampling/importance_sampling_ratio/mean": 0.21661685407161713, "sampling/importance_sampling_ratio/min": 3.7380540025357667e-23, "sampling/sampling_logp_difference/max": 12.758366584777832, "sampling/sampling_logp_difference/mean": 1.1023789644241333, "step": 917, "step_time": 8.808267438995244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.458306550979614, "epoch": 0.00918, "grad_norm": 0.055312179028987885, "kl": 0.5527545548975468, "learning_rate": 9.999639982788647e-06, "loss": -0.006, "step": 918, "step_time": 4.574056160992768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1111.0, "completions/max_terminated_length": 1111.0, "completions/mean_length": 167.71875, "completions/mean_terminated_length": 167.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.9583336412906647, "epoch": 0.00919, "frac_reward_zero_std": 0.75, "grad_norm": 0.023900585249066353, "kl": 0.7914503328502178, "learning_rate": 9.999639165973397e-06, "loss": -0.0015, "num_tokens": 20903420.0, "reward": 1.7853233814239502, "reward_std": 0.3701442778110504, "rewards/rollout_reward_func/mean": 1.7853233814239502, "rewards/rollout_reward_func/std": 0.798402726650238, "sampling/importance_sampling_ratio/max": 0.5609456300735474, "sampling/importance_sampling_ratio/mean": 0.4175187349319458, "sampling/importance_sampling_ratio/min": 1.6507923646713607e-05, "sampling/sampling_logp_difference/max": 4.794583320617676, "sampling/sampling_logp_difference/mean": 0.6702576279640198, "step": 919, "step_time": 9.492172187990946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.967419922351837, "epoch": 0.0092, "grad_norm": 0.023695431649684906, "kl": 0.7873925566673279, "learning_rate": 9.999638348232636e-06, "loss": -0.0016, "step": 920, "step_time": 5.115615929000342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 134.6875, "completions/mean_terminated_length": 134.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.328531354665756, "epoch": 0.00921, "frac_reward_zero_std": 0.25, "grad_norm": 0.010641760192811489, "kl": 0.8059714548289776, "learning_rate": 9.999637529566364e-06, "loss": -0.0133, "num_tokens": 20940548.0, "reward": 0.6442286372184753, "reward_std": 1.0729986429214478, "rewards/rollout_reward_func/mean": 0.6442286372184753, "rewards/rollout_reward_func/std": 1.6656098365783691, "sampling/importance_sampling_ratio/max": 0.5602139830589294, "sampling/importance_sampling_ratio/mean": 0.37862128019332886, "sampling/importance_sampling_ratio/min": 0.0002688573149498552, "sampling/sampling_logp_difference/max": 2.246891975402832, "sampling/sampling_logp_difference/mean": 0.7568774223327637, "step": 921, "step_time": 7.867847788002109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.32441645860672, "epoch": 0.00922, "grad_norm": 0.01037620473653078, "kl": 0.807420153170824, "learning_rate": 9.999636709974583e-06, "loss": -0.0133, "step": 922, "step_time": 4.397567268995772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1267.0, "completions/max_terminated_length": 1267.0, "completions/mean_length": 301.375, "completions/mean_terminated_length": 301.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.369119465351105, "epoch": 0.00923, "frac_reward_zero_std": 0.25, "grad_norm": 0.049634940922260284, "kl": 0.5802182573825121, "learning_rate": 9.999635889457293e-06, "loss": -0.0149, "num_tokens": 20984841.0, "reward": 1.21453857421875, "reward_std": 0.9098163843154907, "rewards/rollout_reward_func/mean": 1.21453857421875, "rewards/rollout_reward_func/std": 1.5214020013809204, "sampling/importance_sampling_ratio/max": 0.5509606599807739, "sampling/importance_sampling_ratio/mean": 0.24817398190498352, "sampling/importance_sampling_ratio/min": 3.858985613697241e-12, "sampling/sampling_logp_difference/max": 10.12543773651123, "sampling/sampling_logp_difference/mean": 1.108487844467163, "step": 923, "step_time": 10.077986531992792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.366585731506348, "epoch": 0.00924, "grad_norm": 0.0470467209815979, "kl": 0.5842974688857794, "learning_rate": 9.999635068014492e-06, "loss": -0.015, "step": 924, "step_time": 5.501624566997634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 46.59375, "completions/mean_terminated_length": 46.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.93759161233902, "epoch": 0.00925, "frac_reward_zero_std": 0.0, "grad_norm": 0.1193523108959198, "kl": 0.7734843045473099, "learning_rate": 9.999634245646181e-06, "loss": -0.0183, "num_tokens": 21019977.0, "reward": 0.2673649191856384, "reward_std": 1.1764713525772095, "rewards/rollout_reward_func/mean": 0.2673649191856384, "rewards/rollout_reward_func/std": 1.5996495485305786, "sampling/importance_sampling_ratio/max": 0.5574004054069519, "sampling/importance_sampling_ratio/mean": 0.32790303230285645, "sampling/importance_sampling_ratio/min": 2.397131027009891e-07, "sampling/sampling_logp_difference/max": 4.159917831420898, "sampling/sampling_logp_difference/mean": 0.9859074354171753, "step": 925, "step_time": 7.316086970997276 }, { "clip_ratio/high_max": 0.046875, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 4.855719268321991, "epoch": 0.00926, "grad_norm": 0.037844233214855194, "kl": 0.7849360555410385, "learning_rate": 9.999633422352361e-06, "loss": -0.0187, "step": 926, "step_time": 3.583606805997988 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 127.28125, "completions/mean_terminated_length": 127.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.49048176407814, "epoch": 0.00927, "frac_reward_zero_std": 0.25, "grad_norm": 0.04758641496300697, "kl": 0.7820236422121525, "learning_rate": 9.999632598133035e-06, "loss": -0.0006, "num_tokens": 21056371.0, "reward": 1.1928797960281372, "reward_std": 1.009759783744812, "rewards/rollout_reward_func/mean": 1.1928797960281372, "rewards/rollout_reward_func/std": 1.3143723011016846, "sampling/importance_sampling_ratio/max": 0.5599843859672546, "sampling/importance_sampling_ratio/mean": 0.3669530153274536, "sampling/importance_sampling_ratio/min": 7.376015673798975e-06, "sampling/sampling_logp_difference/max": 2.3251662254333496, "sampling/sampling_logp_difference/mean": 0.841073751449585, "step": 927, "step_time": 7.1588540589982586 }, { "clip_ratio/high_max": 0.09375, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 4.420533657073975, "epoch": 0.00928, "grad_norm": 0.042358893901109695, "kl": 0.7973572723567486, "learning_rate": 9.999631772988198e-06, "loss": -0.0006, "step": 928, "step_time": 4.101976059002482 }, { "clip_ratio/high_max": 0.004807692486792803, "clip_ratio/high_mean": 0.0024038462433964014, "clip_ratio/low_mean": 0.026041666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028445513220503926, "completions/clipped_ratio": 0.0, "completions/max_length": 1104.0, "completions/max_terminated_length": 1104.0, "completions/mean_length": 325.71875, "completions/mean_terminated_length": 325.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.254951894283295, "epoch": 0.00929, "frac_reward_zero_std": 0.25, "grad_norm": 0.07972729206085205, "kl": 0.5905771479010582, "learning_rate": 9.999630946917853e-06, "loss": -0.0084, "num_tokens": 21101710.0, "reward": 1.4161663055419922, "reward_std": 1.1067620515823364, "rewards/rollout_reward_func/mean": 1.4161663055419922, "rewards/rollout_reward_func/std": 1.5203207731246948, "sampling/importance_sampling_ratio/max": 0.5598239302635193, "sampling/importance_sampling_ratio/mean": 0.2187991440296173, "sampling/importance_sampling_ratio/min": 4.257086478342035e-16, "sampling/sampling_logp_difference/max": 12.607219696044922, "sampling/sampling_logp_difference/mean": 0.996957540512085, "step": 929, "step_time": 9.268809591001627 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016666667070239782, "entropy": 5.233640044927597, "epoch": 0.0093, "grad_norm": 0.09065257012844086, "kl": 0.5821907967329025, "learning_rate": 9.999630119922e-06, "loss": -0.0085, "step": 930, "step_time": 5.399649595001392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 205.65625, "completions/mean_terminated_length": 205.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.044974207878113, "epoch": 0.00931, "frac_reward_zero_std": 0.25, "grad_norm": 0.02931939624249935, "kl": 0.5635243449360132, "learning_rate": 9.99962929200064e-06, "loss": -0.0136, "num_tokens": 21141067.0, "reward": 1.1648342609405518, "reward_std": 0.9710017442703247, "rewards/rollout_reward_func/mean": 1.1648342609405518, "rewards/rollout_reward_func/std": 1.3079382181167603, "sampling/importance_sampling_ratio/max": 0.5603911876678467, "sampling/importance_sampling_ratio/mean": 0.31566131114959717, "sampling/importance_sampling_ratio/min": 6.834455689386232e-06, "sampling/sampling_logp_difference/max": 4.796818256378174, "sampling/sampling_logp_difference/mean": 1.0137853622436523, "step": 931, "step_time": 7.574306596994575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.027157739736139774, "clip_ratio/low_min": 0.015625, "clip_ratio/region_mean": 0.027157739736139774, "entropy": 5.071344405412674, "epoch": 0.00932, "grad_norm": 0.02638549916446209, "kl": 0.5715831555426121, "learning_rate": 9.99962846315377e-06, "loss": -0.0137, "step": 932, "step_time": 4.010463770999195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1522.0, "completions/max_terminated_length": 1522.0, "completions/mean_length": 345.5, "completions/mean_terminated_length": 345.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.6158692836761475, "epoch": 0.00933, "frac_reward_zero_std": 0.25, "grad_norm": 0.005347851663827896, "kl": 0.48141033574938774, "learning_rate": 9.999627633381394e-06, "loss": -0.0028, "num_tokens": 21186612.0, "reward": 1.1826319694519043, "reward_std": 1.4459881782531738, "rewards/rollout_reward_func/mean": 1.1826319694519043, "rewards/rollout_reward_func/std": 1.6686556339263916, "sampling/importance_sampling_ratio/max": 0.5568241477012634, "sampling/importance_sampling_ratio/mean": 0.16934923827648163, "sampling/importance_sampling_ratio/min": 4.4673518356797004e-11, "sampling/sampling_logp_difference/max": 8.367538452148438, "sampling/sampling_logp_difference/mean": 1.3166308403015137, "step": 933, "step_time": 10.499839360003534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.643912374973297, "epoch": 0.00934, "grad_norm": 0.005524483975023031, "kl": 0.4798288643360138, "learning_rate": 9.99962680268351e-06, "loss": -0.0028, "step": 934, "step_time": 5.989771291995567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1180.0, "completions/max_terminated_length": 1180.0, "completions/mean_length": 224.78125, "completions/mean_terminated_length": 224.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.811203747987747, "epoch": 0.00935, "frac_reward_zero_std": 0.5, "grad_norm": 0.010578420013189316, "kl": 0.6545285396277905, "learning_rate": 9.99962597106012e-06, "loss": -0.0098, "num_tokens": 21227622.0, "reward": 1.2027645111083984, "reward_std": 0.9064282178878784, "rewards/rollout_reward_func/mean": 1.2027645111083984, "rewards/rollout_reward_func/std": 1.548215627670288, "sampling/importance_sampling_ratio/max": 0.5575588345527649, "sampling/importance_sampling_ratio/mean": 0.3291592597961426, "sampling/importance_sampling_ratio/min": 0.00038730946835130453, "sampling/sampling_logp_difference/max": 2.6935338973999023, "sampling/sampling_logp_difference/mean": 0.8184448480606079, "step": 935, "step_time": 9.031108870000025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.813754349946976, "epoch": 0.00936, "grad_norm": 0.01002649124711752, "kl": 0.6582659631967545, "learning_rate": 9.99962513851122e-06, "loss": -0.0098, "step": 936, "step_time": 5.482754829998157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 6.46875, "completions/mean_terminated_length": 6.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.1716480553150177, "epoch": 0.00937, "frac_reward_zero_std": 0.5, "grad_norm": 0.10125081241130829, "kl": 0.8946774005889893, "learning_rate": 9.999624305036816e-06, "loss": -0.0022, "num_tokens": 21258707.0, "reward": 1.1774910688400269, "reward_std": 0.16949531435966492, "rewards/rollout_reward_func/mean": 1.1774910688400269, "rewards/rollout_reward_func/std": 1.1428335905075073, "sampling/importance_sampling_ratio/max": 0.6178281903266907, "sampling/importance_sampling_ratio/mean": 0.48497363924980164, "sampling/importance_sampling_ratio/min": 0.006037015002220869, "sampling/sampling_logp_difference/max": 3.816218137741089, "sampling/sampling_logp_difference/mean": 0.4481254518032074, "step": 937, "step_time": 6.338828191001085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.026041666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026041666977107525, "entropy": 3.232108384370804, "epoch": 0.00938, "grad_norm": 0.07588788121938705, "kl": 0.8879632577300072, "learning_rate": 9.999623470636904e-06, "loss": -0.0024, "step": 938, "step_time": 3.3471250680013327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1108.0, "completions/max_terminated_length": 1108.0, "completions/mean_length": 561.90625, "completions/mean_terminated_length": 598.300048828125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.575101375579834, "epoch": 0.00939, "frac_reward_zero_std": 0.0, "grad_norm": 0.020603138953447342, "kl": 0.38851369824260473, "learning_rate": 9.999622635311485e-06, "loss": -0.0047, "num_tokens": 21313932.0, "reward": 0.46225112676620483, "reward_std": 1.4116184711456299, "rewards/rollout_reward_func/mean": 0.46225112676620483, "rewards/rollout_reward_func/std": 1.4244898557662964, "sampling/importance_sampling_ratio/max": 0.5541741847991943, "sampling/importance_sampling_ratio/mean": 0.10468641668558121, "sampling/importance_sampling_ratio/min": 7.193854815639381e-25, "sampling/sampling_logp_difference/max": 13.091302871704102, "sampling/sampling_logp_difference/mean": 1.4761018753051758, "step": 939, "step_time": 9.618382523000037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.591233015060425, "epoch": 0.0094, "grad_norm": 0.018519967794418335, "kl": 0.37600008957087994, "learning_rate": 9.999621799060561e-06, "loss": -0.0048, "step": 940, "step_time": 5.222232445001282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 331.21875, "completions/mean_terminated_length": 331.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.57637295126915, "epoch": 0.00941, "frac_reward_zero_std": 0.25, "grad_norm": 0.015975629910826683, "kl": 0.5317034218460321, "learning_rate": 9.999620961884131e-06, "loss": -0.0132, "num_tokens": 21357825.0, "reward": 0.871274471282959, "reward_std": 1.336068034172058, "rewards/rollout_reward_func/mean": 0.871274471282959, "rewards/rollout_reward_func/std": 1.629074215888977, "sampling/importance_sampling_ratio/max": 0.5546555519104004, "sampling/importance_sampling_ratio/mean": 0.20861996710300446, "sampling/importance_sampling_ratio/min": 1.892883202611606e-12, "sampling/sampling_logp_difference/max": 13.056625366210938, "sampling/sampling_logp_difference/mean": 1.2357354164123535, "step": 941, "step_time": 8.74489321800138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 5.586115777492523, "epoch": 0.00942, "grad_norm": 0.01639576442539692, "kl": 0.5267787128686905, "learning_rate": 9.999620123782196e-06, "loss": -0.0132, "step": 942, "step_time": 5.62123357100063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 219.875, "completions/mean_terminated_length": 213.5483856201172, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.980634868144989, "epoch": 0.00943, "frac_reward_zero_std": 0.25, "grad_norm": 0.07834545522928238, "kl": 0.49940263479948044, "learning_rate": 9.999619284754754e-06, "loss": -0.0091, "num_tokens": 21398678.0, "reward": 0.9981669187545776, "reward_std": 1.103666067123413, "rewards/rollout_reward_func/mean": 0.9981669187545776, "rewards/rollout_reward_func/std": 1.4726636409759521, "sampling/importance_sampling_ratio/max": 0.5621940493583679, "sampling/importance_sampling_ratio/mean": 0.3152540624141693, "sampling/importance_sampling_ratio/min": 7.541263078413019e-15, "sampling/sampling_logp_difference/max": 4.484166145324707, "sampling/sampling_logp_difference/mean": 1.0209465026855469, "step": 943, "step_time": 8.866593298003863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.978381723165512, "epoch": 0.00944, "grad_norm": 0.07551766186952591, "kl": 0.4961356520652771, "learning_rate": 9.999618444801806e-06, "loss": -0.0092, "step": 944, "step_time": 4.888993103002576 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0625, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 303.25, "completions/mean_terminated_length": 308.3666687011719, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.3433409333229065, "epoch": 0.00945, "frac_reward_zero_std": 0.0, "grad_norm": 0.05176218971610069, "kl": 0.5398171320557594, "learning_rate": 9.999617603923354e-06, "loss": -0.0133, "num_tokens": 21442284.0, "reward": 0.8093545436859131, "reward_std": 1.226410984992981, "rewards/rollout_reward_func/mean": 0.8093545436859131, "rewards/rollout_reward_func/std": 1.4784810543060303, "sampling/importance_sampling_ratio/max": 0.5537766814231873, "sampling/importance_sampling_ratio/mean": 0.17249846458435059, "sampling/importance_sampling_ratio/min": 5.52121350483499e-14, "sampling/sampling_logp_difference/max": 10.165529251098633, "sampling/sampling_logp_difference/mean": 1.370570182800293, "step": 945, "step_time": 8.682579693999287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.340054988861084, "epoch": 0.00946, "grad_norm": 0.04462990164756775, "kl": 0.5361695401370525, "learning_rate": 9.999616762119397e-06, "loss": -0.0134, "step": 946, "step_time": 4.771195454002736 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013541667023673654, "completions/clipped_ratio": 0.03125, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 228.875, "completions/mean_terminated_length": 235.74192810058594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.831173986196518, "epoch": 0.00947, "frac_reward_zero_std": 0.0, "grad_norm": 0.00898384302854538, "kl": 0.6082349568605423, "learning_rate": 9.999615919389935e-06, "loss": -0.0163, "num_tokens": 21484595.0, "reward": 0.40872055292129517, "reward_std": 0.8541519045829773, "rewards/rollout_reward_func/mean": 0.40872055292129517, "rewards/rollout_reward_func/std": 1.235876202583313, "sampling/importance_sampling_ratio/max": 0.5590769648551941, "sampling/importance_sampling_ratio/mean": 0.24651160836219788, "sampling/importance_sampling_ratio/min": 5.03118990913265e-10, "sampling/sampling_logp_difference/max": 4.025032043457031, "sampling/sampling_logp_difference/mean": 1.233036994934082, "step": 947, "step_time": 7.7895603719953215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.8307061195373535, "epoch": 0.00948, "grad_norm": 0.009518974460661411, "kl": 0.6124993190169334, "learning_rate": 9.999615075734968e-06, "loss": -0.0164, "step": 948, "step_time": 4.530425006996666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1094.0, "completions/max_terminated_length": 1094.0, "completions/mean_length": 600.09375, "completions/mean_terminated_length": 600.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.500522077083588, "epoch": 0.00949, "frac_reward_zero_std": 0.0, "grad_norm": 0.02607744373381138, "kl": 0.4281851015985012, "learning_rate": 9.999614231154497e-06, "loss": -0.0143, "num_tokens": 21540876.0, "reward": 1.6733695268630981, "reward_std": 1.5339299440383911, "rewards/rollout_reward_func/mean": 1.6733695268630981, "rewards/rollout_reward_func/std": 1.590793490409851, "sampling/importance_sampling_ratio/max": 0.30990585684776306, "sampling/importance_sampling_ratio/mean": 0.11694866418838501, "sampling/importance_sampling_ratio/min": 3.369266035941243e-15, "sampling/sampling_logp_difference/max": 4.8411030769348145, "sampling/sampling_logp_difference/mean": 1.0583808422088623, "step": 949, "step_time": 9.829621074000897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.49697470664978, "epoch": 0.0095, "grad_norm": 0.02594032883644104, "kl": 0.42584751918911934, "learning_rate": 9.999613385648523e-06, "loss": -0.0143, "step": 950, "step_time": 5.20425062299546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1607.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 447.875, "completions/mean_terminated_length": 447.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.448490589857101, "epoch": 0.00951, "frac_reward_zero_std": 0.25, "grad_norm": 0.008531956933438778, "kl": 0.5931455660611391, "learning_rate": 9.999612539217044e-06, "loss": -0.0085, "num_tokens": 21589833.0, "reward": 0.9003344774246216, "reward_std": 1.069777488708496, "rewards/rollout_reward_func/mean": 0.9003344774246216, "rewards/rollout_reward_func/std": 1.4654382467269897, "sampling/importance_sampling_ratio/max": 0.5572385787963867, "sampling/importance_sampling_ratio/mean": 0.22662463784217834, "sampling/importance_sampling_ratio/min": 8.153606927407964e-07, "sampling/sampling_logp_difference/max": 4.204616546630859, "sampling/sampling_logp_difference/mean": 1.115882396697998, "step": 951, "step_time": 10.680556256989803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.442933827638626, "epoch": 0.00952, "grad_norm": 0.008374234661459923, "kl": 0.5926556903868914, "learning_rate": 9.999611691860062e-06, "loss": -0.0085, "step": 952, "step_time": 6.040278332999151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1367.0, "completions/max_terminated_length": 1367.0, "completions/mean_length": 300.0625, "completions/mean_terminated_length": 291.7419128417969, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.14716511964798, "epoch": 0.00953, "frac_reward_zero_std": 0.25, "grad_norm": 0.027094287797808647, "kl": 0.5346540957689285, "learning_rate": 9.999610843577577e-06, "loss": -0.0056, "num_tokens": 21634949.0, "reward": 0.4207512438297272, "reward_std": 0.9392671585083008, "rewards/rollout_reward_func/mean": 0.4207512438297272, "rewards/rollout_reward_func/std": 1.3992213010787964, "sampling/importance_sampling_ratio/max": 0.5635802745819092, "sampling/importance_sampling_ratio/mean": 0.21620836853981018, "sampling/importance_sampling_ratio/min": 5.242624306814258e-30, "sampling/sampling_logp_difference/max": 4.641932964324951, "sampling/sampling_logp_difference/mean": 1.3768116235733032, "step": 953, "step_time": 10.572734072007734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.149443686008453, "epoch": 0.00954, "grad_norm": 0.027569284662604332, "kl": 0.5359580740332603, "learning_rate": 9.999609994369586e-06, "loss": -0.0056, "step": 954, "step_time": 5.941840860996308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "completions/clipped_ratio": 0.0, "completions/max_length": 1171.0, "completions/max_terminated_length": 1171.0, "completions/mean_length": 261.03125, "completions/mean_terminated_length": 261.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.0572357177734375, "epoch": 0.00955, "frac_reward_zero_std": 0.5, "grad_norm": 0.005242802668362856, "kl": 0.5246659200638533, "learning_rate": 9.999609144236094e-06, "loss": -0.0012, "num_tokens": 21676138.0, "reward": 1.3423571586608887, "reward_std": 0.6438193321228027, "rewards/rollout_reward_func/mean": 1.3423571586608887, "rewards/rollout_reward_func/std": 1.0771198272705078, "sampling/importance_sampling_ratio/max": 0.5628893375396729, "sampling/importance_sampling_ratio/mean": 0.3134433925151825, "sampling/importance_sampling_ratio/min": 5.363090593846209e-08, "sampling/sampling_logp_difference/max": 3.6384668350219727, "sampling/sampling_logp_difference/mean": 0.8694776296615601, "step": 955, "step_time": 9.701410565001424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.054555714130402, "epoch": 0.00956, "grad_norm": 0.00496673071756959, "kl": 0.5247254967689514, "learning_rate": 9.999608293177099e-06, "loss": -0.0012, "step": 956, "step_time": 5.277521560998139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 128.875, "completions/mean_terminated_length": 128.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.613925486803055, "epoch": 0.00957, "frac_reward_zero_std": 0.25, "grad_norm": 0.02710377611219883, "kl": 0.6583181917667389, "learning_rate": 9.9996074411926e-06, "loss": -0.0121, "num_tokens": 21712826.0, "reward": 0.6939052939414978, "reward_std": 1.1159330606460571, "rewards/rollout_reward_func/mean": 0.6939052939414978, "rewards/rollout_reward_func/std": 1.5104163885116577, "sampling/importance_sampling_ratio/max": 0.5590810179710388, "sampling/importance_sampling_ratio/mean": 0.3640831708908081, "sampling/importance_sampling_ratio/min": 2.13548023708654e-08, "sampling/sampling_logp_difference/max": 4.539626121520996, "sampling/sampling_logp_difference/mean": 0.942251443862915, "step": 957, "step_time": 6.898060572006216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.611260861158371, "epoch": 0.00958, "grad_norm": 0.02547261118888855, "kl": 0.6582135520875454, "learning_rate": 9.9996065882826e-06, "loss": -0.0122, "step": 958, "step_time": 3.7807275730010588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 362.34375, "completions/mean_terminated_length": 361.774169921875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.145421862602234, "epoch": 0.00959, "frac_reward_zero_std": 0.0, "grad_norm": 0.009425093419849873, "kl": 0.22762426361441612, "learning_rate": 9.999605734447097e-06, "loss": -0.0101, "num_tokens": 21761113.0, "reward": 0.7199934720993042, "reward_std": 1.702007532119751, "rewards/rollout_reward_func/mean": 0.7199934720993042, "rewards/rollout_reward_func/std": 1.6685402393341064, "sampling/importance_sampling_ratio/max": 0.5467791557312012, "sampling/importance_sampling_ratio/mean": 0.10935481637716293, "sampling/importance_sampling_ratio/min": 4.728548854901821e-17, "sampling/sampling_logp_difference/max": 4.4474639892578125, "sampling/sampling_logp_difference/mean": 1.508182406425476, "step": 959, "step_time": 9.522220130002097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.153246343135834, "epoch": 0.0096, "grad_norm": 0.00889738742262125, "kl": 0.22672202344983816, "learning_rate": 9.999604879686092e-06, "loss": -0.0101, "step": 960, "step_time": 4.817712773994572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1063.0, "completions/max_terminated_length": 1063.0, "completions/mean_length": 353.4375, "completions/mean_terminated_length": 353.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.306512773036957, "epoch": 0.00961, "frac_reward_zero_std": 0.0, "grad_norm": 0.10617559403181076, "kl": 0.31940358970314264, "learning_rate": 9.999604023999587e-06, "loss": -0.0109, "num_tokens": 21809033.0, "reward": 0.5956326723098755, "reward_std": 1.5438072681427002, "rewards/rollout_reward_func/mean": 0.5956326723098755, "rewards/rollout_reward_func/std": 1.4871166944503784, "sampling/importance_sampling_ratio/max": 0.6490720510482788, "sampling/importance_sampling_ratio/mean": 0.1563282161951065, "sampling/importance_sampling_ratio/min": 1.12531051854603e-06, "sampling/sampling_logp_difference/max": 3.755775213241577, "sampling/sampling_logp_difference/mean": 1.248077154159546, "step": 961, "step_time": 9.241033850001259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.018229166977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018229166977107525, "entropy": 6.3127128183841705, "epoch": 0.00962, "grad_norm": 0.03214575722813606, "kl": 0.3107733875513077, "learning_rate": 9.999603167387578e-06, "loss": -0.0113, "step": 962, "step_time": 5.083864444997744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 409.3125, "completions/mean_terminated_length": 409.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.456809461116791, "epoch": 0.00963, "frac_reward_zero_std": 0.0, "grad_norm": 0.006596829742193222, "kl": 0.18077724101021886, "learning_rate": 9.999602309850068e-06, "loss": -0.0108, "num_tokens": 21858399.0, "reward": 0.6050747632980347, "reward_std": 1.5693659782409668, "rewards/rollout_reward_func/mean": 0.6050747632980347, "rewards/rollout_reward_func/std": 1.6390836238861084, "sampling/importance_sampling_ratio/max": 0.31076982617378235, "sampling/importance_sampling_ratio/mean": 0.04914311319589615, "sampling/importance_sampling_ratio/min": 6.082511230798282e-12, "sampling/sampling_logp_difference/max": 4.153629302978516, "sampling/sampling_logp_difference/mean": 1.5672366619110107, "step": 963, "step_time": 8.728387311002734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 7.46576988697052, "epoch": 0.00964, "grad_norm": 0.006185516249388456, "kl": 0.1830998044461012, "learning_rate": 9.999601451387057e-06, "loss": -0.0107, "step": 964, "step_time": 4.736202117001085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 163.90625, "completions/mean_terminated_length": 163.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.247609496116638, "epoch": 0.00965, "frac_reward_zero_std": 0.0, "grad_norm": 0.017285071313381195, "kl": 0.3874510284513235, "learning_rate": 9.999600591998547e-06, "loss": 0.0011, "num_tokens": 21899440.0, "reward": 0.30147600173950195, "reward_std": 0.9845295548439026, "rewards/rollout_reward_func/mean": 0.30147600173950195, "rewards/rollout_reward_func/std": 1.2634124755859375, "sampling/importance_sampling_ratio/max": 0.5563449263572693, "sampling/importance_sampling_ratio/mean": 0.1965678483247757, "sampling/importance_sampling_ratio/min": 0.0001402842899551615, "sampling/sampling_logp_difference/max": 2.8589284420013428, "sampling/sampling_logp_difference/mean": 1.2588117122650146, "step": 965, "step_time": 7.783056238993595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 6.26297789812088, "epoch": 0.00966, "grad_norm": 0.017116395756602287, "kl": 0.386486504226923, "learning_rate": 9.999599731684533e-06, "loss": 0.0011, "step": 966, "step_time": 3.838175107990537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 802.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 197.71875, "completions/mean_terminated_length": 197.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.7774248123168945, "epoch": 0.00967, "frac_reward_zero_std": 0.25, "grad_norm": 0.058332301676273346, "kl": 0.46626945747993886, "learning_rate": 9.99959887044502e-06, "loss": -0.0038, "num_tokens": 21939317.0, "reward": 1.0465337038040161, "reward_std": 1.163147211074829, "rewards/rollout_reward_func/mean": 1.0465337038040161, "rewards/rollout_reward_func/std": 1.3961635828018188, "sampling/importance_sampling_ratio/max": 0.5584744811058044, "sampling/importance_sampling_ratio/mean": 0.2835220694541931, "sampling/importance_sampling_ratio/min": 1.3135182598489337e-05, "sampling/sampling_logp_difference/max": 2.781221866607666, "sampling/sampling_logp_difference/mean": 1.1527390480041504, "step": 967, "step_time": 7.904242816999613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.784523040056229, "epoch": 0.00968, "grad_norm": 0.05574793368577957, "kl": 0.4623329231981188, "learning_rate": 9.999598008280007e-06, "loss": -0.0039, "step": 968, "step_time": 4.389724557997397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 158.1875, "completions/mean_terminated_length": 158.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.096733659505844, "epoch": 0.00969, "frac_reward_zero_std": 0.5, "grad_norm": 0.003977145534008741, "kl": 0.6408337894827127, "learning_rate": 9.999597145189494e-06, "loss": -0.0076, "num_tokens": 21977377.0, "reward": 0.5082350373268127, "reward_std": 0.6107528805732727, "rewards/rollout_reward_func/mean": 0.5082350373268127, "rewards/rollout_reward_func/std": 1.4729024171829224, "sampling/importance_sampling_ratio/max": 0.561492383480072, "sampling/importance_sampling_ratio/mean": 0.31171879172325134, "sampling/importance_sampling_ratio/min": 1.4057661978927882e-10, "sampling/sampling_logp_difference/max": 4.528969764709473, "sampling/sampling_logp_difference/mean": 0.965626060962677, "step": 969, "step_time": 8.10562687499987 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 5.092347741127014, "epoch": 0.0097, "grad_norm": 0.003990249242633581, "kl": 0.6390674281865358, "learning_rate": 9.999596281173482e-06, "loss": -0.0076, "step": 970, "step_time": 4.9424734029962565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 168.21875, "completions/mean_terminated_length": 168.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.488227277994156, "epoch": 0.00971, "frac_reward_zero_std": 0.25, "grad_norm": 0.005761371459811926, "kl": 0.37088777124881744, "learning_rate": 9.999595416231968e-06, "loss": -0.0149, "num_tokens": 22016605.0, "reward": 0.3916635513305664, "reward_std": 0.970717191696167, "rewards/rollout_reward_func/mean": 0.3916635513305664, "rewards/rollout_reward_func/std": 1.49248206615448, "sampling/importance_sampling_ratio/max": 0.5565378069877625, "sampling/importance_sampling_ratio/mean": 0.23643743991851807, "sampling/importance_sampling_ratio/min": 9.987915876308318e-12, "sampling/sampling_logp_difference/max": 3.8502020835876465, "sampling/sampling_logp_difference/mean": 1.4209794998168945, "step": 971, "step_time": 8.592405575003795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.480239182710648, "epoch": 0.00972, "grad_norm": 0.005766245070844889, "kl": 0.3692659754306078, "learning_rate": 9.999594550364955e-06, "loss": -0.015, "step": 972, "step_time": 4.712593312997342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1514.0, "completions/max_terminated_length": 1514.0, "completions/mean_length": 309.34375, "completions/mean_terminated_length": 309.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.832149416208267, "epoch": 0.00973, "frac_reward_zero_std": 0.5, "grad_norm": 0.003315829439088702, "kl": 0.5822305269539356, "learning_rate": 9.999593683572444e-06, "loss": -0.007, "num_tokens": 22060681.0, "reward": 1.1911594867706299, "reward_std": 0.6929875016212463, "rewards/rollout_reward_func/mean": 1.1911594867706299, "rewards/rollout_reward_func/std": 1.4221570491790771, "sampling/importance_sampling_ratio/max": 0.5596482157707214, "sampling/importance_sampling_ratio/mean": 0.2998882532119751, "sampling/importance_sampling_ratio/min": 5.652331935834809e-09, "sampling/sampling_logp_difference/max": 11.261188507080078, "sampling/sampling_logp_difference/mean": 0.8611626625061035, "step": 973, "step_time": 10.791846316995361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.838213622570038, "epoch": 0.00974, "grad_norm": 0.0034156041219830513, "kl": 0.5824278742074966, "learning_rate": 9.999592815854433e-06, "loss": -0.007, "step": 974, "step_time": 6.031850337003561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1126.0, "completions/max_terminated_length": 1126.0, "completions/mean_length": 335.84375, "completions/mean_terminated_length": 335.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.0475398898124695, "epoch": 0.00975, "frac_reward_zero_std": 0.25, "grad_norm": 0.03279462084174156, "kl": 0.3521531177684665, "learning_rate": 9.999591947210923e-06, "loss": -0.0075, "num_tokens": 22105879.0, "reward": 0.5203475952148438, "reward_std": 0.909401535987854, "rewards/rollout_reward_func/mean": 0.5203475952148438, "rewards/rollout_reward_func/std": 1.4292141199111938, "sampling/importance_sampling_ratio/max": 0.5569573044776917, "sampling/importance_sampling_ratio/mean": 0.24383419752120972, "sampling/importance_sampling_ratio/min": 1.1789608178602862e-10, "sampling/sampling_logp_difference/max": 4.1074323654174805, "sampling/sampling_logp_difference/mean": 1.358489990234375, "step": 975, "step_time": 9.191130263003288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.0445283353328705, "epoch": 0.00976, "grad_norm": 0.03562062233686447, "kl": 0.34834158327430487, "learning_rate": 9.999591077641915e-06, "loss": -0.0075, "step": 976, "step_time": 5.628734463000001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 939.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 340.1875, "completions/mean_terminated_length": 337.73333740234375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.681194067001343, "epoch": 0.00977, "frac_reward_zero_std": 0.0, "grad_norm": 0.20941796898841858, "kl": 0.4305471805855632, "learning_rate": 9.999590207147407e-06, "loss": -0.0269, "num_tokens": 22153213.0, "reward": 1.0680129528045654, "reward_std": 1.4507672786712646, "rewards/rollout_reward_func/mean": 1.0680129528045654, "rewards/rollout_reward_func/std": 1.5489193201065063, "sampling/importance_sampling_ratio/max": 0.3309192657470703, "sampling/importance_sampling_ratio/mean": 0.16340166330337524, "sampling/importance_sampling_ratio/min": 3.3764830931204415e-08, "sampling/sampling_logp_difference/max": 3.514888048171997, "sampling/sampling_logp_difference/mean": 1.0025385618209839, "step": 977, "step_time": 9.047547624006256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.12500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.12500000186264515, "entropy": 5.979250729084015, "epoch": 0.00978, "grad_norm": 0.017618266865611076, "kl": 0.37006446067243814, "learning_rate": 9.999589335727404e-06, "loss": -0.0275, "step": 978, "step_time": 4.7673759140052425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 19.90625, "completions/mean_terminated_length": 20.032257080078125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.8609707355499268, "epoch": 0.00979, "frac_reward_zero_std": 0.5, "grad_norm": 0.026541635394096375, "kl": 0.6990948915481567, "learning_rate": 9.9995884633819e-06, "loss": -0.0092, "num_tokens": 22185294.0, "reward": 1.2355419397354126, "reward_std": 0.6735270023345947, "rewards/rollout_reward_func/mean": 1.2355419397354126, "rewards/rollout_reward_func/std": 1.4009841680526733, "sampling/importance_sampling_ratio/max": 0.561510443687439, "sampling/importance_sampling_ratio/mean": 0.43365156650543213, "sampling/importance_sampling_ratio/min": 6.795839970453207e-20, "sampling/sampling_logp_difference/max": 7.1659159660339355, "sampling/sampling_logp_difference/mean": 0.8971003890037537, "step": 979, "step_time": 6.5888543450018915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.8706866800785065, "epoch": 0.0098, "grad_norm": 0.026904761791229248, "kl": 0.7060029059648514, "learning_rate": 9.9995875901109e-06, "loss": -0.0092, "step": 980, "step_time": 3.5617994280037237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1422.0, "completions/max_terminated_length": 1422.0, "completions/mean_length": 337.53125, "completions/mean_terminated_length": 337.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.604537665843964, "epoch": 0.00981, "frac_reward_zero_std": 0.25, "grad_norm": 0.01027620118111372, "kl": 0.5389571450650692, "learning_rate": 9.999586715914402e-06, "loss": -0.0144, "num_tokens": 22230415.0, "reward": 0.726943850517273, "reward_std": 1.1038901805877686, "rewards/rollout_reward_func/mean": 0.726943850517273, "rewards/rollout_reward_func/std": 1.4409198760986328, "sampling/importance_sampling_ratio/max": 0.5586106181144714, "sampling/importance_sampling_ratio/mean": 0.22398555278778076, "sampling/importance_sampling_ratio/min": 4.379458857783902e-07, "sampling/sampling_logp_difference/max": 4.127205848693848, "sampling/sampling_logp_difference/mean": 1.13852858543396, "step": 981, "step_time": 10.287652435996279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.597430169582367, "epoch": 0.00982, "grad_norm": 0.010593624785542488, "kl": 0.5375790968537331, "learning_rate": 9.999585840792405e-06, "loss": -0.0144, "step": 982, "step_time": 6.040225074008049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1108.0, "completions/max_terminated_length": 1108.0, "completions/mean_length": 95.65625, "completions/mean_terminated_length": 98.2258071899414, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.496603578329086, "epoch": 0.00983, "frac_reward_zero_std": 0.5, "grad_norm": 0.007986562326550484, "kl": 0.6707192845642567, "learning_rate": 9.999584964744914e-06, "loss": -0.0109, "num_tokens": 22263771.0, "reward": 1.1938915252685547, "reward_std": 0.6329810619354248, "rewards/rollout_reward_func/mean": 1.1938915252685547, "rewards/rollout_reward_func/std": 1.2819279432296753, "sampling/importance_sampling_ratio/max": 0.5622668266296387, "sampling/importance_sampling_ratio/mean": 0.38462817668914795, "sampling/importance_sampling_ratio/min": 3.055802355947379e-12, "sampling/sampling_logp_difference/max": 3.949310302734375, "sampling/sampling_logp_difference/mean": 0.9233566522598267, "step": 983, "step_time": 8.76285519899102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.63292521238327, "epoch": 0.00984, "grad_norm": 0.008481263183057308, "kl": 0.6400764137506485, "learning_rate": 9.999584087771923e-06, "loss": -0.0109, "step": 984, "step_time": 4.950245940006425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0625, "completions/max_length": 1604.0, "completions/max_terminated_length": 1604.0, "completions/mean_length": 554.125, "completions/mean_terminated_length": 549.5667114257812, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.9969402551651, "epoch": 0.00985, "frac_reward_zero_std": 0.0, "grad_norm": 0.013426659628748894, "kl": 0.4185006758198142, "learning_rate": 9.999583209873438e-06, "loss": -0.0163, "num_tokens": 22318535.0, "reward": 0.5597078800201416, "reward_std": 1.3090636730194092, "rewards/rollout_reward_func/mean": 0.5597078800201416, "rewards/rollout_reward_func/std": 1.4784998893737793, "sampling/importance_sampling_ratio/max": 0.5472128391265869, "sampling/importance_sampling_ratio/mean": 0.10406583547592163, "sampling/importance_sampling_ratio/min": 5.469658775968444e-20, "sampling/sampling_logp_difference/max": 13.885442733764648, "sampling/sampling_logp_difference/mean": 1.3150525093078613, "step": 985, "step_time": 11.412408373995277 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 5.983636975288391, "epoch": 0.00986, "grad_norm": 0.010224462486803532, "kl": 0.41477190889418125, "learning_rate": 9.999582331049455e-06, "loss": -0.0163, "step": 986, "step_time": 6.1847642450011335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0022321429569274187, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022321429569274187, "completions/clipped_ratio": 0.0, "completions/max_length": 1357.0, "completions/max_terminated_length": 1357.0, "completions/mean_length": 522.625, "completions/mean_terminated_length": 522.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.788553476333618, "epoch": 0.00987, "frac_reward_zero_std": 0.0, "grad_norm": 0.05811793729662895, "kl": 0.4174581691622734, "learning_rate": 9.999581451299976e-06, "loss": -0.0165, "num_tokens": 22372461.0, "reward": 0.42421436309814453, "reward_std": 1.491444706916809, "rewards/rollout_reward_func/mean": 0.42421436309814453, "rewards/rollout_reward_func/std": 1.650935173034668, "sampling/importance_sampling_ratio/max": 0.3103920817375183, "sampling/importance_sampling_ratio/mean": 0.0691319927573204, "sampling/importance_sampling_ratio/min": 5.862407361982889e-19, "sampling/sampling_logp_difference/max": 9.787571907043457, "sampling/sampling_logp_difference/mean": 1.4196478128433228, "step": 987, "step_time": 10.566858571000921 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 6.783602476119995, "epoch": 0.00988, "grad_norm": 0.035545844584703445, "kl": 0.4228434907272458, "learning_rate": 9.999580570625e-06, "loss": -0.0165, "step": 988, "step_time": 6.581658828006766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 159.375, "completions/mean_terminated_length": 164.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.182954847812653, "epoch": 0.00989, "frac_reward_zero_std": 0.25, "grad_norm": 0.04810238257050514, "kl": 0.5978514328598976, "learning_rate": 9.99957968902453e-06, "loss": -0.0018, "num_tokens": 22409131.0, "reward": 1.2374694347381592, "reward_std": 0.8729585409164429, "rewards/rollout_reward_func/mean": 1.2374694347381592, "rewards/rollout_reward_func/std": 1.127852201461792, "sampling/importance_sampling_ratio/max": 0.5590102672576904, "sampling/importance_sampling_ratio/mean": 0.2970987558364868, "sampling/importance_sampling_ratio/min": 1.767628469810656e-12, "sampling/sampling_logp_difference/max": 4.559727668762207, "sampling/sampling_logp_difference/mean": 1.0680071115493774, "step": 989, "step_time": 8.576788086000306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.1480021476745605, "epoch": 0.0099, "grad_norm": 0.04161759838461876, "kl": 0.5959382820874453, "learning_rate": 9.999578806498565e-06, "loss": -0.0019, "step": 990, "step_time": 4.779831904997991 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 1027.0, "completions/max_terminated_length": 1027.0, "completions/mean_length": 427.71875, "completions/mean_terminated_length": 427.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.085287988185883, "epoch": 0.00991, "frac_reward_zero_std": 0.25, "grad_norm": 0.05861838534474373, "kl": 0.7617318965494633, "learning_rate": 9.999577923047102e-06, "loss": -0.0045, "num_tokens": 22458246.0, "reward": 1.452782392501831, "reward_std": 1.1761012077331543, "rewards/rollout_reward_func/mean": 1.452782392501831, "rewards/rollout_reward_func/std": 1.4538490772247314, "sampling/importance_sampling_ratio/max": 0.5576171278953552, "sampling/importance_sampling_ratio/mean": 0.20248980820178986, "sampling/importance_sampling_ratio/min": 0.0001965247211046517, "sampling/sampling_logp_difference/max": 3.613848924636841, "sampling/sampling_logp_difference/mean": 0.8571068644523621, "step": 991, "step_time": 9.334361767007067 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 5.048353493213654, "epoch": 0.00992, "grad_norm": 0.05231252312660217, "kl": 0.7414275147020817, "learning_rate": 9.999577038670144e-06, "loss": -0.0046, "step": 992, "step_time": 5.070286206002493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1045.0, "completions/max_terminated_length": 1045.0, "completions/mean_length": 347.0625, "completions/mean_terminated_length": 357.7419128417969, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.07786026597023, "epoch": 0.00993, "frac_reward_zero_std": 0.0, "grad_norm": 0.05416275933384895, "kl": 0.4467580784112215, "learning_rate": 9.999576153367693e-06, "loss": -0.0177, "num_tokens": 22504277.0, "reward": 0.34469255805015564, "reward_std": 1.452715516090393, "rewards/rollout_reward_func/mean": 0.34469255805015564, "rewards/rollout_reward_func/std": 1.7307486534118652, "sampling/importance_sampling_ratio/max": 0.5540140271186829, "sampling/importance_sampling_ratio/mean": 0.1801304817199707, "sampling/importance_sampling_ratio/min": 6.249587219259167e-12, "sampling/sampling_logp_difference/max": 3.949894428253174, "sampling/sampling_logp_difference/mean": 1.2858705520629883, "step": 993, "step_time": 9.313536595007463 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 6.04869544506073, "epoch": 0.00994, "grad_norm": 0.023195495828986168, "kl": 0.44021396711468697, "learning_rate": 9.999575267139748e-06, "loss": -0.0179, "step": 994, "step_time": 5.892401412995241 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 264.53125, "completions/mean_terminated_length": 264.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.326633155345917, "epoch": 0.00995, "frac_reward_zero_std": 0.25, "grad_norm": 0.01920256018638611, "kl": 0.5436516012996435, "learning_rate": 9.999574379986306e-06, "loss": -0.0138, "num_tokens": 22547586.0, "reward": 1.0379282236099243, "reward_std": 0.9376450777053833, "rewards/rollout_reward_func/mean": 1.0379282236099243, "rewards/rollout_reward_func/std": 1.4912073612213135, "sampling/importance_sampling_ratio/max": 0.5561451315879822, "sampling/importance_sampling_ratio/mean": 0.2580867409706116, "sampling/importance_sampling_ratio/min": 0.000214517189306207, "sampling/sampling_logp_difference/max": 2.7959229946136475, "sampling/sampling_logp_difference/mean": 0.9664074778556824, "step": 995, "step_time": 9.013701614003367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.3172996044158936, "epoch": 0.00996, "grad_norm": 0.01986820623278618, "kl": 0.5426909849047661, "learning_rate": 9.99957349190737e-06, "loss": -0.0137, "step": 996, "step_time": 4.958084895999491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1282.0, "completions/max_terminated_length": 1282.0, "completions/mean_length": 341.0, "completions/mean_terminated_length": 341.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.2521549463272095, "epoch": 0.00997, "frac_reward_zero_std": 0.0, "grad_norm": 0.0948508009314537, "kl": 0.4096280299127102, "learning_rate": 9.99957260290294e-06, "loss": -0.0227, "num_tokens": 22595294.0, "reward": 0.8641683459281921, "reward_std": 1.7857199907302856, "rewards/rollout_reward_func/mean": 0.8641683459281921, "rewards/rollout_reward_func/std": 1.809731125831604, "sampling/importance_sampling_ratio/max": 0.5534086227416992, "sampling/importance_sampling_ratio/mean": 0.13389045000076294, "sampling/importance_sampling_ratio/min": 1.6005869962754105e-09, "sampling/sampling_logp_difference/max": 4.465730667114258, "sampling/sampling_logp_difference/mean": 1.2740232944488525, "step": 997, "step_time": 9.572184832988569 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.02187499962747097, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02187499962747097, "entropy": 6.259401619434357, "epoch": 0.00998, "grad_norm": 0.01674763672053814, "kl": 0.39549991115927696, "learning_rate": 9.999571712973018e-06, "loss": -0.0231, "step": 998, "step_time": 5.482380687994009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 245.0625, "completions/mean_terminated_length": 245.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.430437594652176, "epoch": 0.00999, "frac_reward_zero_std": 0.5, "grad_norm": 0.011962044052779675, "kl": 0.6622474268078804, "learning_rate": 9.9995708221176e-06, "loss": -0.0081, "num_tokens": 22637255.0, "reward": 0.3731173872947693, "reward_std": 0.7644892334938049, "rewards/rollout_reward_func/mean": 0.3731173872947693, "rewards/rollout_reward_func/std": 1.3032841682434082, "sampling/importance_sampling_ratio/max": 0.5609629154205322, "sampling/importance_sampling_ratio/mean": 0.3234380781650543, "sampling/importance_sampling_ratio/min": 0.0002367989654885605, "sampling/sampling_logp_difference/max": 2.8386566638946533, "sampling/sampling_logp_difference/mean": 0.7231664657592773, "step": 999, "step_time": 9.299711581999873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.431308686733246, "epoch": 0.01, "grad_norm": 0.012714574113488197, "kl": 0.6626850701868534, "learning_rate": 9.99956993033669e-06, "loss": -0.0081, "step": 1000, "step_time": 5.165902921002271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1149.0, "completions/max_terminated_length": 1149.0, "completions/mean_length": 354.0, "completions/mean_terminated_length": 354.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.026657283306122, "epoch": 0.01001, "frac_reward_zero_std": 0.25, "grad_norm": 0.017371322959661484, "kl": 0.27973475866019726, "learning_rate": 9.999569037630288e-06, "loss": -0.0145, "num_tokens": 22683637.0, "reward": 1.3726234436035156, "reward_std": 1.0961698293685913, "rewards/rollout_reward_func/mean": 1.3726234436035156, "rewards/rollout_reward_func/std": 1.488918423652649, "sampling/importance_sampling_ratio/max": 0.5544911623001099, "sampling/importance_sampling_ratio/mean": 0.11358131468296051, "sampling/importance_sampling_ratio/min": 1.2555798623026249e-09, "sampling/sampling_logp_difference/max": 11.930367469787598, "sampling/sampling_logp_difference/mean": 1.4330933094024658, "step": 1001, "step_time": 9.144588641996961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 7.026662230491638, "epoch": 0.01002, "grad_norm": 0.016390452161431313, "kl": 0.2773772506043315, "learning_rate": 9.99956814399839e-06, "loss": -0.0146, "step": 1002, "step_time": 5.098003837996657 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 329.8125, "completions/mean_terminated_length": 329.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.296431362628937, "epoch": 0.01003, "frac_reward_zero_std": 0.0, "grad_norm": 0.012482847087085247, "kl": 0.2878277497366071, "learning_rate": 9.999567249441e-06, "loss": -0.0052, "num_tokens": 22730709.0, "reward": 0.7401301860809326, "reward_std": 1.3766822814941406, "rewards/rollout_reward_func/mean": 0.7401301860809326, "rewards/rollout_reward_func/std": 1.7131017446517944, "sampling/importance_sampling_ratio/max": 0.5589606165885925, "sampling/importance_sampling_ratio/mean": 0.10198862850666046, "sampling/importance_sampling_ratio/min": 8.514951181387551e-21, "sampling/sampling_logp_difference/max": 9.713821411132812, "sampling/sampling_logp_difference/mean": 1.5932083129882812, "step": 1003, "step_time": 8.160393297002884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.2979536056518555, "epoch": 0.01004, "grad_norm": 0.012922468595206738, "kl": 0.28694989066571, "learning_rate": 9.999566353958118e-06, "loss": -0.0052, "step": 1004, "step_time": 4.524150645000191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1099.0, "completions/max_terminated_length": 1099.0, "completions/mean_length": 325.03125, "completions/mean_terminated_length": 325.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.205267012119293, "epoch": 0.01005, "frac_reward_zero_std": 0.0, "grad_norm": 0.08166635036468506, "kl": 0.49998316913843155, "learning_rate": 9.999565457549745e-06, "loss": -0.0098, "num_tokens": 22776838.0, "reward": 0.20423458516597748, "reward_std": 1.4229846000671387, "rewards/rollout_reward_func/mean": 0.20423458516597748, "rewards/rollout_reward_func/std": 1.4173210859298706, "sampling/importance_sampling_ratio/max": 0.5502206683158875, "sampling/importance_sampling_ratio/mean": 0.17241445183753967, "sampling/importance_sampling_ratio/min": 6.0835959479668e-14, "sampling/sampling_logp_difference/max": 10.926610946655273, "sampling/sampling_logp_difference/mean": 1.3821101188659668, "step": 1005, "step_time": 9.808647259007557 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 6.199913561344147, "epoch": 0.01006, "grad_norm": 0.049815889447927475, "kl": 0.5018227286636829, "learning_rate": 9.999564560215878e-06, "loss": -0.0101, "step": 1006, "step_time": 5.598799640003563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 836.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 246.21875, "completions/mean_terminated_length": 246.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.045422434806824, "epoch": 0.01007, "frac_reward_zero_std": 0.0, "grad_norm": 0.03386063501238823, "kl": 0.2843629438430071, "learning_rate": 9.999563661956521e-06, "loss": -0.012, "num_tokens": 22819742.0, "reward": 0.34329596161842346, "reward_std": 1.4920954704284668, "rewards/rollout_reward_func/mean": 0.34329596161842346, "rewards/rollout_reward_func/std": 1.6015452146530151, "sampling/importance_sampling_ratio/max": 0.5526420474052429, "sampling/importance_sampling_ratio/mean": 0.1261054426431656, "sampling/importance_sampling_ratio/min": 1.8862125443774858e-06, "sampling/sampling_logp_difference/max": 2.9586334228515625, "sampling/sampling_logp_difference/mean": 1.3238964080810547, "step": 1007, "step_time": 8.187184864997107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.059656739234924, "epoch": 0.01008, "grad_norm": 0.03298722952604294, "kl": 0.28315782360732555, "learning_rate": 9.999562762771671e-06, "loss": -0.0121, "step": 1008, "step_time": 4.530417068992392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 147.4375, "completions/mean_terminated_length": 147.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.955439448356628, "epoch": 0.01009, "frac_reward_zero_std": 0.25, "grad_norm": 0.00993993878364563, "kl": 0.49089537374675274, "learning_rate": 9.999561862661328e-06, "loss": -0.0008, "num_tokens": 22857679.0, "reward": 0.30379241704940796, "reward_std": 0.7002266049385071, "rewards/rollout_reward_func/mean": 0.30379241704940796, "rewards/rollout_reward_func/std": 1.378680944442749, "sampling/importance_sampling_ratio/max": 0.5580303072929382, "sampling/importance_sampling_ratio/mean": 0.24891503155231476, "sampling/importance_sampling_ratio/min": 5.981670625487823e-08, "sampling/sampling_logp_difference/max": 2.882373332977295, "sampling/sampling_logp_difference/mean": 1.257134199142456, "step": 1009, "step_time": 7.1714608060028695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 5.952609419822693, "epoch": 0.0101, "grad_norm": 0.009500070475041866, "kl": 0.48319824039936066, "learning_rate": 9.999560961625496e-06, "loss": -0.0008, "step": 1010, "step_time": 3.99689127399688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1126.0, "completions/max_terminated_length": 1126.0, "completions/mean_length": 260.0625, "completions/mean_terminated_length": 234.87095642089844, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.169197618961334, "epoch": 0.01011, "frac_reward_zero_std": 0.25, "grad_norm": 0.055554021149873734, "kl": 0.5686634629964828, "learning_rate": 9.999560059664175e-06, "loss": -0.012, "num_tokens": 22899177.0, "reward": 1.3275389671325684, "reward_std": 0.9106015563011169, "rewards/rollout_reward_func/mean": 1.3275389671325684, "rewards/rollout_reward_func/std": 1.1467247009277344, "sampling/importance_sampling_ratio/max": 0.5622022151947021, "sampling/importance_sampling_ratio/mean": 0.3061492443084717, "sampling/importance_sampling_ratio/min": 6.433034816378646e-15, "sampling/sampling_logp_difference/max": 4.19537878036499, "sampling/sampling_logp_difference/mean": 0.92018723487854, "step": 1011, "step_time": 9.892333809006232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 5.1786134243011475, "epoch": 0.01012, "grad_norm": 0.05000791326165199, "kl": 0.5679762624204159, "learning_rate": 9.999559156777358e-06, "loss": -0.012, "step": 1012, "step_time": 5.079764106998482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 1161.0, "completions/max_terminated_length": 1161.0, "completions/mean_length": 306.8125, "completions/mean_terminated_length": 306.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.472220361232758, "epoch": 0.01013, "frac_reward_zero_std": 0.25, "grad_norm": 0.08331741392612457, "kl": 0.6408160887658596, "learning_rate": 9.999558252965055e-06, "loss": -0.0158, "num_tokens": 22943774.0, "reward": 1.1062134504318237, "reward_std": 1.176191806793213, "rewards/rollout_reward_func/mean": 1.1062134504318237, "rewards/rollout_reward_func/std": 1.409841775894165, "sampling/importance_sampling_ratio/max": 0.551531970500946, "sampling/importance_sampling_ratio/mean": 0.2313864529132843, "sampling/importance_sampling_ratio/min": 1.7776688991943956e-06, "sampling/sampling_logp_difference/max": 2.7482407093048096, "sampling/sampling_logp_difference/mean": 1.0416544675827026, "step": 1013, "step_time": 9.148807807003323 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 5.483553230762482, "epoch": 0.01014, "grad_norm": 0.033725570887327194, "kl": 0.6404068134725094, "learning_rate": 9.99955734822726e-06, "loss": -0.0161, "step": 1014, "step_time": 5.220303217000037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "completions/clipped_ratio": 0.0, "completions/max_length": 1630.0, "completions/max_terminated_length": 1630.0, "completions/mean_length": 343.34375, "completions/mean_terminated_length": 343.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.352690100669861, "epoch": 0.01015, "frac_reward_zero_std": 0.25, "grad_norm": 0.12921226024627686, "kl": 0.3110955283045769, "learning_rate": 9.999556442563976e-06, "loss": -0.0063, "num_tokens": 22988452.0, "reward": 0.9510009288787842, "reward_std": 0.9919543266296387, "rewards/rollout_reward_func/mean": 0.9510009288787842, "rewards/rollout_reward_func/std": 1.349972128868103, "sampling/importance_sampling_ratio/max": 0.559735119342804, "sampling/importance_sampling_ratio/mean": 0.1734287291765213, "sampling/importance_sampling_ratio/min": 4.5295205808315586e-08, "sampling/sampling_logp_difference/max": 4.456480026245117, "sampling/sampling_logp_difference/mean": 1.2685883045196533, "step": 1015, "step_time": 10.528129660993727 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 6.360983431339264, "epoch": 0.01016, "grad_norm": 0.020305994898080826, "kl": 0.2926192246377468, "learning_rate": 9.9995555359752e-06, "loss": -0.0066, "step": 1016, "step_time": 6.066723939002259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1374.0, "completions/max_terminated_length": 1374.0, "completions/mean_length": 349.625, "completions/mean_terminated_length": 346.0967712402344, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.358592867851257, "epoch": 0.01017, "frac_reward_zero_std": 0.0, "grad_norm": 0.057205185294151306, "kl": 0.44899942725896835, "learning_rate": 9.999554628460936e-06, "loss": -0.0113, "num_tokens": 23035444.0, "reward": 0.27937251329421997, "reward_std": 1.2926634550094604, "rewards/rollout_reward_func/mean": 0.27937251329421997, "rewards/rollout_reward_func/std": 1.569323182106018, "sampling/importance_sampling_ratio/max": 0.5511528849601746, "sampling/importance_sampling_ratio/mean": 0.14388598501682281, "sampling/importance_sampling_ratio/min": 5.873507091827585e-20, "sampling/sampling_logp_difference/max": 4.381139755249023, "sampling/sampling_logp_difference/mean": 1.2627894878387451, "step": 1017, "step_time": 10.948239385001216 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 6.370680719614029, "epoch": 0.01018, "grad_norm": 0.04831882566213608, "kl": 0.44687989354133606, "learning_rate": 9.999553720021182e-06, "loss": -0.0116, "step": 1018, "step_time": 5.6945219689951045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1724.0, "completions/max_terminated_length": 1724.0, "completions/mean_length": 606.625, "completions/mean_terminated_length": 606.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.6989442110061646, "epoch": 0.01019, "frac_reward_zero_std": 0.0, "grad_norm": 0.023961277678608894, "kl": 0.31576596200466156, "learning_rate": 9.999552810655939e-06, "loss": -0.0082, "num_tokens": 23091350.0, "reward": 0.8159989714622498, "reward_std": 1.3713834285736084, "rewards/rollout_reward_func/mean": 0.8159989714622498, "rewards/rollout_reward_func/std": 1.4630259275436401, "sampling/importance_sampling_ratio/max": 0.5428508520126343, "sampling/importance_sampling_ratio/mean": 0.09994738548994064, "sampling/importance_sampling_ratio/min": 1.059659131275037e-11, "sampling/sampling_logp_difference/max": 4.456984996795654, "sampling/sampling_logp_difference/mean": 1.344396710395813, "step": 1019, "step_time": 11.371331694004766 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 6.711520850658417, "epoch": 0.0102, "grad_norm": 0.02325947768986225, "kl": 0.31837151292711496, "learning_rate": 9.999551900365207e-06, "loss": -0.0082, "step": 1020, "step_time": 6.376222409999173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1665.0, "completions/max_terminated_length": 1665.0, "completions/mean_length": 321.8125, "completions/mean_terminated_length": 321.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.739571273326874, "epoch": 0.01021, "frac_reward_zero_std": 0.25, "grad_norm": 0.031169896945357323, "kl": 0.38478662818670273, "learning_rate": 9.999550989148985e-06, "loss": -0.0134, "num_tokens": 23136310.0, "reward": 0.8148485422134399, "reward_std": 1.0521773099899292, "rewards/rollout_reward_func/mean": 0.8148485422134399, "rewards/rollout_reward_func/std": 1.3731796741485596, "sampling/importance_sampling_ratio/max": 0.5613196492195129, "sampling/importance_sampling_ratio/mean": 0.16256815195083618, "sampling/importance_sampling_ratio/min": 1.2543477367898959e-08, "sampling/sampling_logp_difference/max": 2.8176064491271973, "sampling/sampling_logp_difference/mean": 1.3582704067230225, "step": 1021, "step_time": 10.406798961004824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 6.73691600561142, "epoch": 0.01022, "grad_norm": 0.03002464771270752, "kl": 0.3871182929724455, "learning_rate": 9.999550077007277e-06, "loss": -0.0133, "step": 1022, "step_time": 6.115024357000948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1271.0, "completions/max_terminated_length": 1271.0, "completions/mean_length": 178.75, "completions/mean_terminated_length": 178.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.662509381771088, "epoch": 0.01023, "frac_reward_zero_std": 0.0, "grad_norm": 0.07193656265735626, "kl": 0.5310253985226154, "learning_rate": 9.999549163940078e-06, "loss": -0.0154, "num_tokens": 23176511.0, "reward": 0.49015238881111145, "reward_std": 1.1022510528564453, "rewards/rollout_reward_func/mean": 0.49015238881111145, "rewards/rollout_reward_func/std": 1.2291022539138794, "sampling/importance_sampling_ratio/max": 0.5604272484779358, "sampling/importance_sampling_ratio/mean": 0.24598291516304016, "sampling/importance_sampling_ratio/min": 3.892282655936633e-18, "sampling/sampling_logp_difference/max": 5.424139022827148, "sampling/sampling_logp_difference/mean": 1.1261054277420044, "step": 1023, "step_time": 10.254455366997718 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 5.579624205827713, "epoch": 0.01024, "grad_norm": 0.06680368632078171, "kl": 0.5526213347911835, "learning_rate": 9.999548249947393e-06, "loss": -0.0156, "step": 1024, "step_time": 5.40847180199853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1592.0, "completions/max_terminated_length": 1592.0, "completions/mean_length": 506.375, "completions/mean_terminated_length": 506.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.741760313510895, "epoch": 0.01025, "frac_reward_zero_std": 0.0, "grad_norm": 0.03184869885444641, "kl": 0.3664694866165519, "learning_rate": 9.999547335029218e-06, "loss": -0.013, "num_tokens": 23229217.0, "reward": 0.5197092294692993, "reward_std": 1.1970524787902832, "rewards/rollout_reward_func/mean": 0.5197092294692993, "rewards/rollout_reward_func/std": 1.3192402124404907, "sampling/importance_sampling_ratio/max": 0.3152080774307251, "sampling/importance_sampling_ratio/mean": 0.06485322117805481, "sampling/importance_sampling_ratio/min": 1.3899776885040183e-09, "sampling/sampling_logp_difference/max": 12.874980926513672, "sampling/sampling_logp_difference/mean": 1.4760990142822266, "step": 1025, "step_time": 11.0590726159935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.726491212844849, "epoch": 0.01026, "grad_norm": 0.03187604248523712, "kl": 0.3646420114673674, "learning_rate": 9.999546419185557e-06, "loss": -0.0131, "step": 1026, "step_time": 6.130726482006139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1063.0, "completions/max_terminated_length": 1063.0, "completions/mean_length": 261.625, "completions/mean_terminated_length": 251.51612854003906, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.740803480148315, "epoch": 0.01027, "frac_reward_zero_std": 0.25, "grad_norm": 0.06840773671865463, "kl": 0.4827948212623596, "learning_rate": 9.99954550241641e-06, "loss": -0.0125, "num_tokens": 23273473.0, "reward": 1.1075716018676758, "reward_std": 1.329734206199646, "rewards/rollout_reward_func/mean": 1.1075716018676758, "rewards/rollout_reward_func/std": 1.5849062204360962, "sampling/importance_sampling_ratio/max": 0.5630297660827637, "sampling/importance_sampling_ratio/mean": 0.22868673503398895, "sampling/importance_sampling_ratio/min": 2.1757556541711325e-19, "sampling/sampling_logp_difference/max": 11.126856803894043, "sampling/sampling_logp_difference/mean": 1.2369940280914307, "step": 1027, "step_time": 8.883375684999919 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 5.731098294258118, "epoch": 0.01028, "grad_norm": 0.055261336266994476, "kl": 0.45696516148746014, "learning_rate": 9.999544584721775e-06, "loss": -0.013, "step": 1028, "step_time": 5.455373880999105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 892.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 355.1875, "completions/mean_terminated_length": 355.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.495829701423645, "epoch": 0.01029, "frac_reward_zero_std": 0.25, "grad_norm": 0.01005642581731081, "kl": 0.30635163094848394, "learning_rate": 9.99954366610165e-06, "loss": -0.0047, "num_tokens": 23319963.0, "reward": 1.2288106679916382, "reward_std": 0.9616278409957886, "rewards/rollout_reward_func/mean": 1.2288106679916382, "rewards/rollout_reward_func/std": 1.2749781608581543, "sampling/importance_sampling_ratio/max": 0.5598100423812866, "sampling/importance_sampling_ratio/mean": 0.17222511768341064, "sampling/importance_sampling_ratio/min": 1.7732670887227187e-09, "sampling/sampling_logp_difference/max": 4.44066047668457, "sampling/sampling_logp_difference/mean": 1.2969691753387451, "step": 1029, "step_time": 8.992150620004395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.434171199798584, "epoch": 0.0103, "grad_norm": 0.00898697879165411, "kl": 0.31518874876201153, "learning_rate": 9.999542746556042e-06, "loss": -0.0047, "step": 1030, "step_time": 4.7133619069973065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1270.0, "completions/max_terminated_length": 1270.0, "completions/mean_length": 219.40625, "completions/mean_terminated_length": 219.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.71353280544281, "epoch": 0.01031, "frac_reward_zero_std": 0.5, "grad_norm": 0.007632548920810223, "kl": 0.6220719646662474, "learning_rate": 9.999541826084947e-06, "loss": -0.0034, "num_tokens": 23358748.0, "reward": 0.21470902860164642, "reward_std": 0.5573363900184631, "rewards/rollout_reward_func/mean": 0.21470902860164642, "rewards/rollout_reward_func/std": 1.4988292455673218, "sampling/importance_sampling_ratio/max": 0.5568652749061584, "sampling/importance_sampling_ratio/mean": 0.30094483494758606, "sampling/importance_sampling_ratio/min": 5.578616165340762e-15, "sampling/sampling_logp_difference/max": 4.1865339279174805, "sampling/sampling_logp_difference/mean": 1.2363862991333008, "step": 1031, "step_time": 9.081513817000086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.714335680007935, "epoch": 0.01032, "grad_norm": 0.007198524195700884, "kl": 0.6188767775893211, "learning_rate": 9.999540904688363e-06, "loss": -0.0034, "step": 1032, "step_time": 5.2129482660020585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1199.0, "completions/max_terminated_length": 1199.0, "completions/mean_length": 377.84375, "completions/mean_terminated_length": 377.1000061035156, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.178113698959351, "epoch": 0.01033, "frac_reward_zero_std": 0.0, "grad_norm": 0.059663768857717514, "kl": 0.39372030831873417, "learning_rate": 9.999539982366296e-06, "loss": -0.0126, "num_tokens": 23407867.0, "reward": 1.1227607727050781, "reward_std": 1.5161700248718262, "rewards/rollout_reward_func/mean": 1.1227607727050781, "rewards/rollout_reward_func/std": 1.492297887802124, "sampling/importance_sampling_ratio/max": 0.5611627697944641, "sampling/importance_sampling_ratio/mean": 0.11803194135427475, "sampling/importance_sampling_ratio/min": 7.517843569893046e-10, "sampling/sampling_logp_difference/max": 3.7795064449310303, "sampling/sampling_logp_difference/mean": 1.2002170085906982, "step": 1033, "step_time": 9.85814744599702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.188810855150223, "epoch": 0.01034, "grad_norm": 0.06503457576036453, "kl": 0.39083748403936625, "learning_rate": 9.99953905911874e-06, "loss": -0.0126, "step": 1034, "step_time": 6.059410152003693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 949.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 388.28125, "completions/mean_terminated_length": 389.4193420410156, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.020006626844406, "epoch": 0.01035, "frac_reward_zero_std": 0.0, "grad_norm": 0.04203319922089577, "kl": 0.2047954834997654, "learning_rate": 9.999538134945701e-06, "loss": -0.0079, "num_tokens": 23456622.0, "reward": 0.787524938583374, "reward_std": 1.3023033142089844, "rewards/rollout_reward_func/mean": 0.787524938583374, "rewards/rollout_reward_func/std": 1.5432605743408203, "sampling/importance_sampling_ratio/max": 0.559923529624939, "sampling/importance_sampling_ratio/mean": 0.09550091624259949, "sampling/importance_sampling_ratio/min": 1.023177094265637e-12, "sampling/sampling_logp_difference/max": 4.612035274505615, "sampling/sampling_logp_difference/mean": 1.41008460521698, "step": 1035, "step_time": 9.014012101000844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.029165089130402, "epoch": 0.01036, "grad_norm": 0.04124002903699875, "kl": 0.20349035412073135, "learning_rate": 9.999537209847177e-06, "loss": -0.008, "step": 1036, "step_time": 4.775646836998931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.03125, "completions/max_length": 1054.0, "completions/max_terminated_length": 1054.0, "completions/mean_length": 234.375, "completions/mean_terminated_length": 241.41934204101562, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.2491313219070435, "epoch": 0.01037, "frac_reward_zero_std": 0.25, "grad_norm": 0.013799816370010376, "kl": 0.4026375487446785, "learning_rate": 9.999536283823168e-06, "loss": -0.0045, "num_tokens": 23498054.0, "reward": 1.0712730884552002, "reward_std": 0.9615226984024048, "rewards/rollout_reward_func/mean": 1.0712730884552002, "rewards/rollout_reward_func/std": 1.570084571838379, "sampling/importance_sampling_ratio/max": 0.558052122592926, "sampling/importance_sampling_ratio/mean": 0.2356415092945099, "sampling/importance_sampling_ratio/min": 2.3485022438391168e-17, "sampling/sampling_logp_difference/max": 4.477426052093506, "sampling/sampling_logp_difference/mean": 1.1062486171722412, "step": 1037, "step_time": 8.784285850000742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "entropy": 5.255600124597549, "epoch": 0.01038, "grad_norm": 0.013417208567261696, "kl": 0.4022479560226202, "learning_rate": 9.999535356873673e-06, "loss": -0.0046, "step": 1038, "step_time": 4.899265211988677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 207.71875, "completions/mean_terminated_length": 207.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.197338938713074, "epoch": 0.01039, "frac_reward_zero_std": 0.5, "grad_norm": 0.043298739939928055, "kl": 0.695661973208189, "learning_rate": 9.999534428998694e-06, "loss": -0.0017, "num_tokens": 23536528.0, "reward": 1.7246989011764526, "reward_std": 0.6492246389389038, "rewards/rollout_reward_func/mean": 1.7246989011764526, "rewards/rollout_reward_func/std": 0.924487292766571, "sampling/importance_sampling_ratio/max": 0.5599098801612854, "sampling/importance_sampling_ratio/mean": 0.3053703308105469, "sampling/importance_sampling_ratio/min": 4.831237674807198e-06, "sampling/sampling_logp_difference/max": 2.6207973957061768, "sampling/sampling_logp_difference/mean": 0.9592883586883545, "step": 1039, "step_time": 9.08226562600612 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 5.196479648351669, "epoch": 0.0104, "grad_norm": 0.021560901775956154, "kl": 0.6953507289290428, "learning_rate": 9.999533500198229e-06, "loss": -0.0019, "step": 1040, "step_time": 5.262066898998455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.9322281181812286, "epoch": 0.01041, "frac_reward_zero_std": 0.75, "grad_norm": 0.009216471575200558, "kl": 0.8827744945883751, "learning_rate": 9.999532570472281e-06, "loss": -0.0041, "num_tokens": 23565610.0, "reward": 1.8796122074127197, "reward_std": 0.2563169002532959, "rewards/rollout_reward_func/mean": 1.8796122074127197, "rewards/rollout_reward_func/std": 0.5269449949264526, "sampling/importance_sampling_ratio/max": 0.5627931952476501, "sampling/importance_sampling_ratio/mean": 0.5288608074188232, "sampling/importance_sampling_ratio/min": 3.05805558555982e-14, "sampling/sampling_logp_difference/max": 4.692745208740234, "sampling/sampling_logp_difference/mean": 0.5878821611404419, "step": 1041, "step_time": 5.224408954996761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.9249318838119507, "epoch": 0.01042, "grad_norm": 0.00652997475117445, "kl": 0.8836840912699699, "learning_rate": 9.99953163982085e-06, "loss": -0.0041, "step": 1042, "step_time": 3.167348971001047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1703.0, "completions/max_terminated_length": 1703.0, "completions/mean_length": 567.3125, "completions/mean_terminated_length": 567.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.680269300937653, "epoch": 0.01043, "frac_reward_zero_std": 0.0, "grad_norm": 0.0193475428968668, "kl": 0.1606975607573986, "learning_rate": 9.999530708243934e-06, "loss": -0.0046, "num_tokens": 23620036.0, "reward": 0.683759331703186, "reward_std": 1.4074939489364624, "rewards/rollout_reward_func/mean": 0.683759331703186, "rewards/rollout_reward_func/std": 1.4029269218444824, "sampling/importance_sampling_ratio/max": 0.30403369665145874, "sampling/importance_sampling_ratio/mean": 0.059302348643541336, "sampling/importance_sampling_ratio/min": 3.0385233884544505e-08, "sampling/sampling_logp_difference/max": 4.322260856628418, "sampling/sampling_logp_difference/mean": 1.560241460800171, "step": 1043, "step_time": 11.281465964002564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.687076687812805, "epoch": 0.01044, "grad_norm": 0.017618995159864426, "kl": 0.15659037651494145, "learning_rate": 9.999529775741534e-06, "loss": -0.0045, "step": 1044, "step_time": 6.289059259001078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 165.40625, "completions/mean_terminated_length": 165.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.690787971019745, "epoch": 0.01045, "frac_reward_zero_std": 0.25, "grad_norm": 0.12035850435495377, "kl": 0.7132197096943855, "learning_rate": 9.999528842313652e-06, "loss": -0.0085, "num_tokens": 23658711.0, "reward": 1.7059001922607422, "reward_std": 1.0321550369262695, "rewards/rollout_reward_func/mean": 1.7059001922607422, "rewards/rollout_reward_func/std": 1.227677822113037, "sampling/importance_sampling_ratio/max": 0.5585283637046814, "sampling/importance_sampling_ratio/mean": 0.3277355134487152, "sampling/importance_sampling_ratio/min": 0.00015572263509966433, "sampling/sampling_logp_difference/max": 2.9094300270080566, "sampling/sampling_logp_difference/mean": 0.8420001864433289, "step": 1045, "step_time": 7.434118703000422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 4.685460418462753, "epoch": 0.01046, "grad_norm": 0.017439210787415504, "kl": 0.7319671958684921, "learning_rate": 9.999527907960287e-06, "loss": -0.0088, "step": 1046, "step_time": 4.288067855002737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1189.0, "completions/max_terminated_length": 1189.0, "completions/mean_length": 460.625, "completions/mean_terminated_length": 460.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.257592141628265, "epoch": 0.01047, "frac_reward_zero_std": 0.0, "grad_norm": 0.07502849400043488, "kl": 0.5209483318030834, "learning_rate": 9.999526972681438e-06, "loss": -0.0121, "num_tokens": 23709176.0, "reward": 1.4757375717163086, "reward_std": 1.0949004888534546, "rewards/rollout_reward_func/mean": 1.4757375717163086, "rewards/rollout_reward_func/std": 1.3113032579421997, "sampling/importance_sampling_ratio/max": 0.5509998202323914, "sampling/importance_sampling_ratio/mean": 0.2224971354007721, "sampling/importance_sampling_ratio/min": 2.4242972358479165e-06, "sampling/sampling_logp_difference/max": 2.878077507019043, "sampling/sampling_logp_difference/mean": 0.9587295055389404, "step": 1047, "step_time": 9.741106212997693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.243915617465973, "epoch": 0.01048, "grad_norm": 0.0720822811126709, "kl": 0.5240081958472729, "learning_rate": 9.999526036477107e-06, "loss": -0.0121, "step": 1048, "step_time": 5.3521817269975145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1103.0, "completions/max_terminated_length": 1103.0, "completions/mean_length": 359.71875, "completions/mean_terminated_length": 359.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.472366571426392, "epoch": 0.01049, "frac_reward_zero_std": 0.0, "grad_norm": 0.07630094140768051, "kl": 0.27892553247511387, "learning_rate": 9.999525099347293e-06, "loss": -0.011, "num_tokens": 23756049.0, "reward": 0.45939457416534424, "reward_std": 1.6766505241394043, "rewards/rollout_reward_func/mean": 0.45939457416534424, "rewards/rollout_reward_func/std": 2.0455894470214844, "sampling/importance_sampling_ratio/max": 0.3830910921096802, "sampling/importance_sampling_ratio/mean": 0.10714806616306305, "sampling/importance_sampling_ratio/min": 7.10809133774859e-10, "sampling/sampling_logp_difference/max": 4.360036849975586, "sampling/sampling_logp_difference/mean": 1.2605736255645752, "step": 1049, "step_time": 9.111526930002583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.508142650127411, "epoch": 0.0105, "grad_norm": 0.08153267204761505, "kl": 0.27666734531521797, "learning_rate": 9.999524161291997e-06, "loss": -0.0112, "step": 1050, "step_time": 5.084677120994456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1175.0, "completions/max_terminated_length": 1175.0, "completions/mean_length": 479.53125, "completions/mean_terminated_length": 479.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.857340931892395, "epoch": 0.01051, "frac_reward_zero_std": 0.0, "grad_norm": 0.03820664435625076, "kl": 0.21739054564386606, "learning_rate": 9.99952322231122e-06, "loss": -0.0213, "num_tokens": 23808650.0, "reward": 1.0185956954956055, "reward_std": 1.4901130199432373, "rewards/rollout_reward_func/mean": 1.0185956954956055, "rewards/rollout_reward_func/std": 1.4737446308135986, "sampling/importance_sampling_ratio/max": 0.5523734092712402, "sampling/importance_sampling_ratio/mean": 0.12713922560214996, "sampling/importance_sampling_ratio/min": 1.0401958991998873e-16, "sampling/sampling_logp_difference/max": 4.470718860626221, "sampling/sampling_logp_difference/mean": 1.4938666820526123, "step": 1051, "step_time": 9.817927331998362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.861291289329529, "epoch": 0.01052, "grad_norm": 0.03983037546277046, "kl": 0.21947486139833927, "learning_rate": 9.99952228240496e-06, "loss": -0.0214, "step": 1052, "step_time": 5.760027886000898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 317.5, "completions/mean_terminated_length": 327.2257995605469, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.292166113853455, "epoch": 0.01053, "frac_reward_zero_std": 0.0, "grad_norm": 0.011106329970061779, "kl": 0.2826055521145463, "learning_rate": 9.999521341573221e-06, "loss": -0.0085, "num_tokens": 23853609.0, "reward": 1.4868160486221313, "reward_std": 1.481298804283142, "rewards/rollout_reward_func/mean": 1.4868160486221313, "rewards/rollout_reward_func/std": 1.4551945924758911, "sampling/importance_sampling_ratio/max": 0.5624848008155823, "sampling/importance_sampling_ratio/mean": 0.19570787250995636, "sampling/importance_sampling_ratio/min": 3.769776157902527e-10, "sampling/sampling_logp_difference/max": 4.830379486083984, "sampling/sampling_logp_difference/mean": 1.338525652885437, "step": 1053, "step_time": 8.803619582002284 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 6.3028863072395325, "epoch": 0.01054, "grad_norm": 0.012932611629366875, "kl": 0.2841065311804414, "learning_rate": 9.999520399815998e-06, "loss": -0.0085, "step": 1054, "step_time": 4.929457892005303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 220.90625, "completions/mean_terminated_length": 220.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.904182434082031, "epoch": 0.01055, "frac_reward_zero_std": 0.25, "grad_norm": 0.06211933121085167, "kl": 0.5472510121762753, "learning_rate": 9.999519457133295e-06, "loss": -0.0097, "num_tokens": 23893680.0, "reward": 0.511253297328949, "reward_std": 0.8720675110816956, "rewards/rollout_reward_func/mean": 0.511253297328949, "rewards/rollout_reward_func/std": 1.3903342485427856, "sampling/importance_sampling_ratio/max": 0.5601359605789185, "sampling/importance_sampling_ratio/mean": 0.2456539273262024, "sampling/importance_sampling_ratio/min": 1.0543021744524594e-06, "sampling/sampling_logp_difference/max": 4.279425621032715, "sampling/sampling_logp_difference/mean": 1.185904622077942, "step": 1055, "step_time": 8.51097695900171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.026041666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026041666977107525, "entropy": 6.0301318764686584, "epoch": 0.01056, "grad_norm": 0.06773114204406738, "kl": 0.536724517121911, "learning_rate": 9.999518513525112e-06, "loss": -0.0101, "step": 1056, "step_time": 5.092479037004523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1468.0, "completions/max_terminated_length": 1468.0, "completions/mean_length": 249.28125, "completions/mean_terminated_length": 264.8333435058594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.290334582328796, "epoch": 0.01057, "frac_reward_zero_std": 0.0, "grad_norm": 0.016958314925432205, "kl": 0.45430413633584976, "learning_rate": 9.999517568991448e-06, "loss": -0.0154, "num_tokens": 23933171.0, "reward": 1.2762260437011719, "reward_std": 1.3544080257415771, "rewards/rollout_reward_func/mean": 1.2762260437011719, "rewards/rollout_reward_func/std": 1.4047894477844238, "sampling/importance_sampling_ratio/max": 0.5609026551246643, "sampling/importance_sampling_ratio/mean": 0.28025808930397034, "sampling/importance_sampling_ratio/min": 6.142915338153089e-16, "sampling/sampling_logp_difference/max": 12.518051147460938, "sampling/sampling_logp_difference/mean": 1.218610405921936, "step": 1057, "step_time": 10.160106027997244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.2876476645469666, "epoch": 0.01058, "grad_norm": 0.017249582335352898, "kl": 0.4564319960772991, "learning_rate": 9.999516623532303e-06, "loss": -0.0154, "step": 1058, "step_time": 6.212199509005586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1144.0, "completions/max_terminated_length": 1144.0, "completions/mean_length": 371.625, "completions/mean_terminated_length": 371.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.914940059185028, "epoch": 0.01059, "frac_reward_zero_std": 0.25, "grad_norm": 0.03862754628062248, "kl": 0.4964059367775917, "learning_rate": 9.99951567714768e-06, "loss": -0.0108, "num_tokens": 23980087.0, "reward": 1.45640230178833, "reward_std": 1.112532138824463, "rewards/rollout_reward_func/mean": 1.45640230178833, "rewards/rollout_reward_func/std": 1.3344899415969849, "sampling/importance_sampling_ratio/max": 0.5604665875434875, "sampling/importance_sampling_ratio/mean": 0.2040138840675354, "sampling/importance_sampling_ratio/min": 1.2279530690884055e-12, "sampling/sampling_logp_difference/max": 12.335165977478027, "sampling/sampling_logp_difference/mean": 1.234926462173462, "step": 1059, "step_time": 9.584063005993812 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 5.931661486625671, "epoch": 0.0106, "grad_norm": 0.02888256125152111, "kl": 0.4965446200221777, "learning_rate": 9.999514729837577e-06, "loss": -0.0108, "step": 1060, "step_time": 5.240421207996405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1516.0, "completions/max_terminated_length": 1516.0, "completions/mean_length": 383.03125, "completions/mean_terminated_length": 382.45159912109375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.4336466789245605, "epoch": 0.01061, "frac_reward_zero_std": 0.25, "grad_norm": 0.06115937978029251, "kl": 0.43413118831813335, "learning_rate": 9.999513781601992e-06, "loss": -0.0119, "num_tokens": 24027772.0, "reward": 1.2669847011566162, "reward_std": 1.2879948616027832, "rewards/rollout_reward_func/mean": 1.2669847011566162, "rewards/rollout_reward_func/std": 1.5072742700576782, "sampling/importance_sampling_ratio/max": 0.5633172392845154, "sampling/importance_sampling_ratio/mean": 0.21609656512737274, "sampling/importance_sampling_ratio/min": 3.284876015641203e-08, "sampling/sampling_logp_difference/max": 3.4791414737701416, "sampling/sampling_logp_difference/mean": 1.0527660846710205, "step": 1061, "step_time": 10.939223934994516 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 5.442171812057495, "epoch": 0.01062, "grad_norm": 0.034198034554719925, "kl": 0.391285534016788, "learning_rate": 9.99951283244093e-06, "loss": -0.012, "step": 1062, "step_time": 6.271865140002774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1129.0, "completions/max_terminated_length": 1129.0, "completions/mean_length": 451.6875, "completions/mean_terminated_length": 454.19354248046875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.886152148246765, "epoch": 0.01063, "frac_reward_zero_std": 0.0, "grad_norm": 0.04286474734544754, "kl": 0.2691928194835782, "learning_rate": 9.999511882354389e-06, "loss": -0.0098, "num_tokens": 24079362.0, "reward": 0.4991976022720337, "reward_std": 1.176917314529419, "rewards/rollout_reward_func/mean": 0.4991976022720337, "rewards/rollout_reward_func/std": 1.3861075639724731, "sampling/importance_sampling_ratio/max": 0.3098215162754059, "sampling/importance_sampling_ratio/mean": 0.09161486476659775, "sampling/importance_sampling_ratio/min": 5.688617437347998e-13, "sampling/sampling_logp_difference/max": 4.087871551513672, "sampling/sampling_logp_difference/mean": 1.4367423057556152, "step": 1063, "step_time": 9.844443962007063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.890269577503204, "epoch": 0.01064, "grad_norm": 0.04269097372889519, "kl": 0.2689197761937976, "learning_rate": 9.999510931342367e-06, "loss": -0.0098, "step": 1064, "step_time": 5.217927681002038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1072.0, "completions/max_terminated_length": 1072.0, "completions/mean_length": 330.0, "completions/mean_terminated_length": 330.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.437159597873688, "epoch": 0.01065, "frac_reward_zero_std": 0.0, "grad_norm": 0.2690693438053131, "kl": 0.5205928236246109, "learning_rate": 9.999509979404867e-06, "loss": -0.0089, "num_tokens": 24123963.0, "reward": 0.7090663313865662, "reward_std": 1.231586217880249, "rewards/rollout_reward_func/mean": 0.7090663313865662, "rewards/rollout_reward_func/std": 1.3765558004379272, "sampling/importance_sampling_ratio/max": 0.5549708604812622, "sampling/importance_sampling_ratio/mean": 0.17650151252746582, "sampling/importance_sampling_ratio/min": 2.2387923763744766e-06, "sampling/sampling_logp_difference/max": 4.153647422790527, "sampling/sampling_logp_difference/mean": 1.2608544826507568, "step": 1065, "step_time": 8.847912993005593 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028645834885537624, "entropy": 6.306271612644196, "epoch": 0.01066, "grad_norm": 0.16857688128948212, "kl": 0.8500861637294292, "learning_rate": 9.999509026541889e-06, "loss": -0.0085, "step": 1066, "step_time": 5.035440173003735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1285.0, "completions/max_terminated_length": 1285.0, "completions/mean_length": 300.3125, "completions/mean_terminated_length": 290.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.038204371929169, "epoch": 0.01067, "frac_reward_zero_std": 0.0, "grad_norm": 0.02009822055697441, "kl": 0.3319006562232971, "learning_rate": 9.999508072753433e-06, "loss": -0.0168, "num_tokens": 24169771.0, "reward": 1.2802945375442505, "reward_std": 1.4383597373962402, "rewards/rollout_reward_func/mean": 1.2802945375442505, "rewards/rollout_reward_func/std": 1.4300031661987305, "sampling/importance_sampling_ratio/max": 0.5588461756706238, "sampling/importance_sampling_ratio/mean": 0.19842031598091125, "sampling/importance_sampling_ratio/min": 3.5347088076604846e-14, "sampling/sampling_logp_difference/max": 4.367580413818359, "sampling/sampling_logp_difference/mean": 1.2212471961975098, "step": 1067, "step_time": 10.078461177999998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.02621990442276, "epoch": 0.01068, "grad_norm": 0.01945413276553154, "kl": 0.3350803777575493, "learning_rate": 9.999507118039498e-06, "loss": -0.0168, "step": 1068, "step_time": 5.952537166005641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1072.0, "completions/max_terminated_length": 1072.0, "completions/mean_length": 266.15625, "completions/mean_terminated_length": 268.2666931152344, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.85611879825592, "epoch": 0.01069, "frac_reward_zero_std": 0.25, "grad_norm": 0.05079314112663269, "kl": 0.38549710996448994, "learning_rate": 9.999506162400088e-06, "loss": -0.0131, "num_tokens": 24213381.0, "reward": 0.7895504832267761, "reward_std": 1.3639869689941406, "rewards/rollout_reward_func/mean": 0.7895504832267761, "rewards/rollout_reward_func/std": 1.728414535522461, "sampling/importance_sampling_ratio/max": 0.5611130595207214, "sampling/importance_sampling_ratio/mean": 0.24282386898994446, "sampling/importance_sampling_ratio/min": 4.57274930407193e-23, "sampling/sampling_logp_difference/max": 13.366423606872559, "sampling/sampling_logp_difference/mean": 1.335707187652588, "step": 1069, "step_time": 9.264338504002808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.85695868730545, "epoch": 0.0107, "grad_norm": 0.047369107604026794, "kl": 0.3878761772066355, "learning_rate": 9.999505205835198e-06, "loss": -0.0132, "step": 1070, "step_time": 5.004769923998538 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 1486.0, "completions/max_terminated_length": 1486.0, "completions/mean_length": 327.09375, "completions/mean_terminated_length": 327.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.737452745437622, "epoch": 0.01071, "frac_reward_zero_std": 0.25, "grad_norm": 0.019797012209892273, "kl": 0.5470072291791439, "learning_rate": 9.999504248344831e-06, "loss": -0.0141, "num_tokens": 24256707.0, "reward": 1.2628936767578125, "reward_std": 1.1945538520812988, "rewards/rollout_reward_func/mean": 1.2628936767578125, "rewards/rollout_reward_func/std": 1.4955103397369385, "sampling/importance_sampling_ratio/max": 0.560240626335144, "sampling/importance_sampling_ratio/mean": 0.2925076186656952, "sampling/importance_sampling_ratio/min": 9.819438014438409e-13, "sampling/sampling_logp_difference/max": 4.021556854248047, "sampling/sampling_logp_difference/mean": 0.8703249096870422, "step": 1071, "step_time": 10.642893915006425 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "entropy": 4.739121198654175, "epoch": 0.01072, "grad_norm": 0.020269736647605896, "kl": 0.5467889383435249, "learning_rate": 9.99950328992899e-06, "loss": -0.0141, "step": 1072, "step_time": 5.796281797996926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.0, "completions/max_length": 1613.0, "completions/max_terminated_length": 1613.0, "completions/mean_length": 296.125, "completions/mean_terminated_length": 296.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.240292310714722, "epoch": 0.01073, "frac_reward_zero_std": 0.0, "grad_norm": 0.06690271198749542, "kl": 0.41954231541603804, "learning_rate": 9.99950233058767e-06, "loss": -0.0161, "num_tokens": 24299749.0, "reward": 0.5662968158721924, "reward_std": 1.4429824352264404, "rewards/rollout_reward_func/mean": 0.5662968158721924, "rewards/rollout_reward_func/std": 1.5547194480895996, "sampling/importance_sampling_ratio/max": 0.5596572160720825, "sampling/importance_sampling_ratio/mean": 0.17706772685050964, "sampling/importance_sampling_ratio/min": 7.818014637450688e-06, "sampling/sampling_logp_difference/max": 4.874086856842041, "sampling/sampling_logp_difference/mean": 1.3062989711761475, "step": 1073, "step_time": 10.110309698997298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.191892564296722, "epoch": 0.01074, "grad_norm": 0.04206470772624016, "kl": 0.42751661967486143, "learning_rate": 9.999501370320872e-06, "loss": -0.0162, "step": 1074, "step_time": 5.893765114993585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 296.9375, "completions/mean_terminated_length": 296.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.3983436822891235, "epoch": 0.01075, "frac_reward_zero_std": 0.25, "grad_norm": 0.031203493475914, "kl": 0.47863810416311026, "learning_rate": 9.999500409128599e-06, "loss": -0.009, "num_tokens": 24343560.0, "reward": 0.9356694221496582, "reward_std": 1.1390869617462158, "rewards/rollout_reward_func/mean": 0.9356694221496582, "rewards/rollout_reward_func/std": 1.5572612285614014, "sampling/importance_sampling_ratio/max": 0.5574838519096375, "sampling/importance_sampling_ratio/mean": 0.22111999988555908, "sampling/importance_sampling_ratio/min": 0.00015698425704613328, "sampling/sampling_logp_difference/max": 3.449831485748291, "sampling/sampling_logp_difference/mean": 1.0298564434051514, "step": 1075, "step_time": 8.957821893000073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.368903636932373, "epoch": 0.01076, "grad_norm": 0.031114546582102776, "kl": 0.48255669604986906, "learning_rate": 9.99949944701085e-06, "loss": -0.009, "step": 1076, "step_time": 4.834362776004127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 235.65625, "completions/mean_terminated_length": 235.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.181142568588257, "epoch": 0.01077, "frac_reward_zero_std": 0.25, "grad_norm": 0.011647826991975307, "kl": 0.3837782070040703, "learning_rate": 9.999498483967625e-06, "loss": -0.0136, "num_tokens": 24386710.0, "reward": 0.8459852933883667, "reward_std": 1.243207573890686, "rewards/rollout_reward_func/mean": 0.8459852933883667, "rewards/rollout_reward_func/std": 1.66226065158844, "sampling/importance_sampling_ratio/max": 0.5511637926101685, "sampling/importance_sampling_ratio/mean": 0.16585679352283478, "sampling/importance_sampling_ratio/min": 1.7308770861284355e-18, "sampling/sampling_logp_difference/max": 4.419558048248291, "sampling/sampling_logp_difference/mean": 1.301142930984497, "step": 1077, "step_time": 7.457783683003072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.10521537065506, "epoch": 0.01078, "grad_norm": 0.012406893074512482, "kl": 0.3972381763160229, "learning_rate": 9.999497519998923e-06, "loss": -0.0136, "step": 1078, "step_time": 4.018171145995439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 262.9375, "completions/mean_terminated_length": 262.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.597766816616058, "epoch": 0.01079, "frac_reward_zero_std": 0.0, "grad_norm": 0.0873761773109436, "kl": 0.5024280119687319, "learning_rate": 9.99949655510475e-06, "loss": -0.0151, "num_tokens": 24429670.0, "reward": 0.6980579495429993, "reward_std": 1.2082555294036865, "rewards/rollout_reward_func/mean": 0.6980579495429993, "rewards/rollout_reward_func/std": 1.422646403312683, "sampling/importance_sampling_ratio/max": 0.5661078691482544, "sampling/importance_sampling_ratio/mean": 0.2086608111858368, "sampling/importance_sampling_ratio/min": 5.030041734244151e-07, "sampling/sampling_logp_difference/max": 3.964533805847168, "sampling/sampling_logp_difference/mean": 1.0531718730926514, "step": 1079, "step_time": 7.772258591998252 }, { "clip_ratio/high_max": 0.06770833395421505, "clip_ratio/high_mean": 0.033854166977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033854166977107525, "entropy": 5.532522618770599, "epoch": 0.0108, "grad_norm": 0.053419556468725204, "kl": 0.5127236731350422, "learning_rate": 9.9994955892851e-06, "loss": -0.0154, "step": 1080, "step_time": 4.350427740992018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1169.0, "completions/max_terminated_length": 1169.0, "completions/mean_length": 269.65625, "completions/mean_terminated_length": 271.3666687011719, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.109251648187637, "epoch": 0.01081, "frac_reward_zero_std": 0.25, "grad_norm": 0.028345869854092598, "kl": 0.6292689889669418, "learning_rate": 9.999494622539973e-06, "loss": -0.0135, "num_tokens": 24469996.0, "reward": 1.034594178199768, "reward_std": 0.9366666078567505, "rewards/rollout_reward_func/mean": 1.034594178199768, "rewards/rollout_reward_func/std": 1.378726601600647, "sampling/importance_sampling_ratio/max": 0.552639365196228, "sampling/importance_sampling_ratio/mean": 0.29238560795783997, "sampling/importance_sampling_ratio/min": 2.3164616058668486e-10, "sampling/sampling_logp_difference/max": 3.897063732147217, "sampling/sampling_logp_difference/mean": 1.023942470550537, "step": 1081, "step_time": 9.476698570997542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.101557910442352, "epoch": 0.01082, "grad_norm": 0.03277108445763588, "kl": 0.630999643355608, "learning_rate": 9.999493654869373e-06, "loss": -0.0136, "step": 1082, "step_time": 5.206078904004244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1313.0, "completions/max_terminated_length": 1313.0, "completions/mean_length": 463.8125, "completions/mean_terminated_length": 463.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.76236492395401, "epoch": 0.01083, "frac_reward_zero_std": 0.0, "grad_norm": 0.0118155088275671, "kl": 0.1696173483505845, "learning_rate": 9.999492686273298e-06, "loss": -0.0103, "num_tokens": 24520916.0, "reward": 0.24736462533473969, "reward_std": 1.322066307067871, "rewards/rollout_reward_func/mean": 0.24736462533473969, "rewards/rollout_reward_func/std": 1.4136861562728882, "sampling/importance_sampling_ratio/max": 0.5518165826797485, "sampling/importance_sampling_ratio/mean": 0.05875719338655472, "sampling/importance_sampling_ratio/min": 2.4033530203914266e-16, "sampling/sampling_logp_difference/max": 10.651744842529297, "sampling/sampling_logp_difference/mean": 1.755528211593628, "step": 1083, "step_time": 10.097916865997831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 7.76188737154007, "epoch": 0.01084, "grad_norm": 0.012032303027808666, "kl": 0.16556764859706163, "learning_rate": 9.99949171675175e-06, "loss": -0.0104, "step": 1084, "step_time": 5.548792446996231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1336.0, "completions/max_terminated_length": 1336.0, "completions/mean_length": 385.5, "completions/mean_terminated_length": 386.58062744140625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.770015776157379, "epoch": 0.01085, "frac_reward_zero_std": 0.0, "grad_norm": 0.02979699708521366, "kl": 0.43116660974919796, "learning_rate": 9.999490746304727e-06, "loss": -0.0188, "num_tokens": 24568255.0, "reward": 0.9984549283981323, "reward_std": 1.4722868204116821, "rewards/rollout_reward_func/mean": 0.9984549283981323, "rewards/rollout_reward_func/std": 1.627169132232666, "sampling/importance_sampling_ratio/max": 0.5583681464195251, "sampling/importance_sampling_ratio/mean": 0.1862306445837021, "sampling/importance_sampling_ratio/min": 2.3455119859247376e-17, "sampling/sampling_logp_difference/max": 10.673718452453613, "sampling/sampling_logp_difference/mean": 1.3548588752746582, "step": 1085, "step_time": 10.69349789300395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.776143550872803, "epoch": 0.01086, "grad_norm": 0.028231767937541008, "kl": 0.43228245712816715, "learning_rate": 9.999489774932232e-06, "loss": -0.0189, "step": 1086, "step_time": 6.044377023998095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 185.9375, "completions/mean_terminated_length": 185.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.193114817142487, "epoch": 0.01087, "frac_reward_zero_std": 0.0, "grad_norm": 0.05902639776468277, "kl": 0.34619177505373955, "learning_rate": 9.999488802634262e-06, "loss": -0.0078, "num_tokens": 24607677.0, "reward": -0.15007475018501282, "reward_std": 1.4496508836746216, "rewards/rollout_reward_func/mean": -0.15007475018501282, "rewards/rollout_reward_func/std": 1.6219466924667358, "sampling/importance_sampling_ratio/max": 0.5610287189483643, "sampling/importance_sampling_ratio/mean": 0.2334195226430893, "sampling/importance_sampling_ratio/min": 1.1668188544849327e-16, "sampling/sampling_logp_difference/max": 4.805532455444336, "sampling/sampling_logp_difference/mean": 1.3346843719482422, "step": 1087, "step_time": 7.985551629997644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.206127166748047, "epoch": 0.01088, "grad_norm": 0.05766480416059494, "kl": 0.3416685648262501, "learning_rate": 9.999487829410819e-06, "loss": -0.0079, "step": 1088, "step_time": 4.3286669889967015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1895.0, "completions/max_terminated_length": 1895.0, "completions/mean_length": 319.6875, "completions/mean_terminated_length": 319.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.867032051086426, "epoch": 0.01089, "frac_reward_zero_std": 0.0, "grad_norm": 0.017097342759370804, "kl": 0.2451300173997879, "learning_rate": 9.999486855261904e-06, "loss": -0.0133, "num_tokens": 24653309.0, "reward": 0.11823242902755737, "reward_std": 1.1027235984802246, "rewards/rollout_reward_func/mean": 0.11823242902755737, "rewards/rollout_reward_func/std": 1.3343849182128906, "sampling/importance_sampling_ratio/max": 0.5538328886032104, "sampling/importance_sampling_ratio/mean": 0.12759700417518616, "sampling/importance_sampling_ratio/min": 3.355353896949964e-07, "sampling/sampling_logp_difference/max": 2.8829073905944824, "sampling/sampling_logp_difference/mean": 1.472942590713501, "step": 1089, "step_time": 11.562311220997799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.885697424411774, "epoch": 0.0109, "grad_norm": 0.017398208379745483, "kl": 0.24504855647683144, "learning_rate": 9.999485880187515e-06, "loss": -0.0133, "step": 1090, "step_time": 7.007250469003338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1713.0, "completions/max_terminated_length": 1713.0, "completions/mean_length": 292.1875, "completions/mean_terminated_length": 286.8387145996094, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.419333398342133, "epoch": 0.01091, "frac_reward_zero_std": 0.25, "grad_norm": 0.0464060977101326, "kl": 0.34547716099768877, "learning_rate": 9.999484904187655e-06, "loss": -0.0168, "num_tokens": 24697132.0, "reward": 0.7262570858001709, "reward_std": 1.146662712097168, "rewards/rollout_reward_func/mean": 0.7262570858001709, "rewards/rollout_reward_func/std": 1.6518141031265259, "sampling/importance_sampling_ratio/max": 0.5587000250816345, "sampling/importance_sampling_ratio/mean": 0.21253974735736847, "sampling/importance_sampling_ratio/min": 1.582047937903812e-17, "sampling/sampling_logp_difference/max": 4.074856758117676, "sampling/sampling_logp_difference/mean": 1.3308457136154175, "step": 1091, "step_time": 10.773705290997896 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 6.42773973941803, "epoch": 0.01092, "grad_norm": 0.024979280307888985, "kl": 0.3501308150589466, "learning_rate": 9.99948392726232e-06, "loss": -0.017, "step": 1092, "step_time": 6.419730198002071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1411.0, "completions/max_terminated_length": 1411.0, "completions/mean_length": 412.53125, "completions/mean_terminated_length": 411.0666809082031, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.57367479801178, "epoch": 0.01093, "frac_reward_zero_std": 0.0, "grad_norm": 0.01740359701216221, "kl": 0.31903981510549784, "learning_rate": 9.999482949411516e-06, "loss": -0.0112, "num_tokens": 24746753.0, "reward": 0.43867939710617065, "reward_std": 1.617113471031189, "rewards/rollout_reward_func/mean": 0.43867939710617065, "rewards/rollout_reward_func/std": 1.7692135572433472, "sampling/importance_sampling_ratio/max": 0.5612088441848755, "sampling/importance_sampling_ratio/mean": 0.1270546168088913, "sampling/importance_sampling_ratio/min": 3.2604248962574665e-16, "sampling/sampling_logp_difference/max": 4.6766839027404785, "sampling/sampling_logp_difference/mean": 1.3577792644500732, "step": 1093, "step_time": 10.38357633099804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.5711599588394165, "epoch": 0.01094, "grad_norm": 0.017216991633176804, "kl": 0.3190868869423866, "learning_rate": 9.99948197063524e-06, "loss": -0.0113, "step": 1094, "step_time": 5.783610600003158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 71.96875, "completions/mean_terminated_length": 73.7741928100586, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.7160235047340393, "epoch": 0.01095, "frac_reward_zero_std": 0.5, "grad_norm": 0.026246776804327965, "kl": 0.6986615546047688, "learning_rate": 9.999480990933493e-06, "loss": -0.0115, "num_tokens": 24778935.0, "reward": 1.7002216577529907, "reward_std": 0.7784755229949951, "rewards/rollout_reward_func/mean": 1.7002216577529907, "rewards/rollout_reward_func/std": 1.0990650653839111, "sampling/importance_sampling_ratio/max": 0.560905396938324, "sampling/importance_sampling_ratio/mean": 0.44183486700057983, "sampling/importance_sampling_ratio/min": 4.938936087363344e-13, "sampling/sampling_logp_difference/max": 4.647972106933594, "sampling/sampling_logp_difference/mean": 0.6740753650665283, "step": 1095, "step_time": 6.841457753002032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.7146700620651245, "epoch": 0.01096, "grad_norm": 0.025372367352247238, "kl": 0.6987457051873207, "learning_rate": 9.999480010306273e-06, "loss": -0.0116, "step": 1096, "step_time": 4.277789877996838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 265.0, "completions/mean_terminated_length": 263.7419128417969, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.2285685539245605, "epoch": 0.01097, "frac_reward_zero_std": 0.25, "grad_norm": 0.02693038433790207, "kl": 0.3670834256336093, "learning_rate": 9.999479028753583e-06, "loss": -0.0175, "num_tokens": 24823237.0, "reward": 0.749638557434082, "reward_std": 1.1460449695587158, "rewards/rollout_reward_func/mean": 0.749638557434082, "rewards/rollout_reward_func/std": 1.504583477973938, "sampling/importance_sampling_ratio/max": 0.5607194304466248, "sampling/importance_sampling_ratio/mean": 0.2056369185447693, "sampling/importance_sampling_ratio/min": 2.24052205358922e-16, "sampling/sampling_logp_difference/max": 13.146407127380371, "sampling/sampling_logp_difference/mean": 1.3319997787475586, "step": 1097, "step_time": 8.13435443400158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.251648545265198, "epoch": 0.01098, "grad_norm": 0.024227043613791466, "kl": 0.3635488422587514, "learning_rate": 9.999478046275422e-06, "loss": -0.0175, "step": 1098, "step_time": 4.65649045000464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1134.0, "completions/max_terminated_length": 1134.0, "completions/mean_length": 379.65625, "completions/mean_terminated_length": 379.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.684715509414673, "epoch": 0.01099, "frac_reward_zero_std": 0.25, "grad_norm": 0.06687144935131073, "kl": 0.6566080823540688, "learning_rate": 9.99947706287179e-06, "loss": -0.0184, "num_tokens": 24869769.0, "reward": 1.591308832168579, "reward_std": 1.145331859588623, "rewards/rollout_reward_func/mean": 1.591308832168579, "rewards/rollout_reward_func/std": 1.3889764547348022, "sampling/importance_sampling_ratio/max": 0.5607499480247498, "sampling/importance_sampling_ratio/mean": 0.25013649463653564, "sampling/importance_sampling_ratio/min": 8.30700810183771e-06, "sampling/sampling_logp_difference/max": 3.2270865440368652, "sampling/sampling_logp_difference/mean": 0.8217568397521973, "step": 1099, "step_time": 9.098182043999259 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 4.6810691356658936, "epoch": 0.011, "grad_norm": 0.02549157291650772, "kl": 0.6630642674863338, "learning_rate": 9.999476078542688e-06, "loss": -0.0187, "step": 1100, "step_time": 5.163844056001835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 308.9375, "completions/mean_terminated_length": 317.0000305175781, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.321139872074127, "epoch": 0.01101, "frac_reward_zero_std": 0.0, "grad_norm": 0.05975858494639397, "kl": 0.21871075127273798, "learning_rate": 9.999475093288116e-06, "loss": -0.0138, "num_tokens": 24916121.0, "reward": 0.2556132376194, "reward_std": 1.587601900100708, "rewards/rollout_reward_func/mean": 0.2556132376194, "rewards/rollout_reward_func/std": 1.5825738906860352, "sampling/importance_sampling_ratio/max": 0.55581134557724, "sampling/importance_sampling_ratio/mean": 0.12492796778678894, "sampling/importance_sampling_ratio/min": 4.522733480655521e-16, "sampling/sampling_logp_difference/max": 11.762374877929688, "sampling/sampling_logp_difference/mean": 1.7083046436309814, "step": 1101, "step_time": 8.065794154994364 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 7.324125528335571, "epoch": 0.01102, "grad_norm": 0.04784378781914711, "kl": 0.2148186769336462, "learning_rate": 9.999474107108074e-06, "loss": -0.0141, "step": 1102, "step_time": 4.688065578000533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1666.0, "completions/max_terminated_length": 1666.0, "completions/mean_length": 209.15625, "completions/mean_terminated_length": 215.3870849609375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.916422128677368, "epoch": 0.01103, "frac_reward_zero_std": 0.25, "grad_norm": 0.005404180847108364, "kl": 0.5954967141151428, "learning_rate": 9.999473120002564e-06, "loss": -0.0146, "num_tokens": 24954478.0, "reward": 1.7492706775665283, "reward_std": 1.0797343254089355, "rewards/rollout_reward_func/mean": 1.7492706775665283, "rewards/rollout_reward_func/std": 1.2744605541229248, "sampling/importance_sampling_ratio/max": 0.5607761740684509, "sampling/importance_sampling_ratio/mean": 0.31855762004852295, "sampling/importance_sampling_ratio/min": 9.484044904242197e-30, "sampling/sampling_logp_difference/max": 8.329172134399414, "sampling/sampling_logp_difference/mean": 1.2392909526824951, "step": 1103, "step_time": 10.56051956799638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.915634095668793, "epoch": 0.01104, "grad_norm": 0.0055740103125572205, "kl": 0.5943596735596657, "learning_rate": 9.999472131971582e-06, "loss": -0.0146, "step": 1104, "step_time": 6.2221913280045555 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.03125, "completions/max_length": 1099.0, "completions/max_terminated_length": 1099.0, "completions/mean_length": 425.90625, "completions/mean_terminated_length": 421.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.659493029117584, "epoch": 0.01105, "frac_reward_zero_std": 0.25, "grad_norm": 0.011271463707089424, "kl": 0.5371679738163948, "learning_rate": 9.999471143015132e-06, "loss": -0.0114, "num_tokens": 25002644.0, "reward": 1.3611294031143188, "reward_std": 1.3354038000106812, "rewards/rollout_reward_func/mean": 1.3611294031143188, "rewards/rollout_reward_func/std": 1.5210046768188477, "sampling/importance_sampling_ratio/max": 0.56157386302948, "sampling/importance_sampling_ratio/mean": 0.22773835062980652, "sampling/importance_sampling_ratio/min": 5.001857667269916e-16, "sampling/sampling_logp_difference/max": 4.208077430725098, "sampling/sampling_logp_difference/mean": 1.0797451734542847, "step": 1105, "step_time": 9.67497255300259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.653162807226181, "epoch": 0.01106, "grad_norm": 0.031259700655937195, "kl": 0.547290007583797, "learning_rate": 9.999470153133216e-06, "loss": -0.0114, "step": 1106, "step_time": 5.13408255199829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1775.0, "completions/max_terminated_length": 1775.0, "completions/mean_length": 613.65625, "completions/mean_terminated_length": 613.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.827561020851135, "epoch": 0.01107, "frac_reward_zero_std": 0.0, "grad_norm": 0.01689976267516613, "kl": 0.31990708876401186, "learning_rate": 9.999469162325828e-06, "loss": -0.0127, "num_tokens": 25058635.0, "reward": 0.765641987323761, "reward_std": 1.4701080322265625, "rewards/rollout_reward_func/mean": 0.765641987323761, "rewards/rollout_reward_func/std": 1.5636703968048096, "sampling/importance_sampling_ratio/max": 0.5471782088279724, "sampling/importance_sampling_ratio/mean": 0.08802895247936249, "sampling/importance_sampling_ratio/min": 1.2972591001414457e-08, "sampling/sampling_logp_difference/max": 3.4549918174743652, "sampling/sampling_logp_difference/mean": 1.3387446403503418, "step": 1107, "step_time": 12.238770746993396 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 6.81325626373291, "epoch": 0.01108, "grad_norm": 0.016052475199103355, "kl": 0.32096936786547303, "learning_rate": 9.999468170592971e-06, "loss": -0.0127, "step": 1108, "step_time": 6.853674661000696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 277.34375, "completions/mean_terminated_length": 277.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.904060184955597, "epoch": 0.01109, "frac_reward_zero_std": 0.25, "grad_norm": 0.04626591503620148, "kl": 0.5913947764784098, "learning_rate": 9.999467177934649e-06, "loss": -0.0114, "num_tokens": 25101876.0, "reward": 0.46842870116233826, "reward_std": 0.848312497138977, "rewards/rollout_reward_func/mean": 0.46842870116233826, "rewards/rollout_reward_func/std": 1.4511345624923706, "sampling/importance_sampling_ratio/max": 0.5650038123130798, "sampling/importance_sampling_ratio/mean": 0.26913997530937195, "sampling/importance_sampling_ratio/min": 4.984331098967232e-06, "sampling/sampling_logp_difference/max": 3.299711227416992, "sampling/sampling_logp_difference/mean": 1.1937808990478516, "step": 1109, "step_time": 9.320070263005618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 5.8925551772117615, "epoch": 0.0111, "grad_norm": 0.04200401157140732, "kl": 0.5947797577828169, "learning_rate": 9.999466184350858e-06, "loss": -0.0116, "step": 1110, "step_time": 4.860965619001945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 206.28125, "completions/mean_terminated_length": 206.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.227772980928421, "epoch": 0.01111, "frac_reward_zero_std": 0.5, "grad_norm": 0.01207348145544529, "kl": 0.8967251926660538, "learning_rate": 9.999465189841599e-06, "loss": -0.0088, "num_tokens": 25143141.0, "reward": 1.246354103088379, "reward_std": 0.6140555143356323, "rewards/rollout_reward_func/mean": 1.246354103088379, "rewards/rollout_reward_func/std": 1.4759048223495483, "sampling/importance_sampling_ratio/max": 0.570247232913971, "sampling/importance_sampling_ratio/mean": 0.3569905161857605, "sampling/importance_sampling_ratio/min": 1.140009463475043e-19, "sampling/sampling_logp_difference/max": 10.652325630187988, "sampling/sampling_logp_difference/mean": 0.8238855600357056, "step": 1111, "step_time": 7.701142490997881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.218030571937561, "epoch": 0.01112, "grad_norm": 0.025653090327978134, "kl": 0.9043528437614441, "learning_rate": 9.999464194406873e-06, "loss": -0.0089, "step": 1112, "step_time": 4.033454294996773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1190.0, "completions/max_terminated_length": 1190.0, "completions/mean_length": 404.65625, "completions/mean_terminated_length": 404.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.216558456420898, "epoch": 0.01113, "frac_reward_zero_std": 0.0, "grad_norm": 0.018353380262851715, "kl": 0.29416167456656694, "learning_rate": 9.99946319804668e-06, "loss": 0.0044, "num_tokens": 25191459.0, "reward": 0.5225840210914612, "reward_std": 1.0620299577713013, "rewards/rollout_reward_func/mean": 0.5225840210914612, "rewards/rollout_reward_func/std": 1.3359224796295166, "sampling/importance_sampling_ratio/max": 0.5630428194999695, "sampling/importance_sampling_ratio/mean": 0.1327565312385559, "sampling/importance_sampling_ratio/min": 7.63420292151755e-15, "sampling/sampling_logp_difference/max": 3.9861602783203125, "sampling/sampling_logp_difference/mean": 1.475992202758789, "step": 1113, "step_time": 9.907218530999671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.216469705104828, "epoch": 0.01114, "grad_norm": 0.018476618453860283, "kl": 0.2924752561375499, "learning_rate": 9.999462200761019e-06, "loss": 0.0044, "step": 1114, "step_time": 5.4009358570001496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1100.0, "completions/max_terminated_length": 1100.0, "completions/mean_length": 357.40625, "completions/mean_terminated_length": 357.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.716992080211639, "epoch": 0.01115, "frac_reward_zero_std": 0.25, "grad_norm": 0.05857764557003975, "kl": 0.5729916617274284, "learning_rate": 9.999461202549894e-06, "loss": -0.0129, "num_tokens": 25238072.0, "reward": 0.956305980682373, "reward_std": 1.1312838792800903, "rewards/rollout_reward_func/mean": 0.956305980682373, "rewards/rollout_reward_func/std": 1.4631398916244507, "sampling/importance_sampling_ratio/max": 0.5527782440185547, "sampling/importance_sampling_ratio/mean": 0.2145252227783203, "sampling/importance_sampling_ratio/min": 4.414633399509385e-14, "sampling/sampling_logp_difference/max": 9.391061782836914, "sampling/sampling_logp_difference/mean": 1.2842175960540771, "step": 1115, "step_time": 9.48010710899689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.714677274227142, "epoch": 0.01116, "grad_norm": 0.05772817134857178, "kl": 0.5718969572335482, "learning_rate": 9.9994602034133e-06, "loss": -0.013, "step": 1116, "step_time": 5.100726758999372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 947.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 267.75, "completions/mean_terminated_length": 275.8709716796875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.530094772577286, "epoch": 0.01117, "frac_reward_zero_std": 0.25, "grad_norm": 0.024605831131339073, "kl": 0.6736784391105175, "learning_rate": 9.999459203351241e-06, "loss": -0.0164, "num_tokens": 25282160.0, "reward": 0.5775565505027771, "reward_std": 0.935706615447998, "rewards/rollout_reward_func/mean": 0.5775565505027771, "rewards/rollout_reward_func/std": 1.466478943824768, "sampling/importance_sampling_ratio/max": 0.5611785054206848, "sampling/importance_sampling_ratio/mean": 0.2552052140235901, "sampling/importance_sampling_ratio/min": 1.9709349707852653e-09, "sampling/sampling_logp_difference/max": 3.804236888885498, "sampling/sampling_logp_difference/mean": 0.8511437773704529, "step": 1117, "step_time": 8.829161906007357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.529885768890381, "epoch": 0.01118, "grad_norm": 0.024702219292521477, "kl": 0.6728467810899019, "learning_rate": 9.999458202363715e-06, "loss": -0.0165, "step": 1118, "step_time": 4.7648973319992365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1516.0, "completions/max_terminated_length": 1516.0, "completions/mean_length": 312.71875, "completions/mean_terminated_length": 312.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.9077229499816895, "epoch": 0.01119, "frac_reward_zero_std": 0.25, "grad_norm": 0.025360548868775368, "kl": 0.3883872274309397, "learning_rate": 9.999457200450725e-06, "loss": -0.0094, "num_tokens": 25326623.0, "reward": 0.6521519422531128, "reward_std": 1.499039649963379, "rewards/rollout_reward_func/mean": 0.6521519422531128, "rewards/rollout_reward_func/std": 1.9182639122009277, "sampling/importance_sampling_ratio/max": 0.5620541572570801, "sampling/importance_sampling_ratio/mean": 0.21710965037345886, "sampling/importance_sampling_ratio/min": 1.4964939287764878e-09, "sampling/sampling_logp_difference/max": 2.625535726547241, "sampling/sampling_logp_difference/mean": 1.2057082653045654, "step": 1119, "step_time": 10.583502110999689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.921511769294739, "epoch": 0.0112, "grad_norm": 0.025504738092422485, "kl": 0.3847852237522602, "learning_rate": 9.99945619761227e-06, "loss": -0.0094, "step": 1120, "step_time": 5.819067665001057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1390.0, "completions/max_terminated_length": 1390.0, "completions/mean_length": 558.6875, "completions/mean_terminated_length": 558.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.027631014585495, "epoch": 0.01121, "frac_reward_zero_std": 0.0, "grad_norm": 0.048179399222135544, "kl": 0.5122455153614283, "learning_rate": 9.999455193848349e-06, "loss": -0.0091, "num_tokens": 25380606.0, "reward": 1.5443708896636963, "reward_std": 0.9392328262329102, "rewards/rollout_reward_func/mean": 1.5443708896636963, "rewards/rollout_reward_func/std": 1.2906750440597534, "sampling/importance_sampling_ratio/max": 0.5460683107376099, "sampling/importance_sampling_ratio/mean": 0.20111960172653198, "sampling/importance_sampling_ratio/min": 1.1819025758086354e-08, "sampling/sampling_logp_difference/max": 2.775179386138916, "sampling/sampling_logp_difference/mean": 1.158867597579956, "step": 1121, "step_time": 10.964401287008513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.0700856149196625, "epoch": 0.01122, "grad_norm": 0.06684669852256775, "kl": 0.5069806426763535, "learning_rate": 9.999454189158961e-06, "loss": -0.0095, "step": 1122, "step_time": 5.790028181992966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1479.0, "completions/max_terminated_length": 1479.0, "completions/mean_length": 410.9375, "completions/mean_terminated_length": 410.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.8716588616371155, "epoch": 0.01123, "frac_reward_zero_std": 0.0, "grad_norm": 0.06265155971050262, "kl": 0.49451169930398464, "learning_rate": 9.999453183544113e-06, "loss": -0.0141, "num_tokens": 25429431.0, "reward": 1.1883385181427002, "reward_std": 1.0743381977081299, "rewards/rollout_reward_func/mean": 1.1883385181427002, "rewards/rollout_reward_func/std": 1.5807987451553345, "sampling/importance_sampling_ratio/max": 0.5541650056838989, "sampling/importance_sampling_ratio/mean": 0.18586567044258118, "sampling/importance_sampling_ratio/min": 4.172178400083419e-26, "sampling/sampling_logp_difference/max": 4.770873546600342, "sampling/sampling_logp_difference/mean": 1.0972920656204224, "step": 1123, "step_time": 10.603984777993901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.033333334140479565, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033333334140479565, "entropy": 5.993392288684845, "epoch": 0.01124, "grad_norm": 0.045149173587560654, "kl": 0.481368076056242, "learning_rate": 9.999452177003797e-06, "loss": -0.0147, "step": 1124, "step_time": 5.86596112999905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 931.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 328.78125, "completions/mean_terminated_length": 328.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.523647308349609, "epoch": 0.01125, "frac_reward_zero_std": 0.0, "grad_norm": 0.05395124480128288, "kl": 0.436585683375597, "learning_rate": 9.999451169538017e-06, "loss": -0.0191, "num_tokens": 25477440.0, "reward": 0.719426155090332, "reward_std": 1.3481881618499756, "rewards/rollout_reward_func/mean": 0.719426155090332, "rewards/rollout_reward_func/std": 1.6684781312942505, "sampling/importance_sampling_ratio/max": 0.5514923334121704, "sampling/importance_sampling_ratio/mean": 0.1781424880027771, "sampling/importance_sampling_ratio/min": 3.1462354854738805e-06, "sampling/sampling_logp_difference/max": 2.8042750358581543, "sampling/sampling_logp_difference/mean": 1.0820921659469604, "step": 1125, "step_time": 8.849361826996756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.026041666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026041666977107525, "entropy": 5.530317783355713, "epoch": 0.01126, "grad_norm": 0.019150974228978157, "kl": 0.4462257204577327, "learning_rate": 9.999450161146776e-06, "loss": -0.0193, "step": 1126, "step_time": 4.790766353009531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1177.0, "completions/max_terminated_length": 1177.0, "completions/mean_length": 315.09375, "completions/mean_terminated_length": 315.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.891977906227112, "epoch": 0.01127, "frac_reward_zero_std": 0.25, "grad_norm": 0.005297617055475712, "kl": 0.3688208796083927, "learning_rate": 9.999449151830068e-06, "loss": -0.0078, "num_tokens": 25521076.0, "reward": 0.6203365325927734, "reward_std": 1.1878390312194824, "rewards/rollout_reward_func/mean": 0.6203365325927734, "rewards/rollout_reward_func/std": 1.6866267919540405, "sampling/importance_sampling_ratio/max": 0.5536226630210876, "sampling/importance_sampling_ratio/mean": 0.1746487021446228, "sampling/importance_sampling_ratio/min": 5.198809560624795e-08, "sampling/sampling_logp_difference/max": 4.363369941711426, "sampling/sampling_logp_difference/mean": 1.5076605081558228, "step": 1127, "step_time": 10.018585816007544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.902256786823273, "epoch": 0.01128, "grad_norm": 0.0065424079075455666, "kl": 0.37937339302152395, "learning_rate": 9.999448141587897e-06, "loss": -0.0079, "step": 1128, "step_time": 5.173658201998478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.0625, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 155.40625, "completions/mean_terminated_length": 164.70001220703125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.005222797393799, "epoch": 0.01129, "frac_reward_zero_std": 0.0, "grad_norm": 0.049322593957185745, "kl": 0.6988685503602028, "learning_rate": 9.999447130420266e-06, "loss": -0.0186, "num_tokens": 25560606.0, "reward": 0.4501684308052063, "reward_std": 1.5753552913665771, "rewards/rollout_reward_func/mean": 0.4501684308052063, "rewards/rollout_reward_func/std": 1.595892310142517, "sampling/importance_sampling_ratio/max": 0.5539711713790894, "sampling/importance_sampling_ratio/mean": 0.1650979220867157, "sampling/importance_sampling_ratio/min": 2.4405372167635164e-10, "sampling/sampling_logp_difference/max": 4.692249298095703, "sampling/sampling_logp_difference/mean": 1.2380516529083252, "step": 1129, "step_time": 7.077404362997186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.007034063339233, "epoch": 0.0113, "grad_norm": 0.04904624819755554, "kl": 0.7143141888082027, "learning_rate": 9.99944611832717e-06, "loss": -0.0185, "step": 1130, "step_time": 3.7704440409979725 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 1535.0, "completions/max_terminated_length": 1535.0, "completions/mean_length": 487.0625, "completions/mean_terminated_length": 487.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.3235228061676025, "epoch": 0.01131, "frac_reward_zero_std": 0.0, "grad_norm": 0.054586853832006454, "kl": 0.4841133002191782, "learning_rate": 9.99944510530861e-06, "loss": -0.015, "num_tokens": 25613556.0, "reward": 0.7178966403007507, "reward_std": 1.291111707687378, "rewards/rollout_reward_func/mean": 0.7178966403007507, "rewards/rollout_reward_func/std": 1.5371392965316772, "sampling/importance_sampling_ratio/max": 0.5397340655326843, "sampling/importance_sampling_ratio/mean": 0.10888680815696716, "sampling/importance_sampling_ratio/min": 2.2010060370717812e-21, "sampling/sampling_logp_difference/max": 4.692408561706543, "sampling/sampling_logp_difference/mean": 1.3125965595245361, "step": 1131, "step_time": 11.378681927002617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008049242664128542, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008049242664128542, "entropy": 6.303860127925873, "epoch": 0.01132, "grad_norm": 0.06713315844535828, "kl": 0.5071947574615479, "learning_rate": 9.99944409136459e-06, "loss": -0.0153, "step": 1132, "step_time": 6.222999293997418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 420.75, "completions/mean_terminated_length": 420.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.324622869491577, "epoch": 0.01133, "frac_reward_zero_std": 0.0, "grad_norm": 0.022111594676971436, "kl": 0.26528350356966257, "learning_rate": 9.999443076495105e-06, "loss": -0.0108, "num_tokens": 25664024.0, "reward": 1.0434842109680176, "reward_std": 1.21474289894104, "rewards/rollout_reward_func/mean": 1.0434842109680176, "rewards/rollout_reward_func/std": 1.5313764810562134, "sampling/importance_sampling_ratio/max": 0.34059953689575195, "sampling/importance_sampling_ratio/mean": 0.0871090367436409, "sampling/importance_sampling_ratio/min": 7.02831014289074e-17, "sampling/sampling_logp_difference/max": 10.716975212097168, "sampling/sampling_logp_difference/mean": 1.6835284233093262, "step": 1133, "step_time": 7.85132963399883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.30375075340271, "epoch": 0.01134, "grad_norm": 0.01927625946700573, "kl": 0.26385080721229315, "learning_rate": 9.999442060700163e-06, "loss": -0.0108, "step": 1134, "step_time": 4.157742798000982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 330.46875, "completions/mean_terminated_length": 325.4193420410156, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.41002082824707, "epoch": 0.01135, "frac_reward_zero_std": 0.0, "grad_norm": 0.01332632265985012, "kl": 0.3802255894988775, "learning_rate": 9.999441043979755e-06, "loss": -0.0063, "num_tokens": 25710055.0, "reward": 0.1597970575094223, "reward_std": 1.0064942836761475, "rewards/rollout_reward_func/mean": 0.1597970575094223, "rewards/rollout_reward_func/std": 1.166365623474121, "sampling/importance_sampling_ratio/max": 0.5477599501609802, "sampling/importance_sampling_ratio/mean": 0.09012825787067413, "sampling/importance_sampling_ratio/min": 1.7566890520023608e-15, "sampling/sampling_logp_difference/max": 4.571268081665039, "sampling/sampling_logp_difference/mean": 1.6146571636199951, "step": 1135, "step_time": 9.053582693999488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.389611661434174, "epoch": 0.01136, "grad_norm": 0.013890893198549747, "kl": 0.3794155474752188, "learning_rate": 9.999440026333887e-06, "loss": -0.0063, "step": 1136, "step_time": 5.206487745006598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 62.875, "completions/mean_terminated_length": 62.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.338386178016663, "epoch": 0.01137, "frac_reward_zero_std": 0.5, "grad_norm": 0.009776446968317032, "kl": 0.6005594953894615, "learning_rate": 9.999439007762558e-06, "loss": -0.0004, "num_tokens": 25742284.0, "reward": 1.2375695705413818, "reward_std": 0.7371745109558105, "rewards/rollout_reward_func/mean": 1.2375695705413818, "rewards/rollout_reward_func/std": 1.3037123680114746, "sampling/importance_sampling_ratio/max": 0.5623223781585693, "sampling/importance_sampling_ratio/mean": 0.30572256445884705, "sampling/importance_sampling_ratio/min": 7.489823872219858e-09, "sampling/sampling_logp_difference/max": 4.09085750579834, "sampling/sampling_logp_difference/mean": 1.037245273590088, "step": 1137, "step_time": 6.664256775002286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.32789620757103, "epoch": 0.01138, "grad_norm": 0.009995573200285435, "kl": 0.596720390021801, "learning_rate": 9.999437988265768e-06, "loss": -0.0004, "step": 1138, "step_time": 4.145821828995395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 336.59375, "completions/mean_terminated_length": 336.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.0691832304000854, "epoch": 0.01139, "frac_reward_zero_std": 0.25, "grad_norm": 0.048314418643713, "kl": 0.41234034951776266, "learning_rate": 9.999436967843518e-06, "loss": -0.0092, "num_tokens": 25788336.0, "reward": 0.9126017093658447, "reward_std": 1.453322172164917, "rewards/rollout_reward_func/mean": 0.9126017093658447, "rewards/rollout_reward_func/std": 1.8128958940505981, "sampling/importance_sampling_ratio/max": 0.5592928528785706, "sampling/importance_sampling_ratio/mean": 0.20547929406166077, "sampling/importance_sampling_ratio/min": 4.743167730868899e-38, "sampling/sampling_logp_difference/max": 4.612776279449463, "sampling/sampling_logp_difference/mean": 1.2842769622802734, "step": 1139, "step_time": 8.210812855995755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 6.062258183956146, "epoch": 0.0114, "grad_norm": 0.033690761774778366, "kl": 0.4092285195365548, "learning_rate": 9.999435946495807e-06, "loss": -0.0092, "step": 1140, "step_time": 4.519071808997978 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 112.875, "completions/mean_terminated_length": 112.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.768404066562653, "epoch": 0.01141, "frac_reward_zero_std": 0.25, "grad_norm": 0.06602641195058823, "kl": 0.6559855528175831, "learning_rate": 9.999434924222635e-06, "loss": -0.015, "num_tokens": 25822309.0, "reward": 1.522886037826538, "reward_std": 1.0729470252990723, "rewards/rollout_reward_func/mean": 1.522886037826538, "rewards/rollout_reward_func/std": 1.2687129974365234, "sampling/importance_sampling_ratio/max": 0.5578706860542297, "sampling/importance_sampling_ratio/mean": 0.3515391945838928, "sampling/importance_sampling_ratio/min": 1.3350893504073761e-14, "sampling/sampling_logp_difference/max": 2.948362350463867, "sampling/sampling_logp_difference/mean": 0.9640525579452515, "step": 1141, "step_time": 6.708681707004871 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 4.666583627462387, "epoch": 0.01142, "grad_norm": 0.0535547249019146, "kl": 0.6712718233466148, "learning_rate": 9.999433901024004e-06, "loss": -0.0151, "step": 1142, "step_time": 4.166410956997424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1964.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 654.625, "completions/mean_terminated_length": 638.0967407226562, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.480511784553528, "epoch": 0.01143, "frac_reward_zero_std": 0.0, "grad_norm": 0.0676465854048729, "kl": 0.3099858039058745, "learning_rate": 9.999432876899914e-06, "loss": -0.0082, "num_tokens": 25880141.0, "reward": 0.8967109322547913, "reward_std": 1.3265072107315063, "rewards/rollout_reward_func/mean": 0.8967109322547913, "rewards/rollout_reward_func/std": 1.7250043153762817, "sampling/importance_sampling_ratio/max": 0.5537680983543396, "sampling/importance_sampling_ratio/mean": 0.10339820384979248, "sampling/importance_sampling_ratio/min": 1.5927405228044478e-11, "sampling/sampling_logp_difference/max": 3.799678325653076, "sampling/sampling_logp_difference/mean": 1.13993501663208, "step": 1143, "step_time": 12.783452173003752 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 6.452073335647583, "epoch": 0.01144, "grad_norm": 0.04847890883684158, "kl": 0.32003419008105993, "learning_rate": 9.999431851850363e-06, "loss": -0.0084, "step": 1144, "step_time": 7.2304999440057145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1045.0, "completions/max_terminated_length": 1045.0, "completions/mean_length": 444.28125, "completions/mean_terminated_length": 458.0967712402344, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.285528153181076, "epoch": 0.01145, "frac_reward_zero_std": 0.25, "grad_norm": 0.05462197959423065, "kl": 0.6054706303402781, "learning_rate": 9.999430825875353e-06, "loss": -0.0105, "num_tokens": 25928405.0, "reward": 1.4337692260742188, "reward_std": 1.350130319595337, "rewards/rollout_reward_func/mean": 1.4337692260742188, "rewards/rollout_reward_func/std": 1.7225103378295898, "sampling/importance_sampling_ratio/max": 0.5505104064941406, "sampling/importance_sampling_ratio/mean": 0.25080665946006775, "sampling/importance_sampling_ratio/min": 5.787223589237556e-09, "sampling/sampling_logp_difference/max": 3.8762717247009277, "sampling/sampling_logp_difference/mean": 0.8401609063148499, "step": 1145, "step_time": 9.380934921002336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.285090863704681, "epoch": 0.01146, "grad_norm": 0.03964316099882126, "kl": 0.6095951460301876, "learning_rate": 9.999429798974887e-06, "loss": -0.0107, "step": 1146, "step_time": 5.109421616998588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1049.0, "completions/max_terminated_length": 1049.0, "completions/mean_length": 301.125, "completions/mean_terminated_length": 301.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.19139689207077, "epoch": 0.01147, "frac_reward_zero_std": 0.0, "grad_norm": 0.05050928518176079, "kl": 0.49416646361351013, "learning_rate": 9.99942877114896e-06, "loss": 0.001, "num_tokens": 25973707.0, "reward": 1.129353642463684, "reward_std": 1.2446765899658203, "rewards/rollout_reward_func/mean": 1.129353642463684, "rewards/rollout_reward_func/std": 1.4160304069519043, "sampling/importance_sampling_ratio/max": 0.5719518661499023, "sampling/importance_sampling_ratio/mean": 0.1848665475845337, "sampling/importance_sampling_ratio/min": 1.145783517131349e-05, "sampling/sampling_logp_difference/max": 2.364365577697754, "sampling/sampling_logp_difference/mean": 1.1065893173217773, "step": 1147, "step_time": 9.038483011001517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.178856909275055, "epoch": 0.01148, "grad_norm": 0.05470314994454384, "kl": 0.5002294294536114, "learning_rate": 9.999427742397575e-06, "loss": 0.0009, "step": 1148, "step_time": 5.419274701998802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1481.0, "completions/max_terminated_length": 1481.0, "completions/mean_length": 329.1875, "completions/mean_terminated_length": 329.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.159745693206787, "epoch": 0.01149, "frac_reward_zero_std": 0.25, "grad_norm": 0.019235078245401382, "kl": 0.8716816492378712, "learning_rate": 9.999426712720733e-06, "loss": -0.0104, "num_tokens": 26017656.0, "reward": 1.4250731468200684, "reward_std": 1.0316553115844727, "rewards/rollout_reward_func/mean": 1.4250731468200684, "rewards/rollout_reward_func/std": 1.2620258331298828, "sampling/importance_sampling_ratio/max": 0.5594328045845032, "sampling/importance_sampling_ratio/mean": 0.31660667061805725, "sampling/importance_sampling_ratio/min": 3.730694515979849e-05, "sampling/sampling_logp_difference/max": 4.789341449737549, "sampling/sampling_logp_difference/mean": 0.720919668674469, "step": 1149, "step_time": 10.6669371220014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.156437486410141, "epoch": 0.0115, "grad_norm": 0.019346104934811592, "kl": 0.8735330030322075, "learning_rate": 9.99942568211843e-06, "loss": -0.0104, "step": 1150, "step_time": 6.109108959000878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1095.0, "completions/max_terminated_length": 1095.0, "completions/mean_length": 311.90625, "completions/mean_terminated_length": 311.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.8721480667591095, "epoch": 0.01151, "frac_reward_zero_std": 0.5, "grad_norm": 0.004662476014345884, "kl": 0.553910780698061, "learning_rate": 9.999424650590674e-06, "loss": -0.0053, "num_tokens": 26060957.0, "reward": 1.5996310710906982, "reward_std": 0.6196053624153137, "rewards/rollout_reward_func/mean": 1.5996310710906982, "rewards/rollout_reward_func/std": 0.9563988447189331, "sampling/importance_sampling_ratio/max": 0.5626754760742188, "sampling/importance_sampling_ratio/mean": 0.30444633960723877, "sampling/importance_sampling_ratio/min": 2.753697663138155e-05, "sampling/sampling_logp_difference/max": 3.867095947265625, "sampling/sampling_logp_difference/mean": 0.8846275806427002, "step": 1151, "step_time": 8.970421926005656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.883045673370361, "epoch": 0.01152, "grad_norm": 0.004542779177427292, "kl": 0.5513016358017921, "learning_rate": 9.999423618137458e-06, "loss": -0.0054, "step": 1152, "step_time": 5.0955342129927885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1088.0, "completions/max_terminated_length": 1088.0, "completions/mean_length": 462.46875, "completions/mean_terminated_length": 462.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.2390031814575195, "epoch": 0.01153, "frac_reward_zero_std": 0.0, "grad_norm": 0.06070706620812416, "kl": 0.3126149633899331, "learning_rate": 9.999422584758785e-06, "loss": -0.0126, "num_tokens": 26111436.0, "reward": 0.38730403780937195, "reward_std": 1.0937204360961914, "rewards/rollout_reward_func/mean": 0.38730403780937195, "rewards/rollout_reward_func/std": 1.445006012916565, "sampling/importance_sampling_ratio/max": 0.3752998411655426, "sampling/importance_sampling_ratio/mean": 0.07217029482126236, "sampling/importance_sampling_ratio/min": 5.516971879337973e-10, "sampling/sampling_logp_difference/max": 8.690698623657227, "sampling/sampling_logp_difference/mean": 1.4027235507965088, "step": 1153, "step_time": 9.703097728001012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.219995439052582, "epoch": 0.01154, "grad_norm": 0.05585022643208504, "kl": 0.31652028765529394, "learning_rate": 9.999421550454654e-06, "loss": -0.0127, "step": 1154, "step_time": 5.094955162010592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1418.0, "completions/max_terminated_length": 1418.0, "completions/mean_length": 380.28125, "completions/mean_terminated_length": 380.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.098679423332214, "epoch": 0.01155, "frac_reward_zero_std": 0.0, "grad_norm": 0.02892220765352249, "kl": 0.3486034497618675, "learning_rate": 9.999420515225069e-06, "loss": -0.0151, "num_tokens": 26158875.0, "reward": 0.5633348226547241, "reward_std": 1.6771228313446045, "rewards/rollout_reward_func/mean": 0.5633348226547241, "rewards/rollout_reward_func/std": 1.6518981456756592, "sampling/importance_sampling_ratio/max": 0.4771585762500763, "sampling/importance_sampling_ratio/mean": 0.0949840396642685, "sampling/importance_sampling_ratio/min": 2.074244321192964e-06, "sampling/sampling_logp_difference/max": 4.208661079406738, "sampling/sampling_logp_difference/mean": 1.3663291931152344, "step": 1155, "step_time": 10.425906562995806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.14448207616806, "epoch": 0.01156, "grad_norm": 0.032039202749729156, "kl": 0.342612462118268, "learning_rate": 9.999419479070025e-06, "loss": -0.0152, "step": 1156, "step_time": 5.653792214990972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1031.0, "completions/max_terminated_length": 1031.0, "completions/mean_length": 353.375, "completions/mean_terminated_length": 353.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.910484433174133, "epoch": 0.01157, "frac_reward_zero_std": 0.25, "grad_norm": 0.011084180325269699, "kl": 0.37728744000196457, "learning_rate": 9.999418441989527e-06, "loss": -0.0106, "num_tokens": 26205235.0, "reward": 1.4506926536560059, "reward_std": 1.2187602519989014, "rewards/rollout_reward_func/mean": 1.4506926536560059, "rewards/rollout_reward_func/std": 1.4032483100891113, "sampling/importance_sampling_ratio/max": 0.5604188442230225, "sampling/importance_sampling_ratio/mean": 0.18167081475257874, "sampling/importance_sampling_ratio/min": 2.5563607778167352e-05, "sampling/sampling_logp_difference/max": 3.0801961421966553, "sampling/sampling_logp_difference/mean": 1.0708844661712646, "step": 1157, "step_time": 8.85851986900525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.948027998209, "epoch": 0.01158, "grad_norm": 0.010691523551940918, "kl": 0.37464827857911587, "learning_rate": 9.999417403983573e-06, "loss": -0.0106, "step": 1158, "step_time": 4.871814117006579 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.03125, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 388.6875, "completions/mean_terminated_length": 387.8064270019531, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.3837133049964905, "epoch": 0.01159, "frac_reward_zero_std": 0.0, "grad_norm": 0.013579430989921093, "kl": 0.21590504562482238, "learning_rate": 9.999416365052164e-06, "loss": -0.0167, "num_tokens": 26254319.0, "reward": 0.9941831231117249, "reward_std": 1.7625524997711182, "rewards/rollout_reward_func/mean": 0.9941831231117249, "rewards/rollout_reward_func/std": 1.762650489807129, "sampling/importance_sampling_ratio/max": 0.36265772581100464, "sampling/importance_sampling_ratio/mean": 0.07113393396139145, "sampling/importance_sampling_ratio/min": 2.9090974342173296e-18, "sampling/sampling_logp_difference/max": 4.408419609069824, "sampling/sampling_logp_difference/mean": 1.5179388523101807, "step": 1159, "step_time": 9.485281721998035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.412699103355408, "epoch": 0.0116, "grad_norm": 0.013109114952385426, "kl": 0.21106948610395193, "learning_rate": 9.999415325195299e-06, "loss": -0.0165, "step": 1160, "step_time": 4.968520212001749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1181.0, "completions/max_terminated_length": 1181.0, "completions/mean_length": 351.5625, "completions/mean_terminated_length": 352.9677429199219, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.434332311153412, "epoch": 0.01161, "frac_reward_zero_std": 0.25, "grad_norm": 0.010707827284932137, "kl": 0.5727899670600891, "learning_rate": 9.999414284412979e-06, "loss": -0.0019, "num_tokens": 26300298.0, "reward": 0.9037189483642578, "reward_std": 0.9132963418960571, "rewards/rollout_reward_func/mean": 0.9037189483642578, "rewards/rollout_reward_func/std": 1.2949274778366089, "sampling/importance_sampling_ratio/max": 0.5530560612678528, "sampling/importance_sampling_ratio/mean": 0.23496891558170319, "sampling/importance_sampling_ratio/min": 7.619934863279288e-18, "sampling/sampling_logp_difference/max": 5.009173393249512, "sampling/sampling_logp_difference/mean": 1.1166248321533203, "step": 1161, "step_time": 9.666348732003826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.443391799926758, "epoch": 0.01162, "grad_norm": 0.010330664925277233, "kl": 0.5724034486338496, "learning_rate": 9.999413242705202e-06, "loss": -0.0019, "step": 1162, "step_time": 5.229481630009104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 957.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 101.5, "completions/mean_terminated_length": 104.25806427001953, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.886264741420746, "epoch": 0.01163, "frac_reward_zero_std": 0.25, "grad_norm": 0.00992137286812067, "kl": 0.3681075107306242, "learning_rate": 9.999412200071973e-06, "loss": -0.0088, "num_tokens": 26336644.0, "reward": 1.034805417060852, "reward_std": 0.8140865564346313, "rewards/rollout_reward_func/mean": 1.034805417060852, "rewards/rollout_reward_func/std": 1.4824761152267456, "sampling/importance_sampling_ratio/max": 0.5608329772949219, "sampling/importance_sampling_ratio/mean": 0.278659462928772, "sampling/importance_sampling_ratio/min": 2.714462335462464e-12, "sampling/sampling_logp_difference/max": 4.681458473205566, "sampling/sampling_logp_difference/mean": 1.1632956266403198, "step": 1163, "step_time": 8.437105877997965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.904509603977203, "epoch": 0.01164, "grad_norm": 0.009827638044953346, "kl": 0.3669598214328289, "learning_rate": 9.999411156513289e-06, "loss": -0.0088, "step": 1164, "step_time": 4.727721510003903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.03125, "completions/max_length": 1585.0, "completions/max_terminated_length": 1585.0, "completions/mean_length": 488.5, "completions/mean_terminated_length": 492.6773986816406, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.384243130683899, "epoch": 0.01165, "frac_reward_zero_std": 0.0, "grad_norm": 0.007903127931058407, "kl": 0.19894387293606997, "learning_rate": 9.999410112029152e-06, "loss": -0.0123, "num_tokens": 26386906.0, "reward": -0.0004900731146335602, "reward_std": 1.351810336112976, "rewards/rollout_reward_func/mean": -0.0004900731146335602, "rewards/rollout_reward_func/std": 1.6148775815963745, "sampling/importance_sampling_ratio/max": 0.5547598004341125, "sampling/importance_sampling_ratio/mean": 0.07303060591220856, "sampling/importance_sampling_ratio/min": 2.108351297197455e-16, "sampling/sampling_logp_difference/max": 9.343233108520508, "sampling/sampling_logp_difference/mean": 1.6187386512756348, "step": 1165, "step_time": 11.205152105001616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 7.388152301311493, "epoch": 0.01166, "grad_norm": 0.007559757214039564, "kl": 0.19667464960366488, "learning_rate": 9.99940906661956e-06, "loss": -0.0123, "step": 1166, "step_time": 6.068041780003114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 62.6875, "completions/mean_terminated_length": 62.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.509818017482758, "epoch": 0.01167, "frac_reward_zero_std": 0.25, "grad_norm": 0.009744525887072086, "kl": 0.5533003453165293, "learning_rate": 9.999408020284516e-06, "loss": -0.0062, "num_tokens": 26420242.0, "reward": -0.00032445788383483887, "reward_std": 0.7689988017082214, "rewards/rollout_reward_func/mean": -0.00032445788383483887, "rewards/rollout_reward_func/std": 1.5279499292373657, "sampling/importance_sampling_ratio/max": 0.5634933710098267, "sampling/importance_sampling_ratio/mean": 0.2841326892375946, "sampling/importance_sampling_ratio/min": 0.00016896746819838881, "sampling/sampling_logp_difference/max": 2.581435203552246, "sampling/sampling_logp_difference/mean": 1.0013108253479004, "step": 1167, "step_time": 7.096972827002901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.502355396747589, "epoch": 0.01168, "grad_norm": 0.0102047985419631, "kl": 0.554323835298419, "learning_rate": 9.999406973024017e-06, "loss": -0.0062, "step": 1168, "step_time": 3.6164927659956447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1045.0, "completions/max_terminated_length": 1045.0, "completions/mean_length": 287.75, "completions/mean_terminated_length": 287.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.376652210950851, "epoch": 0.01169, "frac_reward_zero_std": 0.5, "grad_norm": 0.005643609911203384, "kl": 0.5212748907506466, "learning_rate": 9.999405924838066e-06, "loss": -0.005, "num_tokens": 26461899.0, "reward": 1.9138354063034058, "reward_std": 0.7948159575462341, "rewards/rollout_reward_func/mean": 1.9138354063034058, "rewards/rollout_reward_func/std": 1.1065144538879395, "sampling/importance_sampling_ratio/max": 0.562716543674469, "sampling/importance_sampling_ratio/mean": 0.33881330490112305, "sampling/importance_sampling_ratio/min": 1.8404138245387003e-05, "sampling/sampling_logp_difference/max": 2.463392734527588, "sampling/sampling_logp_difference/mean": 0.8658673763275146, "step": 1169, "step_time": 8.74952228300026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.374464243650436, "epoch": 0.0117, "grad_norm": 0.006055673584342003, "kl": 0.5207544651348144, "learning_rate": 9.999404875726661e-06, "loss": -0.0049, "step": 1170, "step_time": 5.4210235510036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1117.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 539.65625, "completions/mean_terminated_length": 546.6129150390625, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "entropy": 6.54932028055191, "epoch": 0.01171, "frac_reward_zero_std": 0.0, "grad_norm": 0.0202158372849226, "kl": 0.3480294067412615, "learning_rate": 9.999403825689805e-06, "loss": -0.007, "num_tokens": 26516556.0, "reward": 0.5529367923736572, "reward_std": 1.495970606803894, "rewards/rollout_reward_func/mean": 0.5529367923736572, "rewards/rollout_reward_func/std": 1.5130020380020142, "sampling/importance_sampling_ratio/max": 0.34143537282943726, "sampling/importance_sampling_ratio/mean": 0.0881233662366867, "sampling/importance_sampling_ratio/min": 3.6709763007181095e-10, "sampling/sampling_logp_difference/max": 13.670907974243164, "sampling/sampling_logp_difference/mean": 1.3228192329406738, "step": 1171, "step_time": 9.613998117998563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.543099910020828, "epoch": 0.01172, "grad_norm": 0.02120506390929222, "kl": 0.34829144878312945, "learning_rate": 9.999402774727496e-06, "loss": -0.007, "step": 1172, "step_time": 5.250723545992514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1078.0, "completions/max_terminated_length": 1078.0, "completions/mean_length": 270.6875, "completions/mean_terminated_length": 270.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.852683842182159, "epoch": 0.01173, "frac_reward_zero_std": 0.25, "grad_norm": 0.010562003590166569, "kl": 0.4005259517580271, "learning_rate": 9.999401722839737e-06, "loss": -0.0136, "num_tokens": 26558367.0, "reward": 0.6837718486785889, "reward_std": 0.8455359935760498, "rewards/rollout_reward_func/mean": 0.6837718486785889, "rewards/rollout_reward_func/std": 1.2807260751724243, "sampling/importance_sampling_ratio/max": 0.5585485100746155, "sampling/importance_sampling_ratio/mean": 0.17075665295124054, "sampling/importance_sampling_ratio/min": 1.1064434346508278e-07, "sampling/sampling_logp_difference/max": 2.925018310546875, "sampling/sampling_logp_difference/mean": 1.3906139135360718, "step": 1173, "step_time": 9.103943091995461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.839500427246094, "epoch": 0.01174, "grad_norm": 0.009911185130476952, "kl": 0.40029629692435265, "learning_rate": 9.999400670026525e-06, "loss": -0.0135, "step": 1174, "step_time": 4.871480953999708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1182.0, "completions/max_terminated_length": 1182.0, "completions/mean_length": 372.9375, "completions/mean_terminated_length": 372.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.525488406419754, "epoch": 0.01175, "frac_reward_zero_std": 0.25, "grad_norm": 0.018561337143182755, "kl": 0.4744579363614321, "learning_rate": 9.999399616287859e-06, "loss": -0.0108, "num_tokens": 26605054.0, "reward": 1.3788104057312012, "reward_std": 1.2390893697738647, "rewards/rollout_reward_func/mean": 1.3788104057312012, "rewards/rollout_reward_func/std": 1.4488365650177002, "sampling/importance_sampling_ratio/max": 0.5617702007293701, "sampling/importance_sampling_ratio/mean": 0.2256229817867279, "sampling/importance_sampling_ratio/min": 1.6002606173515233e-07, "sampling/sampling_logp_difference/max": 2.835475444793701, "sampling/sampling_logp_difference/mean": 1.0786947011947632, "step": 1175, "step_time": 9.448016497997742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.513638406991959, "epoch": 0.01176, "grad_norm": 0.017762573435902596, "kl": 0.47409985587000847, "learning_rate": 9.999398561623746e-06, "loss": -0.0109, "step": 1176, "step_time": 5.7068741289949685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 295.40625, "completions/mean_terminated_length": 305.20001220703125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.719045430421829, "epoch": 0.01177, "frac_reward_zero_std": 0.25, "grad_norm": 0.029218759387731552, "kl": 0.6111013498157263, "learning_rate": 9.999397506034179e-06, "loss": -0.012, "num_tokens": 26649487.0, "reward": 1.0566437244415283, "reward_std": 0.8526495099067688, "rewards/rollout_reward_func/mean": 1.0566437244415283, "rewards/rollout_reward_func/std": 1.2021492719650269, "sampling/importance_sampling_ratio/max": 0.5512263178825378, "sampling/importance_sampling_ratio/mean": 0.2198142260313034, "sampling/importance_sampling_ratio/min": 2.5325443466630307e-18, "sampling/sampling_logp_difference/max": 11.856338500976562, "sampling/sampling_logp_difference/mean": 1.2084178924560547, "step": 1177, "step_time": 9.074246202002541 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 5.723942399024963, "epoch": 0.01178, "grad_norm": 0.01693631336092949, "kl": 0.6069647334516048, "learning_rate": 9.999396449519164e-06, "loss": -0.0121, "step": 1178, "step_time": 5.321383318998414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 952.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 386.5, "completions/mean_terminated_length": 387.8709411621094, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.4654428362846375, "epoch": 0.01179, "frac_reward_zero_std": 0.0, "grad_norm": 0.0251290425658226, "kl": 0.3233295362442732, "learning_rate": 9.999395392078698e-06, "loss": -0.0119, "num_tokens": 26698297.0, "reward": 0.6069393754005432, "reward_std": 1.800410270690918, "rewards/rollout_reward_func/mean": 0.6069393754005432, "rewards/rollout_reward_func/std": 1.8071644306182861, "sampling/importance_sampling_ratio/max": 0.5493956208229065, "sampling/importance_sampling_ratio/mean": 0.14075544476509094, "sampling/importance_sampling_ratio/min": 6.35726063805131e-12, "sampling/sampling_logp_difference/max": 2.859835624694824, "sampling/sampling_logp_difference/mean": 1.2534470558166504, "step": 1179, "step_time": 9.08298106199436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.458537697792053, "epoch": 0.0118, "grad_norm": 0.027545657008886337, "kl": 0.3228750228881836, "learning_rate": 9.999394333712782e-06, "loss": -0.0119, "step": 1180, "step_time": 4.808561573998304 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.03125, "completions/max_length": 1144.0, "completions/max_terminated_length": 1144.0, "completions/mean_length": 474.71875, "completions/mean_terminated_length": 458.19354248046875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.365890204906464, "epoch": 0.01181, "frac_reward_zero_std": 0.0, "grad_norm": 0.006940352730453014, "kl": 0.2411997844465077, "learning_rate": 9.999393274421414e-06, "loss": -0.0069, "num_tokens": 26749796.0, "reward": 0.5458208918571472, "reward_std": 1.1984987258911133, "rewards/rollout_reward_func/mean": 0.5458208918571472, "rewards/rollout_reward_func/std": 1.4547991752624512, "sampling/importance_sampling_ratio/max": 0.5583367943763733, "sampling/importance_sampling_ratio/mean": 0.06868462264537811, "sampling/importance_sampling_ratio/min": 2.374408713443665e-18, "sampling/sampling_logp_difference/max": 12.334726333618164, "sampling/sampling_logp_difference/mean": 1.6777031421661377, "step": 1181, "step_time": 9.912380793994089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.382702112197876, "epoch": 0.01182, "grad_norm": 0.006683387793600559, "kl": 0.23992858966812491, "learning_rate": 9.999392214204598e-06, "loss": -0.007, "step": 1182, "step_time": 5.8036310610041255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1041.0, "completions/max_terminated_length": 1041.0, "completions/mean_length": 211.875, "completions/mean_terminated_length": 211.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.517148315906525, "epoch": 0.01183, "frac_reward_zero_std": 0.25, "grad_norm": 0.03686723858118057, "kl": 0.5710533335804939, "learning_rate": 9.999391153062331e-06, "loss": -0.0164, "num_tokens": 26789183.0, "reward": 1.2660679817199707, "reward_std": 1.0849758386611938, "rewards/rollout_reward_func/mean": 1.2660679817199707, "rewards/rollout_reward_func/std": 1.3771251440048218, "sampling/importance_sampling_ratio/max": 0.5610297322273254, "sampling/importance_sampling_ratio/mean": 0.32535430788993835, "sampling/importance_sampling_ratio/min": 8.85838901464919e-17, "sampling/sampling_logp_difference/max": 3.957974910736084, "sampling/sampling_logp_difference/mean": 0.8386557102203369, "step": 1183, "step_time": 9.207252397001866 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 4.533058822154999, "epoch": 0.01184, "grad_norm": 0.015834316611289978, "kl": 0.5649457946419716, "learning_rate": 9.999390090994617e-06, "loss": -0.0165, "step": 1184, "step_time": 5.4488306819948775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1459.0, "completions/max_terminated_length": 1459.0, "completions/mean_length": 498.625, "completions/mean_terminated_length": 498.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.127776861190796, "epoch": 0.01185, "frac_reward_zero_std": 0.0, "grad_norm": 0.042435940355062485, "kl": 0.4953116001561284, "learning_rate": 9.999389028001453e-06, "loss": -0.0118, "num_tokens": 26842033.0, "reward": 1.1263047456741333, "reward_std": 1.768860101699829, "rewards/rollout_reward_func/mean": 1.1263047456741333, "rewards/rollout_reward_func/std": 1.7256247997283936, "sampling/importance_sampling_ratio/max": 0.5514805912971497, "sampling/importance_sampling_ratio/mean": 0.1317891776561737, "sampling/importance_sampling_ratio/min": 2.440577079711861e-23, "sampling/sampling_logp_difference/max": 8.689151763916016, "sampling/sampling_logp_difference/mean": 1.3560752868652344, "step": 1185, "step_time": 10.930918254001881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.132589817047119, "epoch": 0.01186, "grad_norm": 0.0431046187877655, "kl": 0.48966096714138985, "learning_rate": 9.999387964082844e-06, "loss": -0.0118, "step": 1186, "step_time": 5.840057828012505 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "completions/clipped_ratio": 0.0, "completions/max_length": 1637.0, "completions/max_terminated_length": 1637.0, "completions/mean_length": 506.09375, "completions/mean_terminated_length": 506.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.783915102481842, "epoch": 0.01187, "frac_reward_zero_std": 0.0, "grad_norm": 0.10657382011413574, "kl": 0.39539504144340754, "learning_rate": 9.999386899238782e-06, "loss": -0.0131, "num_tokens": 26893272.0, "reward": 0.21157151460647583, "reward_std": 1.52811598777771, "rewards/rollout_reward_func/mean": 0.21157151460647583, "rewards/rollout_reward_func/std": 1.5165468454360962, "sampling/importance_sampling_ratio/max": 0.5080711841583252, "sampling/importance_sampling_ratio/mean": 0.08188723772764206, "sampling/importance_sampling_ratio/min": 5.545241373153048e-15, "sampling/sampling_logp_difference/max": 11.604814529418945, "sampling/sampling_logp_difference/mean": 1.4993001222610474, "step": 1187, "step_time": 12.027079128005425 }, { "clip_ratio/high_max": 0.014610390178859234, "clip_ratio/high_mean": 0.007305195089429617, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007305195089429617, "entropy": 6.769433438777924, "epoch": 0.01188, "grad_norm": 0.054129116237163544, "kl": 0.37964807730168104, "learning_rate": 9.999385833469273e-06, "loss": -0.0134, "step": 1188, "step_time": 6.235351188995992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1519.0, "completions/max_terminated_length": 1519.0, "completions/mean_length": 402.78125, "completions/mean_terminated_length": 391.70965576171875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.536502242088318, "epoch": 0.01189, "frac_reward_zero_std": 0.0, "grad_norm": 0.024059809744358063, "kl": 0.2906153220683336, "learning_rate": 9.999384766774318e-06, "loss": -0.0159, "num_tokens": 26942717.0, "reward": 0.22621887922286987, "reward_std": 1.2100539207458496, "rewards/rollout_reward_func/mean": 0.22621887922286987, "rewards/rollout_reward_func/std": 1.2759050130844116, "sampling/importance_sampling_ratio/max": 0.3987085521221161, "sampling/importance_sampling_ratio/mean": 0.0674072653055191, "sampling/importance_sampling_ratio/min": 2.1153923909204045e-16, "sampling/sampling_logp_difference/max": 4.8855109214782715, "sampling/sampling_logp_difference/mean": 1.5645767450332642, "step": 1189, "step_time": 10.941807361010433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.519472181797028, "epoch": 0.0119, "grad_norm": 0.024401120841503143, "kl": 0.2894052527844906, "learning_rate": 9.999383699153913e-06, "loss": -0.0159, "step": 1190, "step_time": 6.450279249002051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 259.65625, "completions/mean_terminated_length": 259.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.3575193881988525, "epoch": 0.01191, "frac_reward_zero_std": 0.25, "grad_norm": 0.006233566906303167, "kl": 0.5103734247386456, "learning_rate": 9.999382630608064e-06, "loss": -0.0067, "num_tokens": 26984557.0, "reward": 1.0911107063293457, "reward_std": 1.1888041496276855, "rewards/rollout_reward_func/mean": 1.0911107063293457, "rewards/rollout_reward_func/std": 1.469608187675476, "sampling/importance_sampling_ratio/max": 0.5619445443153381, "sampling/importance_sampling_ratio/mean": 0.2926563620567322, "sampling/importance_sampling_ratio/min": 2.8281719414297024e-12, "sampling/sampling_logp_difference/max": 9.143128395080566, "sampling/sampling_logp_difference/mean": 1.284715175628662, "step": 1191, "step_time": 8.809036023998488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.350730717182159, "epoch": 0.01192, "grad_norm": 0.006808124948292971, "kl": 0.5092137660831213, "learning_rate": 9.999381561136765e-06, "loss": -0.0067, "step": 1192, "step_time": 4.706152021004527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1267.0, "completions/max_terminated_length": 1267.0, "completions/mean_length": 291.34375, "completions/mean_terminated_length": 291.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.504055619239807, "epoch": 0.01193, "frac_reward_zero_std": 0.25, "grad_norm": 0.03986186534166336, "kl": 0.6620152667164803, "learning_rate": 9.99938049074002e-06, "loss": -0.0133, "num_tokens": 27026741.0, "reward": 1.2079498767852783, "reward_std": 0.9934324026107788, "rewards/rollout_reward_func/mean": 1.2079498767852783, "rewards/rollout_reward_func/std": 1.3984768390655518, "sampling/importance_sampling_ratio/max": 0.560448944568634, "sampling/importance_sampling_ratio/mean": 0.32207363843917847, "sampling/importance_sampling_ratio/min": 5.4021709967960144e-17, "sampling/sampling_logp_difference/max": 10.22462272644043, "sampling/sampling_logp_difference/mean": 1.0415153503417969, "step": 1193, "step_time": 10.276095312998223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.464480757713318, "epoch": 0.01194, "grad_norm": 0.03243807330727577, "kl": 0.663605060428381, "learning_rate": 9.99937941941783e-06, "loss": -0.0134, "step": 1194, "step_time": 5.3754063050037075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 195.21875, "completions/mean_terminated_length": 195.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.133198410272598, "epoch": 0.01195, "frac_reward_zero_std": 0.25, "grad_norm": 0.0221868883818388, "kl": 0.6373599320650101, "learning_rate": 9.999378347170195e-06, "loss": -0.0139, "num_tokens": 27065802.0, "reward": 1.2992308139801025, "reward_std": 0.8376550674438477, "rewards/rollout_reward_func/mean": 1.2992308139801025, "rewards/rollout_reward_func/std": 1.2337154150009155, "sampling/importance_sampling_ratio/max": 0.5600427985191345, "sampling/importance_sampling_ratio/mean": 0.2934928238391876, "sampling/importance_sampling_ratio/min": 3.639843981773083e-11, "sampling/sampling_logp_difference/max": 11.186820030212402, "sampling/sampling_logp_difference/mean": 1.0708953142166138, "step": 1195, "step_time": 7.554476805999002 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 5.126092791557312, "epoch": 0.01196, "grad_norm": 0.008585477247834206, "kl": 0.6363696493208408, "learning_rate": 9.999377273997111e-06, "loss": -0.014, "step": 1196, "step_time": 4.532733261996327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1291.0, "completions/max_terminated_length": 1291.0, "completions/mean_length": 418.96875, "completions/mean_terminated_length": 418.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.736514449119568, "epoch": 0.01197, "frac_reward_zero_std": 0.0, "grad_norm": 0.12852822244167328, "kl": 0.31884978618472815, "learning_rate": 9.999376199898583e-06, "loss": -0.0093, "num_tokens": 27115546.0, "reward": 0.7890310883522034, "reward_std": 1.5169868469238281, "rewards/rollout_reward_func/mean": 0.7890310883522034, "rewards/rollout_reward_func/std": 1.7228626012802124, "sampling/importance_sampling_ratio/max": 0.5559544563293457, "sampling/importance_sampling_ratio/mean": 0.1847861409187317, "sampling/importance_sampling_ratio/min": 2.942818655427004e-09, "sampling/sampling_logp_difference/max": 2.62250018119812, "sampling/sampling_logp_difference/mean": 1.3702940940856934, "step": 1197, "step_time": 9.987420100002055 }, { "clip_ratio/high_max": 0.109375, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 6.570190489292145, "epoch": 0.01198, "grad_norm": 0.06491431593894958, "kl": 0.33649473637342453, "learning_rate": 9.99937512487461e-06, "loss": -0.0101, "step": 1198, "step_time": 5.412967303000187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1308.0, "completions/max_terminated_length": 1308.0, "completions/mean_length": 440.34375, "completions/mean_terminated_length": 449.4838562011719, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.764318227767944, "epoch": 0.01199, "frac_reward_zero_std": 0.0, "grad_norm": 0.007943030446767807, "kl": 0.281961680855602, "learning_rate": 9.99937404892519e-06, "loss": -0.0218, "num_tokens": 27166645.0, "reward": 0.6051013469696045, "reward_std": 1.3646199703216553, "rewards/rollout_reward_func/mean": 0.6051013469696045, "rewards/rollout_reward_func/std": 1.4157320261001587, "sampling/importance_sampling_ratio/max": 0.32820114493370056, "sampling/importance_sampling_ratio/mean": 0.09832805395126343, "sampling/importance_sampling_ratio/min": 1.49441405988285e-21, "sampling/sampling_logp_difference/max": 4.486514091491699, "sampling/sampling_logp_difference/mean": 1.4442501068115234, "step": 1199, "step_time": 10.461075288996653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.749708771705627, "epoch": 0.012, "grad_norm": 0.007936357520520687, "kl": 0.28414959367364645, "learning_rate": 9.999372972050326e-06, "loss": -0.0218, "step": 1200, "step_time": 5.497424450994004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 103.71875, "completions/mean_terminated_length": 103.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.881185621023178, "epoch": 0.01201, "frac_reward_zero_std": 0.25, "grad_norm": 0.22739601135253906, "kl": 0.8334765285253525, "learning_rate": 9.999371894250018e-06, "loss": -0.0027, "num_tokens": 27201631.0, "reward": 1.4344053268432617, "reward_std": 0.9322790503501892, "rewards/rollout_reward_func/mean": 1.4344053268432617, "rewards/rollout_reward_func/std": 1.112852692604065, "sampling/importance_sampling_ratio/max": 0.5541999936103821, "sampling/importance_sampling_ratio/mean": 0.3896354138851166, "sampling/importance_sampling_ratio/min": 0.0002008770825341344, "sampling/sampling_logp_difference/max": 2.7455222606658936, "sampling/sampling_logp_difference/mean": 0.5741575956344604, "step": 1201, "step_time": 6.5704519570026605 }, { "clip_ratio/high_max": 0.08333333395421505, "clip_ratio/high_mean": 0.057291666977107525, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.07291666697710752, "entropy": 3.6725234389305115, "epoch": 0.01202, "grad_norm": 0.11836028844118118, "kl": 1.109265848994255, "learning_rate": 9.999370815524266e-06, "loss": -0.0026, "step": 1202, "step_time": 4.215697403000377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "completions/clipped_ratio": 0.03125, "completions/max_length": 1354.0, "completions/max_terminated_length": 1354.0, "completions/mean_length": 505.8125, "completions/mean_terminated_length": 508.70965576171875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.192568242549896, "epoch": 0.01203, "frac_reward_zero_std": 0.0, "grad_norm": 0.029804328456521034, "kl": 0.29163014609366655, "learning_rate": 9.999369735873068e-06, "loss": -0.007, "num_tokens": 27254731.0, "reward": 0.2718302011489868, "reward_std": 0.9622860550880432, "rewards/rollout_reward_func/mean": 0.2718302011489868, "rewards/rollout_reward_func/std": 1.3497252464294434, "sampling/importance_sampling_ratio/max": 0.31237494945526123, "sampling/importance_sampling_ratio/mean": 0.06516112387180328, "sampling/importance_sampling_ratio/min": 7.64836727285001e-08, "sampling/sampling_logp_difference/max": 3.9525504112243652, "sampling/sampling_logp_difference/mean": 1.3010423183441162, "step": 1203, "step_time": 10.81932592099838 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 7.161794364452362, "epoch": 0.01204, "grad_norm": 0.014753422699868679, "kl": 0.29198237182572484, "learning_rate": 9.999368655296428e-06, "loss": -0.0071, "step": 1204, "step_time": 5.781028623994644 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.03125, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 216.34375, "completions/mean_terminated_length": 222.8064422607422, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.798920422792435, "epoch": 0.01205, "frac_reward_zero_std": 0.25, "grad_norm": 0.004947314970195293, "kl": 0.4252886213362217, "learning_rate": 9.999367573794344e-06, "loss": -0.0133, "num_tokens": 27294327.0, "reward": 1.531482458114624, "reward_std": 1.133758306503296, "rewards/rollout_reward_func/mean": 1.531482458114624, "rewards/rollout_reward_func/std": 1.3505709171295166, "sampling/importance_sampling_ratio/max": 0.5606817603111267, "sampling/importance_sampling_ratio/mean": 0.29321226477622986, "sampling/importance_sampling_ratio/min": 2.370803477581146e-13, "sampling/sampling_logp_difference/max": 10.435445785522461, "sampling/sampling_logp_difference/mean": 1.3409709930419922, "step": 1205, "step_time": 8.895783214004041 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018229166977107525, "entropy": 5.790730267763138, "epoch": 0.01206, "grad_norm": 0.00490487040951848, "kl": 0.42205933295190334, "learning_rate": 9.999366491366816e-06, "loss": -0.0133, "step": 1206, "step_time": 4.813868977998936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 303.59375, "completions/mean_terminated_length": 303.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.516513168811798, "epoch": 0.01207, "frac_reward_zero_std": 0.0, "grad_norm": 0.06440559029579163, "kl": 0.31564874574542046, "learning_rate": 9.999365408013845e-06, "loss": -0.0174, "num_tokens": 27340466.0, "reward": 0.2199014127254486, "reward_std": 1.5163259506225586, "rewards/rollout_reward_func/mean": 0.2199014127254486, "rewards/rollout_reward_func/std": 1.6158097982406616, "sampling/importance_sampling_ratio/max": 0.5597584247589111, "sampling/importance_sampling_ratio/mean": 0.1449326127767563, "sampling/importance_sampling_ratio/min": 1.860542676404247e-16, "sampling/sampling_logp_difference/max": 12.123607635498047, "sampling/sampling_logp_difference/mean": 1.5145844221115112, "step": 1207, "step_time": 7.874031338003988 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 6.508630990982056, "epoch": 0.01208, "grad_norm": 0.08626207709312439, "kl": 0.3139740712940693, "learning_rate": 9.999364323735433e-06, "loss": -0.0174, "step": 1208, "step_time": 3.982763595002325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 1097.0, "completions/max_terminated_length": 1097.0, "completions/mean_length": 456.09375, "completions/mean_terminated_length": 456.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.046133399009705, "epoch": 0.01209, "frac_reward_zero_std": 0.0, "grad_norm": 0.018328331410884857, "kl": 0.37201456259936094, "learning_rate": 9.999363238531578e-06, "loss": -0.0155, "num_tokens": 27392065.0, "reward": 1.2001962661743164, "reward_std": 1.7364178895950317, "rewards/rollout_reward_func/mean": 1.2001962661743164, "rewards/rollout_reward_func/std": 1.714629888534546, "sampling/importance_sampling_ratio/max": 0.31725332140922546, "sampling/importance_sampling_ratio/mean": 0.06971673667430878, "sampling/importance_sampling_ratio/min": 5.09159654029645e-05, "sampling/sampling_logp_difference/max": 3.0274410247802734, "sampling/sampling_logp_difference/mean": 1.3594417572021484, "step": 1209, "step_time": 9.262193788996228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.0878987312316895, "epoch": 0.0121, "grad_norm": 0.01634562388062477, "kl": 0.3727897731587291, "learning_rate": 9.99936215240228e-06, "loss": -0.0155, "step": 1210, "step_time": 5.17888869399394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.03125, "completions/max_length": 1044.0, "completions/max_terminated_length": 1044.0, "completions/mean_length": 288.6875, "completions/mean_terminated_length": 283.58062744140625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.325662434101105, "epoch": 0.01211, "frac_reward_zero_std": 0.25, "grad_norm": 0.020435553044080734, "kl": 0.5331990094855428, "learning_rate": 9.999361065347541e-06, "loss": -0.0134, "num_tokens": 27436497.0, "reward": 0.9810046553611755, "reward_std": 1.331316351890564, "rewards/rollout_reward_func/mean": 0.9810046553611755, "rewards/rollout_reward_func/std": 1.535681128501892, "sampling/importance_sampling_ratio/max": 0.5497297644615173, "sampling/importance_sampling_ratio/mean": 0.1939866542816162, "sampling/importance_sampling_ratio/min": 5.3865994997792654e-11, "sampling/sampling_logp_difference/max": 3.9924416542053223, "sampling/sampling_logp_difference/mean": 1.3763532638549805, "step": 1211, "step_time": 8.960058848002518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.344601929187775, "epoch": 0.01212, "grad_norm": 0.017501598224043846, "kl": 0.535295614041388, "learning_rate": 9.99935997736736e-06, "loss": -0.0134, "step": 1212, "step_time": 5.032832402997883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1270.0, "completions/max_terminated_length": 1270.0, "completions/mean_length": 400.8125, "completions/mean_terminated_length": 400.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.886089622974396, "epoch": 0.01213, "frac_reward_zero_std": 0.25, "grad_norm": 0.009921908378601074, "kl": 0.5098431585356593, "learning_rate": 9.999358888461737e-06, "loss": -0.012, "num_tokens": 27484655.0, "reward": 0.8703619241714478, "reward_std": 1.274528980255127, "rewards/rollout_reward_func/mean": 0.8703619241714478, "rewards/rollout_reward_func/std": 1.578533411026001, "sampling/importance_sampling_ratio/max": 0.55848228931427, "sampling/importance_sampling_ratio/mean": 0.20496386289596558, "sampling/importance_sampling_ratio/min": 6.898887416978678e-19, "sampling/sampling_logp_difference/max": 4.31170654296875, "sampling/sampling_logp_difference/mean": 1.115742564201355, "step": 1213, "step_time": 10.499632421993738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.899304926395416, "epoch": 0.01214, "grad_norm": 0.009218855760991573, "kl": 0.5087140910327435, "learning_rate": 9.999357798630673e-06, "loss": -0.012, "step": 1214, "step_time": 5.556862879995606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 254.28125, "completions/mean_terminated_length": 254.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.8041921854019165, "epoch": 0.01215, "frac_reward_zero_std": 0.25, "grad_norm": 0.020649271085858345, "kl": 0.6563268788158894, "learning_rate": 9.99935670787417e-06, "loss": -0.0157, "num_tokens": 27526584.0, "reward": 0.9375166296958923, "reward_std": 1.160906195640564, "rewards/rollout_reward_func/mean": 0.9375166296958923, "rewards/rollout_reward_func/std": 1.5112364292144775, "sampling/importance_sampling_ratio/max": 0.5553401708602905, "sampling/importance_sampling_ratio/mean": 0.21374748647212982, "sampling/importance_sampling_ratio/min": 1.0660735824785661e-05, "sampling/sampling_logp_difference/max": 2.886234998703003, "sampling/sampling_logp_difference/mean": 1.0956295728683472, "step": 1215, "step_time": 8.30960843000139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.818695485591888, "epoch": 0.01216, "grad_norm": 0.019017117097973824, "kl": 0.6567798592150211, "learning_rate": 9.999355616192225e-06, "loss": -0.0158, "step": 1216, "step_time": 4.91419302599752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1490.0, "completions/max_terminated_length": 1490.0, "completions/mean_length": 338.90625, "completions/mean_terminated_length": 334.1612854003906, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.76449128985405, "epoch": 0.01217, "frac_reward_zero_std": 0.0, "grad_norm": 0.03449101373553276, "kl": 0.4492824412882328, "learning_rate": 9.99935452358484e-06, "loss": -0.0096, "num_tokens": 27572864.0, "reward": -0.09099289774894714, "reward_std": 0.7877914309501648, "rewards/rollout_reward_func/mean": -0.09099289774894714, "rewards/rollout_reward_func/std": 1.2671482563018799, "sampling/importance_sampling_ratio/max": 0.550255298614502, "sampling/importance_sampling_ratio/mean": 0.18221576511859894, "sampling/importance_sampling_ratio/min": 2.7774236244226813e-09, "sampling/sampling_logp_difference/max": 4.032496452331543, "sampling/sampling_logp_difference/mean": 1.4437475204467773, "step": 1217, "step_time": 10.294860933005111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.801723480224609, "epoch": 0.01218, "grad_norm": 0.03858715295791626, "kl": 0.4488505907356739, "learning_rate": 9.999353430052015e-06, "loss": -0.0097, "step": 1218, "step_time": 5.90305481899486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.03125, "completions/max_length": 1127.0, "completions/max_terminated_length": 1127.0, "completions/mean_length": 332.84375, "completions/mean_terminated_length": 343.06451416015625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.829856693744659, "epoch": 0.01219, "frac_reward_zero_std": 0.25, "grad_norm": 0.006790849845856428, "kl": 0.4772218083962798, "learning_rate": 9.999352335593749e-06, "loss": -0.0057, "num_tokens": 27617979.0, "reward": 0.8671344518661499, "reward_std": 1.0657240152359009, "rewards/rollout_reward_func/mean": 0.8671344518661499, "rewards/rollout_reward_func/std": 1.4193288087844849, "sampling/importance_sampling_ratio/max": 0.554718017578125, "sampling/importance_sampling_ratio/mean": 0.1763203740119934, "sampling/importance_sampling_ratio/min": 9.671432356144968e-15, "sampling/sampling_logp_difference/max": 4.185243606567383, "sampling/sampling_logp_difference/mean": 1.600338339805603, "step": 1219, "step_time": 9.990855485000793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.8340564370155334, "epoch": 0.0122, "grad_norm": 0.006780840922147036, "kl": 0.4788576094433665, "learning_rate": 9.999351240210043e-06, "loss": -0.0057, "step": 1220, "step_time": 5.2356323700041685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1426.0, "completions/max_terminated_length": 1426.0, "completions/mean_length": 577.0, "completions/mean_terminated_length": 582.4193115234375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.713840842247009, "epoch": 0.01221, "frac_reward_zero_std": 0.0, "grad_norm": 0.024075806140899658, "kl": 0.4003632850944996, "learning_rate": 9.9993501439009e-06, "loss": -0.0221, "num_tokens": 27673471.0, "reward": 1.4643425941467285, "reward_std": 1.6540658473968506, "rewards/rollout_reward_func/mean": 1.4643425941467285, "rewards/rollout_reward_func/std": 1.6269521713256836, "sampling/importance_sampling_ratio/max": 0.32250428199768066, "sampling/importance_sampling_ratio/mean": 0.09523613750934601, "sampling/importance_sampling_ratio/min": 2.2866878121448533e-19, "sampling/sampling_logp_difference/max": 4.710781097412109, "sampling/sampling_logp_difference/mean": 1.4298617839813232, "step": 1221, "step_time": 10.587862982003571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 6.7107566595077515, "epoch": 0.01222, "grad_norm": 0.022779470309615135, "kl": 0.40281332191079855, "learning_rate": 9.999349046666318e-06, "loss": -0.0222, "step": 1222, "step_time": 6.273975362997589 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.03125, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 367.625, "completions/mean_terminated_length": 378.9677429199219, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.1908639669418335, "epoch": 0.01223, "frac_reward_zero_std": 0.0, "grad_norm": 0.08737529069185257, "kl": 0.28440374694764614, "learning_rate": 9.999347948506298e-06, "loss": -0.0163, "num_tokens": 27721107.0, "reward": 0.9062286019325256, "reward_std": 1.4828673601150513, "rewards/rollout_reward_func/mean": 0.9062286019325256, "rewards/rollout_reward_func/std": 1.4416229724884033, "sampling/importance_sampling_ratio/max": 0.4875297546386719, "sampling/importance_sampling_ratio/mean": 0.09898202866315842, "sampling/importance_sampling_ratio/min": 2.148943974766078e-15, "sampling/sampling_logp_difference/max": 12.631112098693848, "sampling/sampling_logp_difference/mean": 1.5620849132537842, "step": 1223, "step_time": 8.871566064000945 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 7.219615995883942, "epoch": 0.01224, "grad_norm": 0.05022260546684265, "kl": 0.28018728271126747, "learning_rate": 9.999346849420837e-06, "loss": -0.0167, "step": 1224, "step_time": 4.9558709949888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1054.0, "completions/max_terminated_length": 1054.0, "completions/mean_length": 426.125, "completions/mean_terminated_length": 426.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.974072635173798, "epoch": 0.01225, "frac_reward_zero_std": 0.0, "grad_norm": 0.02690671943128109, "kl": 0.32564034778624773, "learning_rate": 9.99934574940994e-06, "loss": -0.0041, "num_tokens": 27772305.0, "reward": 0.7114834785461426, "reward_std": 1.3042325973510742, "rewards/rollout_reward_func/mean": 0.7114834785461426, "rewards/rollout_reward_func/std": 1.5363869667053223, "sampling/importance_sampling_ratio/max": 0.558556854724884, "sampling/importance_sampling_ratio/mean": 0.1287248432636261, "sampling/importance_sampling_ratio/min": 1.3507626961395403e-10, "sampling/sampling_logp_difference/max": 4.212499618530273, "sampling/sampling_logp_difference/mean": 1.4371428489685059, "step": 1225, "step_time": 9.512785836999683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 6.9610626101493835, "epoch": 0.01226, "grad_norm": 0.025632468983530998, "kl": 0.3236699979752302, "learning_rate": 9.999344648473603e-06, "loss": -0.0042, "step": 1226, "step_time": 5.113006560004578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 211.03125, "completions/mean_terminated_length": 211.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.654196739196777, "epoch": 0.01227, "frac_reward_zero_std": 0.25, "grad_norm": 0.014767486602067947, "kl": 0.44261737167835236, "learning_rate": 9.99934354661183e-06, "loss": -0.0154, "num_tokens": 27814002.0, "reward": 0.7258175611495972, "reward_std": 1.2434762716293335, "rewards/rollout_reward_func/mean": 0.7258175611495972, "rewards/rollout_reward_func/std": 1.7583167552947998, "sampling/importance_sampling_ratio/max": 0.550297200679779, "sampling/importance_sampling_ratio/mean": 0.19768379628658295, "sampling/importance_sampling_ratio/min": 9.038479193606078e-14, "sampling/sampling_logp_difference/max": 4.621830940246582, "sampling/sampling_logp_difference/mean": 1.3403310775756836, "step": 1227, "step_time": 7.497453885003779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.644178926944733, "epoch": 0.01228, "grad_norm": 0.01534936111420393, "kl": 0.4466835353523493, "learning_rate": 9.99934244382462e-06, "loss": -0.0154, "step": 1228, "step_time": 4.38580777399693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.53125, "completions/mean_terminated_length": 2.096774101257324, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.12013903260231, "epoch": 0.01229, "frac_reward_zero_std": 0.5, "grad_norm": 0.004066831897944212, "kl": 0.829371303319931, "learning_rate": 9.999341340111972e-06, "loss": -0.0095, "num_tokens": 27843073.0, "reward": 1.7097759246826172, "reward_std": 0.5296487808227539, "rewards/rollout_reward_func/mean": 1.7097759246826172, "rewards/rollout_reward_func/std": 0.7309488654136658, "sampling/importance_sampling_ratio/max": 0.5615022778511047, "sampling/importance_sampling_ratio/mean": 0.514590322971344, "sampling/importance_sampling_ratio/min": 3.05990784544514e-16, "sampling/sampling_logp_difference/max": 4.951870441436768, "sampling/sampling_logp_difference/mean": 0.739978551864624, "step": 1229, "step_time": 5.20000145699305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.1183309853076935, "epoch": 0.0123, "grad_norm": 0.0039026280865073204, "kl": 0.8307956755161285, "learning_rate": 9.999340235473887e-06, "loss": -0.0095, "step": 1230, "step_time": 3.1531829799969273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1100.0, "completions/max_terminated_length": 1100.0, "completions/mean_length": 484.6875, "completions/mean_terminated_length": 484.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.734242916107178, "epoch": 0.01231, "frac_reward_zero_std": 0.0, "grad_norm": 0.02763747237622738, "kl": 0.5177968079224229, "learning_rate": 9.999339129910366e-06, "loss": -0.0143, "num_tokens": 27895577.0, "reward": 0.8072165250778198, "reward_std": 1.4261577129364014, "rewards/rollout_reward_func/mean": 0.8072165250778198, "rewards/rollout_reward_func/std": 1.4235436916351318, "sampling/importance_sampling_ratio/max": 0.3135051429271698, "sampling/importance_sampling_ratio/mean": 0.15433582663536072, "sampling/importance_sampling_ratio/min": 3.804081182001706e-12, "sampling/sampling_logp_difference/max": 11.021062850952148, "sampling/sampling_logp_difference/mean": 1.2246919870376587, "step": 1231, "step_time": 9.740720095989673 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 5.716738939285278, "epoch": 0.01232, "grad_norm": 0.028054388239979744, "kl": 0.5208168467506766, "learning_rate": 9.99933802342141e-06, "loss": -0.0143, "step": 1232, "step_time": 5.195678661995771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1425.0, "completions/max_terminated_length": 1425.0, "completions/mean_length": 287.96875, "completions/mean_terminated_length": 287.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.457605242729187, "epoch": 0.01233, "frac_reward_zero_std": 0.5, "grad_norm": 0.023166202008724213, "kl": 0.605684332549572, "learning_rate": 9.999336916007016e-06, "loss": -0.0005, "num_tokens": 27936649.0, "reward": 1.3214423656463623, "reward_std": 1.0395777225494385, "rewards/rollout_reward_func/mean": 1.3214423656463623, "rewards/rollout_reward_func/std": 1.5526230335235596, "sampling/importance_sampling_ratio/max": 0.5638123750686646, "sampling/importance_sampling_ratio/mean": 0.30974388122558594, "sampling/importance_sampling_ratio/min": 8.31959201864522e-17, "sampling/sampling_logp_difference/max": 2.792508363723755, "sampling/sampling_logp_difference/mean": 1.1314940452575684, "step": 1233, "step_time": 10.279013917002885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.466507822275162, "epoch": 0.01234, "grad_norm": 0.023404676467180252, "kl": 0.6030097156763077, "learning_rate": 9.999335807667186e-06, "loss": -0.0005, "step": 1234, "step_time": 5.652128688994708 }, { "clip_ratio/high_max": 0.03645833395421505, "clip_ratio/high_mean": 0.018229166977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018229166977107525, "completions/clipped_ratio": 0.03125, "completions/max_length": 1383.0, "completions/max_terminated_length": 1383.0, "completions/mean_length": 322.625, "completions/mean_terminated_length": 311.8709716796875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.775711297988892, "epoch": 0.01235, "frac_reward_zero_std": 0.0, "grad_norm": 0.06814596056938171, "kl": 0.5158127257600427, "learning_rate": 9.999334698401922e-06, "loss": -0.0054, "num_tokens": 27981235.0, "reward": 0.5722406506538391, "reward_std": 1.2431797981262207, "rewards/rollout_reward_func/mean": 0.5722406506538391, "rewards/rollout_reward_func/std": 1.4040403366088867, "sampling/importance_sampling_ratio/max": 0.5784423351287842, "sampling/importance_sampling_ratio/mean": 0.21553874015808105, "sampling/importance_sampling_ratio/min": 1.1050500190539847e-17, "sampling/sampling_logp_difference/max": 4.4033308029174805, "sampling/sampling_logp_difference/mean": 1.1979010105133057, "step": 1235, "step_time": 10.124009378003393 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 5.84404993057251, "epoch": 0.01236, "grad_norm": 0.04420842230319977, "kl": 0.5069889472797513, "learning_rate": 9.999333588211223e-06, "loss": -0.0055, "step": 1236, "step_time": 5.542099395999685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 1088.0, "completions/max_terminated_length": 1088.0, "completions/mean_length": 377.125, "completions/mean_terminated_length": 377.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.682982325553894, "epoch": 0.01237, "frac_reward_zero_std": 0.25, "grad_norm": 0.003579062409698963, "kl": 0.3378196535632014, "learning_rate": 9.999332477095089e-06, "loss": -0.0069, "num_tokens": 28027727.0, "reward": 1.493861436843872, "reward_std": 1.2156527042388916, "rewards/rollout_reward_func/mean": 1.493861436843872, "rewards/rollout_reward_func/std": 1.6320122480392456, "sampling/importance_sampling_ratio/max": 0.5526310205459595, "sampling/importance_sampling_ratio/mean": 0.18632632493972778, "sampling/importance_sampling_ratio/min": 5.3104059344333265e-11, "sampling/sampling_logp_difference/max": 10.00079345703125, "sampling/sampling_logp_difference/mean": 1.5167968273162842, "step": 1237, "step_time": 9.864144601004227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 6.679121017456055, "epoch": 0.01238, "grad_norm": 0.003620097180828452, "kl": 0.3386197341606021, "learning_rate": 9.99933136505352e-06, "loss": -0.0069, "step": 1238, "step_time": 5.134064296998986 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 372.9375, "completions/mean_terminated_length": 372.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.999027132987976, "epoch": 0.01239, "frac_reward_zero_std": 0.0, "grad_norm": 0.024439262226223946, "kl": 0.29292130935937166, "learning_rate": 9.999330252086517e-06, "loss": -0.0098, "num_tokens": 28075519.0, "reward": 0.6820250749588013, "reward_std": 1.3841365575790405, "rewards/rollout_reward_func/mean": 0.6820250749588013, "rewards/rollout_reward_func/std": 1.4281363487243652, "sampling/importance_sampling_ratio/max": 0.5623171329498291, "sampling/importance_sampling_ratio/mean": 0.09724774956703186, "sampling/importance_sampling_ratio/min": 3.473952006061154e-07, "sampling/sampling_logp_difference/max": 4.467317581176758, "sampling/sampling_logp_difference/mean": 1.4517492055892944, "step": 1239, "step_time": 9.225820356004988 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 7.003680169582367, "epoch": 0.0124, "grad_norm": 0.022253919392824173, "kl": 0.29231970477849245, "learning_rate": 9.99932913819408e-06, "loss": -0.0099, "step": 1240, "step_time": 4.776299237000785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 257.96875, "completions/mean_terminated_length": 257.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.432997643947601, "epoch": 0.01241, "frac_reward_zero_std": 0.0, "grad_norm": 0.009392169304192066, "kl": 0.6206987341865897, "learning_rate": 9.99932802337621e-06, "loss": -0.0175, "num_tokens": 28115965.0, "reward": 0.9912210702896118, "reward_std": 1.0983209609985352, "rewards/rollout_reward_func/mean": 0.9912210702896118, "rewards/rollout_reward_func/std": 1.4100805521011353, "sampling/importance_sampling_ratio/max": 0.5622701644897461, "sampling/importance_sampling_ratio/mean": 0.27507710456848145, "sampling/importance_sampling_ratio/min": 3.801430281669127e-10, "sampling/sampling_logp_difference/max": 4.460948944091797, "sampling/sampling_logp_difference/mean": 1.245530128479004, "step": 1241, "step_time": 8.569925375995808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.427513301372528, "epoch": 0.01242, "grad_norm": 0.009119224734604359, "kl": 0.6191346738487482, "learning_rate": 9.999326907632905e-06, "loss": -0.0175, "step": 1242, "step_time": 5.33923727900401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1599.0, "completions/max_terminated_length": 1599.0, "completions/mean_length": 392.53125, "completions/mean_terminated_length": 393.1612854003906, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.5256770849227905, "epoch": 0.01243, "frac_reward_zero_std": 0.25, "grad_norm": 0.09153991937637329, "kl": 0.49426014721393585, "learning_rate": 9.999325790964166e-06, "loss": -0.003, "num_tokens": 28162574.0, "reward": 0.6531635522842407, "reward_std": 1.0284295082092285, "rewards/rollout_reward_func/mean": 0.6531635522842407, "rewards/rollout_reward_func/std": 1.3974789381027222, "sampling/importance_sampling_ratio/max": 0.551946759223938, "sampling/importance_sampling_ratio/mean": 0.18955785036087036, "sampling/importance_sampling_ratio/min": 3.1266662012437835e-18, "sampling/sampling_logp_difference/max": 4.269582748413086, "sampling/sampling_logp_difference/mean": 1.3591821193695068, "step": 1243, "step_time": 11.309971579001285 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 6.530195772647858, "epoch": 0.01244, "grad_norm": 0.017267197370529175, "kl": 0.4901100918650627, "learning_rate": 9.999324673369997e-06, "loss": -0.0033, "step": 1244, "step_time": 6.1884839389967965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1198.0, "completions/max_terminated_length": 1198.0, "completions/mean_length": 278.84375, "completions/mean_terminated_length": 278.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.683059245347977, "epoch": 0.01245, "frac_reward_zero_std": 0.25, "grad_norm": 0.020566176623106003, "kl": 0.5407955814152956, "learning_rate": 9.999323554850393e-06, "loss": -0.0166, "num_tokens": 28205726.0, "reward": 1.2218866348266602, "reward_std": 1.1269233226776123, "rewards/rollout_reward_func/mean": 1.2218866348266602, "rewards/rollout_reward_func/std": 1.3228833675384521, "sampling/importance_sampling_ratio/max": 0.5572207570075989, "sampling/importance_sampling_ratio/mean": 0.22783440351486206, "sampling/importance_sampling_ratio/min": 1.5572707070532488e-06, "sampling/sampling_logp_difference/max": 4.121526718139648, "sampling/sampling_logp_difference/mean": 1.1854422092437744, "step": 1245, "step_time": 9.738343960001657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.67211526632309, "epoch": 0.01246, "grad_norm": 0.02129148691892624, "kl": 0.5406066849827766, "learning_rate": 9.999322435405358e-06, "loss": -0.0166, "step": 1246, "step_time": 5.248405594003998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 283.96875, "completions/mean_terminated_length": 283.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.924357712268829, "epoch": 0.01247, "frac_reward_zero_std": 0.25, "grad_norm": 0.04871193319559097, "kl": 0.6205619592219591, "learning_rate": 9.99932131503489e-06, "loss": -0.0159, "num_tokens": 28250709.0, "reward": 1.49161696434021, "reward_std": 1.1837902069091797, "rewards/rollout_reward_func/mean": 1.49161696434021, "rewards/rollout_reward_func/std": 1.414755940437317, "sampling/importance_sampling_ratio/max": 0.5571900010108948, "sampling/importance_sampling_ratio/mean": 0.24457336962223053, "sampling/importance_sampling_ratio/min": 4.422561232786393e-06, "sampling/sampling_logp_difference/max": 2.832820415496826, "sampling/sampling_logp_difference/mean": 0.8992977142333984, "step": 1247, "step_time": 8.439303618994018 }, { "clip_ratio/high_max": 0.03645833395421505, "clip_ratio/high_mean": 0.018229166977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018229166977107525, "entropy": 4.930486351251602, "epoch": 0.01248, "grad_norm": 0.016979482024908066, "kl": 0.6219927612692118, "learning_rate": 9.99932019373899e-06, "loss": -0.0159, "step": 1248, "step_time": 5.094411625999783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1042.0, "completions/max_terminated_length": 1042.0, "completions/mean_length": 181.53125, "completions/mean_terminated_length": 181.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.99153459072113, "epoch": 0.01249, "frac_reward_zero_std": 0.25, "grad_norm": 0.03137471526861191, "kl": 0.4810411036014557, "learning_rate": 9.99931907151766e-06, "loss": -0.0142, "num_tokens": 28290184.0, "reward": 0.4122195243835449, "reward_std": 1.1582716703414917, "rewards/rollout_reward_func/mean": 0.4122195243835449, "rewards/rollout_reward_func/std": 1.6624698638916016, "sampling/importance_sampling_ratio/max": 0.5531143546104431, "sampling/importance_sampling_ratio/mean": 0.25879552960395813, "sampling/importance_sampling_ratio/min": 1.2182003672478459e-07, "sampling/sampling_logp_difference/max": 2.9779953956604004, "sampling/sampling_logp_difference/mean": 1.2558033466339111, "step": 1249, "step_time": 8.70790616699378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 6.004633665084839, "epoch": 0.0125, "grad_norm": 0.030706537887454033, "kl": 0.47697286680340767, "learning_rate": 9.999317948370898e-06, "loss": -0.0142, "step": 1250, "step_time": 4.893185663000622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 194.0, "completions/mean_terminated_length": 194.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.194969594478607, "epoch": 0.01251, "frac_reward_zero_std": 0.5, "grad_norm": 0.007594864349812269, "kl": 0.5502069257199764, "learning_rate": 9.999316824298703e-06, "loss": -0.0085, "num_tokens": 28327659.0, "reward": 1.3849120140075684, "reward_std": 0.6935402154922485, "rewards/rollout_reward_func/mean": 1.3849120140075684, "rewards/rollout_reward_func/std": 1.193963646888733, "sampling/importance_sampling_ratio/max": 0.56248539686203, "sampling/importance_sampling_ratio/mean": 0.31256189942359924, "sampling/importance_sampling_ratio/min": 4.118439392186701e-05, "sampling/sampling_logp_difference/max": 2.9355390071868896, "sampling/sampling_logp_difference/mean": 0.9198457598686218, "step": 1251, "step_time": 8.840095081992331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.195997536182404, "epoch": 0.01252, "grad_norm": 0.006998794618993998, "kl": 0.5478282943367958, "learning_rate": 9.999315699301079e-06, "loss": -0.0085, "step": 1252, "step_time": 4.710600045993488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 71.5625, "completions/mean_terminated_length": 71.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.233216643333435, "epoch": 0.01253, "frac_reward_zero_std": 0.5, "grad_norm": 0.015046589076519012, "kl": 0.6678053103387356, "learning_rate": 9.999314573378024e-06, "loss": -0.0025, "num_tokens": 28362123.0, "reward": 0.9868353009223938, "reward_std": 0.5033491849899292, "rewards/rollout_reward_func/mean": 0.9868353009223938, "rewards/rollout_reward_func/std": 1.353135108947754, "sampling/importance_sampling_ratio/max": 0.5759463906288147, "sampling/importance_sampling_ratio/mean": 0.3538162112236023, "sampling/importance_sampling_ratio/min": 5.894549293117279e-09, "sampling/sampling_logp_difference/max": 10.259727478027344, "sampling/sampling_logp_difference/mean": 1.258884310722351, "step": 1253, "step_time": 6.7228193010014365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.226240754127502, "epoch": 0.01254, "grad_norm": 0.015221587382256985, "kl": 0.6678277254104614, "learning_rate": 9.999313446529542e-06, "loss": -0.0025, "step": 1254, "step_time": 4.197052766001434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 381.3125, "completions/mean_terminated_length": 381.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.132582306861877, "epoch": 0.01255, "frac_reward_zero_std": 0.0, "grad_norm": 0.02982637658715248, "kl": 0.5158860664814711, "learning_rate": 9.999312318755627e-06, "loss": -0.0184, "num_tokens": 28409692.0, "reward": 0.8889142870903015, "reward_std": 1.0766644477844238, "rewards/rollout_reward_func/mean": 0.8889142870903015, "rewards/rollout_reward_func/std": 1.3653488159179688, "sampling/importance_sampling_ratio/max": 0.5504457950592041, "sampling/importance_sampling_ratio/mean": 0.1952269822359085, "sampling/importance_sampling_ratio/min": 8.436097415250032e-13, "sampling/sampling_logp_difference/max": 4.517746925354004, "sampling/sampling_logp_difference/mean": 1.3088767528533936, "step": 1255, "step_time": 8.73394730900327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.134204000234604, "epoch": 0.01256, "grad_norm": 0.028330542147159576, "kl": 0.5157874161377549, "learning_rate": 9.999311190056283e-06, "loss": -0.0184, "step": 1256, "step_time": 5.215178773003572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1431.0, "completions/max_terminated_length": 1180.0, "completions/mean_length": 524.0, "completions/mean_terminated_length": 494.7419128417969, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.524254351854324, "epoch": 0.01257, "frac_reward_zero_std": 0.25, "grad_norm": 0.03558873385190964, "kl": 0.5794314686208963, "learning_rate": 9.999310060431508e-06, "loss": -0.0158, "num_tokens": 28462496.0, "reward": 1.66618013381958, "reward_std": 1.3299980163574219, "rewards/rollout_reward_func/mean": 1.66618013381958, "rewards/rollout_reward_func/std": 1.4808955192565918, "sampling/importance_sampling_ratio/max": 0.5732946991920471, "sampling/importance_sampling_ratio/mean": 0.22156882286071777, "sampling/importance_sampling_ratio/min": 1.5745889856226254e-16, "sampling/sampling_logp_difference/max": 4.077667236328125, "sampling/sampling_logp_difference/mean": 1.138596534729004, "step": 1257, "step_time": 10.439132534000237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.526570022106171, "epoch": 0.01258, "grad_norm": 0.03406631574034691, "kl": 0.5791141632944345, "learning_rate": 9.999308929881305e-06, "loss": -0.0159, "step": 1258, "step_time": 5.7440859579983226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1427.0, "completions/max_terminated_length": 1427.0, "completions/mean_length": 589.5, "completions/mean_terminated_length": 597.8709716796875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.016176909208298, "epoch": 0.01259, "frac_reward_zero_std": 0.0, "grad_norm": 0.08154747635126114, "kl": 0.3432694599032402, "learning_rate": 9.999307798405675e-06, "loss": -0.01, "num_tokens": 28517880.0, "reward": 1.0154134035110474, "reward_std": 1.186349868774414, "rewards/rollout_reward_func/mean": 1.0154134035110474, "rewards/rollout_reward_func/std": 1.6437877416610718, "sampling/importance_sampling_ratio/max": 0.35756170749664307, "sampling/importance_sampling_ratio/mean": 0.0774485245347023, "sampling/importance_sampling_ratio/min": 7.746355095150648e-07, "sampling/sampling_logp_difference/max": 3.7437210083007812, "sampling/sampling_logp_difference/mean": 1.3143359422683716, "step": 1259, "step_time": 10.786824537000939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 7.018782556056976, "epoch": 0.0126, "grad_norm": 0.010443620383739471, "kl": 0.3448371160775423, "learning_rate": 9.999306666004616e-06, "loss": -0.0103, "step": 1260, "step_time": 6.450489371000003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 93.34375, "completions/mean_terminated_length": 81.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.720948934555054, "epoch": 0.01261, "frac_reward_zero_std": 0.5, "grad_norm": 0.017517825588583946, "kl": 0.7808735743165016, "learning_rate": 9.999305532678127e-06, "loss": -0.0051, "num_tokens": 28553492.0, "reward": 1.417100191116333, "reward_std": 0.745186448097229, "rewards/rollout_reward_func/mean": 1.417100191116333, "rewards/rollout_reward_func/std": 1.3126811981201172, "sampling/importance_sampling_ratio/max": 0.560767412185669, "sampling/importance_sampling_ratio/mean": 0.37763461470603943, "sampling/importance_sampling_ratio/min": 3.381975864377904e-30, "sampling/sampling_logp_difference/max": 4.709599018096924, "sampling/sampling_logp_difference/mean": 1.1304707527160645, "step": 1261, "step_time": 7.302991504002421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.717108726501465, "epoch": 0.01262, "grad_norm": 0.017527727410197258, "kl": 0.7778350785374641, "learning_rate": 9.999304398426211e-06, "loss": -0.0051, "step": 1262, "step_time": 4.288609065992205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 195.3125, "completions/mean_terminated_length": 201.09677124023438, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.28681743144989, "epoch": 0.01263, "frac_reward_zero_std": 0.25, "grad_norm": 0.05735279619693756, "kl": 0.4837900139391422, "learning_rate": 9.999303263248869e-06, "loss": -0.0161, "num_tokens": 28593169.0, "reward": 0.6008663177490234, "reward_std": 1.102199673652649, "rewards/rollout_reward_func/mean": 0.6008663177490234, "rewards/rollout_reward_func/std": 1.4366779327392578, "sampling/importance_sampling_ratio/max": 0.560916543006897, "sampling/importance_sampling_ratio/mean": 0.3027007579803467, "sampling/importance_sampling_ratio/min": 5.836844962914256e-08, "sampling/sampling_logp_difference/max": 4.898800373077393, "sampling/sampling_logp_difference/mean": 1.0483973026275635, "step": 1263, "step_time": 7.928480324000702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 5.292507588863373, "epoch": 0.01264, "grad_norm": 0.002762816147878766, "kl": 0.48152952268719673, "learning_rate": 9.999302127146098e-06, "loss": -0.0161, "step": 1264, "step_time": 4.48774496400074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 139.75, "completions/mean_terminated_length": 139.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.21740061044693, "epoch": 0.01265, "frac_reward_zero_std": 0.25, "grad_norm": 0.013935755006968975, "kl": 0.3716430012136698, "learning_rate": 9.999300990117899e-06, "loss": -0.0075, "num_tokens": 28631687.0, "reward": 0.7573786973953247, "reward_std": 0.9946650266647339, "rewards/rollout_reward_func/mean": 0.7573786973953247, "rewards/rollout_reward_func/std": 1.650612235069275, "sampling/importance_sampling_ratio/max": 0.5577391386032104, "sampling/importance_sampling_ratio/mean": 0.2985667288303375, "sampling/importance_sampling_ratio/min": 4.820320254061983e-10, "sampling/sampling_logp_difference/max": 2.686504602432251, "sampling/sampling_logp_difference/mean": 1.0203052759170532, "step": 1265, "step_time": 7.105932230002509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.210313647985458, "epoch": 0.01266, "grad_norm": 0.012988295406103134, "kl": 0.36989229917526245, "learning_rate": 9.999299852164274e-06, "loss": -0.0074, "step": 1266, "step_time": 4.262737096007186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 177.46875, "completions/mean_terminated_length": 177.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.0417211055755615, "epoch": 0.01267, "frac_reward_zero_std": 0.5, "grad_norm": 0.018022416159510612, "kl": 0.5674188546836376, "learning_rate": 9.999298713285224e-06, "loss": -0.0148, "num_tokens": 28669614.0, "reward": 1.2854478359222412, "reward_std": 0.7746870517730713, "rewards/rollout_reward_func/mean": 1.2854478359222412, "rewards/rollout_reward_func/std": 1.2832592725753784, "sampling/importance_sampling_ratio/max": 0.5576980113983154, "sampling/importance_sampling_ratio/mean": 0.33247900009155273, "sampling/importance_sampling_ratio/min": 4.7246358008123934e-06, "sampling/sampling_logp_difference/max": 2.8045060634613037, "sampling/sampling_logp_difference/mean": 0.9917410612106323, "step": 1267, "step_time": 6.806551175995992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.047150790691376, "epoch": 0.01268, "grad_norm": 0.01758064143359661, "kl": 0.5665433220565319, "learning_rate": 9.999297573480746e-06, "loss": -0.0148, "step": 1268, "step_time": 4.255846837011632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1712.0, "completions/max_terminated_length": 1712.0, "completions/mean_length": 621.40625, "completions/mean_terminated_length": 621.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.65878027677536, "epoch": 0.01269, "frac_reward_zero_std": 0.0, "grad_norm": 0.06689409911632538, "kl": 0.41745931189507246, "learning_rate": 9.999296432750842e-06, "loss": -0.0234, "num_tokens": 28726941.0, "reward": 0.7101728916168213, "reward_std": 1.1844956874847412, "rewards/rollout_reward_func/mean": 0.7101728916168213, "rewards/rollout_reward_func/std": 1.330950379371643, "sampling/importance_sampling_ratio/max": 0.3344186842441559, "sampling/importance_sampling_ratio/mean": 0.10171861201524734, "sampling/importance_sampling_ratio/min": 1.1182374493046154e-07, "sampling/sampling_logp_difference/max": 2.7509725093841553, "sampling/sampling_logp_difference/mean": 1.2858176231384277, "step": 1269, "step_time": 11.489654336997773 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 6.647192478179932, "epoch": 0.0127, "grad_norm": 0.011485407128930092, "kl": 0.41137355007231236, "learning_rate": 9.999295291095512e-06, "loss": -0.0235, "step": 1270, "step_time": 6.461558419996436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1116.0, "completions/max_terminated_length": 1116.0, "completions/mean_length": 379.40625, "completions/mean_terminated_length": 379.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.021535634994507, "epoch": 0.01271, "frac_reward_zero_std": 0.25, "grad_norm": 0.062299199402332306, "kl": 0.3560728430747986, "learning_rate": 9.999294148514757e-06, "loss": -0.0038, "num_tokens": 28774534.0, "reward": 0.9950019717216492, "reward_std": 1.1373522281646729, "rewards/rollout_reward_func/mean": 0.9950019717216492, "rewards/rollout_reward_func/std": 1.4137123823165894, "sampling/importance_sampling_ratio/max": 0.5642013549804688, "sampling/importance_sampling_ratio/mean": 0.2036329060792923, "sampling/importance_sampling_ratio/min": 1.7547654440619453e-09, "sampling/sampling_logp_difference/max": 4.269315719604492, "sampling/sampling_logp_difference/mean": 1.2215240001678467, "step": 1271, "step_time": 9.727402979006001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.02083333395421505, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 6.026199012994766, "epoch": 0.01272, "grad_norm": 0.016748694702982903, "kl": 0.357189130038023, "learning_rate": 9.999293005008579e-06, "loss": -0.004, "step": 1272, "step_time": 5.6884459299981245 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1105.0, "completions/max_terminated_length": 1105.0, "completions/mean_length": 183.0625, "completions/mean_terminated_length": 183.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.917481034994125, "epoch": 0.01273, "frac_reward_zero_std": 0.25, "grad_norm": 0.01602877490222454, "kl": 0.6538842618465424, "learning_rate": 9.999291860576973e-06, "loss": -0.0162, "num_tokens": 28814234.0, "reward": 1.0109809637069702, "reward_std": 1.080259919166565, "rewards/rollout_reward_func/mean": 1.0109809637069702, "rewards/rollout_reward_func/std": 1.3544988632202148, "sampling/importance_sampling_ratio/max": 0.5448901057243347, "sampling/importance_sampling_ratio/mean": 0.2178734689950943, "sampling/importance_sampling_ratio/min": 8.18132367408353e-12, "sampling/sampling_logp_difference/max": 3.93290376663208, "sampling/sampling_logp_difference/mean": 1.0055749416351318, "step": 1273, "step_time": 9.509357283994177 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 5.914509296417236, "epoch": 0.01274, "grad_norm": 0.016276726499199867, "kl": 0.6559109427034855, "learning_rate": 9.999290715219945e-06, "loss": -0.0162, "step": 1274, "step_time": 5.044193935995281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1544.0, "completions/max_terminated_length": 1544.0, "completions/mean_length": 385.0625, "completions/mean_terminated_length": 385.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.674300909042358, "epoch": 0.01275, "frac_reward_zero_std": 0.25, "grad_norm": 0.011111102998256683, "kl": 0.4184967502951622, "learning_rate": 9.999289568937491e-06, "loss": -0.0181, "num_tokens": 28861154.0, "reward": 1.1825052499771118, "reward_std": 1.1792991161346436, "rewards/rollout_reward_func/mean": 1.1825052499771118, "rewards/rollout_reward_func/std": 1.4722572565078735, "sampling/importance_sampling_ratio/max": 0.560204029083252, "sampling/importance_sampling_ratio/mean": 0.2181498110294342, "sampling/importance_sampling_ratio/min": 1.826088919187896e-06, "sampling/sampling_logp_difference/max": 2.7044219970703125, "sampling/sampling_logp_difference/mean": 1.0674798488616943, "step": 1275, "step_time": 10.658990202002315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.6825332045555115, "epoch": 0.01276, "grad_norm": 0.011691211722791195, "kl": 0.41860110126435757, "learning_rate": 9.999288421729613e-06, "loss": -0.0181, "step": 1276, "step_time": 5.967149952004547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1690.0, "completions/max_terminated_length": 1690.0, "completions/mean_length": 432.84375, "completions/mean_terminated_length": 442.17242431640625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.315511047840118, "epoch": 0.01277, "frac_reward_zero_std": 0.0, "grad_norm": 0.029839318245649338, "kl": 0.35377088002860546, "learning_rate": 9.999287273596313e-06, "loss": -0.0201, "num_tokens": 28909663.0, "reward": 0.8517817258834839, "reward_std": 1.5113617181777954, "rewards/rollout_reward_func/mean": 0.8517817258834839, "rewards/rollout_reward_func/std": 1.5343589782714844, "sampling/importance_sampling_ratio/max": 0.5554267764091492, "sampling/importance_sampling_ratio/mean": 0.18123696744441986, "sampling/importance_sampling_ratio/min": 5.021697847048395e-14, "sampling/sampling_logp_difference/max": 12.084198951721191, "sampling/sampling_logp_difference/mean": 1.3362774848937988, "step": 1277, "step_time": 11.788432569996075 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 6.312483072280884, "epoch": 0.01278, "grad_norm": 0.01315068919211626, "kl": 0.34771119616925716, "learning_rate": 9.999286124537588e-06, "loss": -0.0202, "step": 1278, "step_time": 6.305379301000357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 164.09375, "completions/mean_terminated_length": 164.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.825293481349945, "epoch": 0.01279, "frac_reward_zero_std": 0.25, "grad_norm": 0.016017334535717964, "kl": 0.8307618796825409, "learning_rate": 9.999284974553441e-06, "loss": -0.0179, "num_tokens": 28949109.0, "reward": 0.873508095741272, "reward_std": 0.846281111240387, "rewards/rollout_reward_func/mean": 0.873508095741272, "rewards/rollout_reward_func/std": 1.5295536518096924, "sampling/importance_sampling_ratio/max": 0.6435548067092896, "sampling/importance_sampling_ratio/mean": 0.3176412284374237, "sampling/importance_sampling_ratio/min": 7.67742094467394e-05, "sampling/sampling_logp_difference/max": 4.685308456420898, "sampling/sampling_logp_difference/mean": 0.8965935707092285, "step": 1279, "step_time": 7.8503645209966635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.820867419242859, "epoch": 0.0128, "grad_norm": 0.016092756763100624, "kl": 0.8300297036767006, "learning_rate": 9.99928382364387e-06, "loss": -0.0179, "step": 1280, "step_time": 3.964738089001912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1117.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 277.6875, "completions/mean_terminated_length": 286.1290283203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.365046054124832, "epoch": 0.01281, "frac_reward_zero_std": 0.25, "grad_norm": 0.013009244576096535, "kl": 0.5308453291654587, "learning_rate": 9.999282671808878e-06, "loss": -0.0112, "num_tokens": 28993150.0, "reward": 1.100398063659668, "reward_std": 0.9246716499328613, "rewards/rollout_reward_func/mean": 1.100398063659668, "rewards/rollout_reward_func/std": 1.3653056621551514, "sampling/importance_sampling_ratio/max": 0.5619791150093079, "sampling/importance_sampling_ratio/mean": 0.22595813870429993, "sampling/importance_sampling_ratio/min": 5.168123896932232e-22, "sampling/sampling_logp_difference/max": 13.422577857971191, "sampling/sampling_logp_difference/mean": 1.1761810779571533, "step": 1281, "step_time": 9.34523250100392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.366683334112167, "epoch": 0.01282, "grad_norm": 0.012140980921685696, "kl": 0.5365070402622223, "learning_rate": 9.999281519048462e-06, "loss": -0.0112, "step": 1282, "step_time": 5.2170058039992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.0, "completions/max_length": 1354.0, "completions/max_terminated_length": 1354.0, "completions/mean_length": 659.78125, "completions/mean_terminated_length": 659.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.784712016582489, "epoch": 0.01283, "frac_reward_zero_std": 0.0, "grad_norm": 0.0909520834684372, "kl": 0.380879832431674, "learning_rate": 9.999280365362623e-06, "loss": -0.0132, "num_tokens": 29051225.0, "reward": 0.6342990398406982, "reward_std": 1.5149867534637451, "rewards/rollout_reward_func/mean": 0.6342990398406982, "rewards/rollout_reward_func/std": 1.6118812561035156, "sampling/importance_sampling_ratio/max": 0.38386473059654236, "sampling/importance_sampling_ratio/mean": 0.0880902111530304, "sampling/importance_sampling_ratio/min": 1.3066751014534361e-09, "sampling/sampling_logp_difference/max": 12.384589195251465, "sampling/sampling_logp_difference/mean": 1.4095749855041504, "step": 1283, "step_time": 10.947372463000647 }, { "clip_ratio/high_max": 0.02142857201397419, "clip_ratio/high_mean": 0.010714286006987095, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015178571920841932, "entropy": 6.789349049329758, "epoch": 0.01284, "grad_norm": 0.04266231507062912, "kl": 0.3798297233879566, "learning_rate": 9.999279210751366e-06, "loss": -0.0134, "step": 1284, "step_time": 5.749322121999285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1280.0, "completions/max_terminated_length": 1280.0, "completions/mean_length": 399.78125, "completions/mean_terminated_length": 399.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.7930232882499695, "epoch": 0.01285, "frac_reward_zero_std": 0.25, "grad_norm": 0.004124908242374659, "kl": 0.41175515577197075, "learning_rate": 9.999278055214684e-06, "loss": -0.0156, "num_tokens": 29099751.0, "reward": 1.2199586629867554, "reward_std": 1.2363288402557373, "rewards/rollout_reward_func/mean": 1.2199586629867554, "rewards/rollout_reward_func/std": 1.4753690958023071, "sampling/importance_sampling_ratio/max": 0.554396390914917, "sampling/importance_sampling_ratio/mean": 0.20632849633693695, "sampling/importance_sampling_ratio/min": 2.6505950767585773e-09, "sampling/sampling_logp_difference/max": 2.5524802207946777, "sampling/sampling_logp_difference/mean": 1.1334514617919922, "step": 1285, "step_time": 10.717994931994326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.790659546852112, "epoch": 0.01286, "grad_norm": 0.0035199716221541166, "kl": 0.4114386606961489, "learning_rate": 9.999276898752583e-06, "loss": -0.0157, "step": 1286, "step_time": 5.600323639006092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1408.0, "completions/max_terminated_length": 1408.0, "completions/mean_length": 402.59375, "completions/mean_terminated_length": 402.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.988836824893951, "epoch": 0.01287, "frac_reward_zero_std": 0.25, "grad_norm": 0.040121208876371384, "kl": 0.3938967874273658, "learning_rate": 9.99927574136506e-06, "loss": -0.0184, "num_tokens": 29147314.0, "reward": 1.4158787727355957, "reward_std": 1.2731847763061523, "rewards/rollout_reward_func/mean": 1.4158787727355957, "rewards/rollout_reward_func/std": 1.440799593925476, "sampling/importance_sampling_ratio/max": 0.5627511143684387, "sampling/importance_sampling_ratio/mean": 0.21529799699783325, "sampling/importance_sampling_ratio/min": 3.231858514141095e-08, "sampling/sampling_logp_difference/max": 4.2826337814331055, "sampling/sampling_logp_difference/mean": 1.172183632850647, "step": 1287, "step_time": 10.43449234799482 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 5.972698092460632, "epoch": 0.01288, "grad_norm": 0.04459506645798683, "kl": 0.3922841642051935, "learning_rate": 9.999274583052117e-06, "loss": -0.0186, "step": 1288, "step_time": 5.8021535750049225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1045.0, "completions/max_terminated_length": 1045.0, "completions/mean_length": 313.0625, "completions/mean_terminated_length": 313.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.346322596073151, "epoch": 0.01289, "frac_reward_zero_std": 0.5, "grad_norm": 0.0030674319714307785, "kl": 0.5970137529075146, "learning_rate": 9.999273423813754e-06, "loss": -0.0086, "num_tokens": 29190213.0, "reward": 1.0673143863677979, "reward_std": 0.5591222047805786, "rewards/rollout_reward_func/mean": 1.0673143863677979, "rewards/rollout_reward_func/std": 1.2099952697753906, "sampling/importance_sampling_ratio/max": 0.5620333552360535, "sampling/importance_sampling_ratio/mean": 0.30948930978775024, "sampling/importance_sampling_ratio/min": 1.167869467799676e-09, "sampling/sampling_logp_difference/max": 4.480216979980469, "sampling/sampling_logp_difference/mean": 1.147169589996338, "step": 1289, "step_time": 9.330719205005153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.34588959813118, "epoch": 0.0129, "grad_norm": 0.0028302099090069532, "kl": 0.595471516251564, "learning_rate": 9.99927226364997e-06, "loss": -0.0086, "step": 1290, "step_time": 4.9823572900022555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1294.0, "completions/max_terminated_length": 1294.0, "completions/mean_length": 479.375, "completions/mean_terminated_length": 479.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.989854514598846, "epoch": 0.01291, "frac_reward_zero_std": 0.0, "grad_norm": 0.050282061100006104, "kl": 0.37175755575299263, "learning_rate": 9.999271102560767e-06, "loss": -0.0223, "num_tokens": 29242623.0, "reward": 1.1522055864334106, "reward_std": 1.6243739128112793, "rewards/rollout_reward_func/mean": 1.1522055864334106, "rewards/rollout_reward_func/std": 1.5882923603057861, "sampling/importance_sampling_ratio/max": 0.3755035102367401, "sampling/importance_sampling_ratio/mean": 0.08231070637702942, "sampling/importance_sampling_ratio/min": 9.754356824487331e-07, "sampling/sampling_logp_difference/max": 4.509121417999268, "sampling/sampling_logp_difference/mean": 1.391479253768921, "step": 1291, "step_time": 10.204497875001834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 6.990877866744995, "epoch": 0.01292, "grad_norm": 0.04541325941681862, "kl": 0.37149628158658743, "learning_rate": 9.999269940546145e-06, "loss": -0.0224, "step": 1292, "step_time": 5.5139370469987625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 267.21875, "completions/mean_terminated_length": 267.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.8479136526584625, "epoch": 0.01293, "frac_reward_zero_std": 0.25, "grad_norm": 0.05621030181646347, "kl": 0.4725974202156067, "learning_rate": 9.999268777606102e-06, "loss": -0.0115, "num_tokens": 29285289.0, "reward": 0.9556488394737244, "reward_std": 1.3437451124191284, "rewards/rollout_reward_func/mean": 0.9556488394737244, "rewards/rollout_reward_func/std": 1.6670951843261719, "sampling/importance_sampling_ratio/max": 0.5633586645126343, "sampling/importance_sampling_ratio/mean": 0.22834666073322296, "sampling/importance_sampling_ratio/min": 9.030889751293625e-12, "sampling/sampling_logp_difference/max": 9.621313095092773, "sampling/sampling_logp_difference/mean": 1.1840581893920898, "step": 1293, "step_time": 8.393750760005787 }, { "clip_ratio/high_max": 0.02864583395421505, "clip_ratio/high_mean": 0.014322916977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014322916977107525, "entropy": 5.84502112865448, "epoch": 0.01294, "grad_norm": 0.04229451343417168, "kl": 0.4706569314002991, "learning_rate": 9.999267613740642e-06, "loss": -0.0118, "step": 1294, "step_time": 4.457333662001474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 347.34375, "completions/mean_terminated_length": 347.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.860666275024414, "epoch": 0.01295, "frac_reward_zero_std": 0.25, "grad_norm": 0.02788400650024414, "kl": 0.3092774571850896, "learning_rate": 9.999266448949762e-06, "loss": -0.0135, "num_tokens": 29331635.0, "reward": 1.317413091659546, "reward_std": 1.1330890655517578, "rewards/rollout_reward_func/mean": 1.317413091659546, "rewards/rollout_reward_func/std": 1.4288612604141235, "sampling/importance_sampling_ratio/max": 0.554878830909729, "sampling/importance_sampling_ratio/mean": 0.19283923506736755, "sampling/importance_sampling_ratio/min": 1.130744749808765e-20, "sampling/sampling_logp_difference/max": 10.244294166564941, "sampling/sampling_logp_difference/mean": 1.4712411165237427, "step": 1295, "step_time": 9.07344253300107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.8434136509895325, "epoch": 0.01296, "grad_norm": 0.02808866836130619, "kl": 0.3083897200413048, "learning_rate": 9.999265283233466e-06, "loss": -0.0135, "step": 1296, "step_time": 5.1339628550013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1027.0, "completions/max_terminated_length": 1027.0, "completions/mean_length": 347.65625, "completions/mean_terminated_length": 358.3548278808594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.712072163820267, "epoch": 0.01297, "frac_reward_zero_std": 0.25, "grad_norm": 0.0071737924590706825, "kl": 0.3548208028078079, "learning_rate": 9.99926411659175e-06, "loss": -0.0112, "num_tokens": 29377784.0, "reward": 0.781287670135498, "reward_std": 1.34067702293396, "rewards/rollout_reward_func/mean": 0.781287670135498, "rewards/rollout_reward_func/std": 1.6865108013153076, "sampling/importance_sampling_ratio/max": 0.5645146369934082, "sampling/importance_sampling_ratio/mean": 0.20957261323928833, "sampling/importance_sampling_ratio/min": 8.220840674866281e-11, "sampling/sampling_logp_difference/max": 3.8672826290130615, "sampling/sampling_logp_difference/mean": 1.2247272729873657, "step": 1297, "step_time": 9.306762409996736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.7095727622509, "epoch": 0.01298, "grad_norm": 0.007385487202554941, "kl": 0.3545833369717002, "learning_rate": 9.999262949024617e-06, "loss": -0.0111, "step": 1298, "step_time": 4.994938691997959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1685.0, "completions/max_terminated_length": 1685.0, "completions/mean_length": 449.75, "completions/mean_terminated_length": 449.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.493200719356537, "epoch": 0.01299, "frac_reward_zero_std": 0.25, "grad_norm": 0.012038108892738819, "kl": 0.4051217194646597, "learning_rate": 9.999261780532066e-06, "loss": -0.0116, "num_tokens": 29426157.0, "reward": 0.9871468544006348, "reward_std": 1.0361173152923584, "rewards/rollout_reward_func/mean": 0.9871468544006348, "rewards/rollout_reward_func/std": 1.34160315990448, "sampling/importance_sampling_ratio/max": 0.556853175163269, "sampling/importance_sampling_ratio/mean": 0.18800127506256104, "sampling/importance_sampling_ratio/min": 5.54780079653483e-08, "sampling/sampling_logp_difference/max": 2.8757314682006836, "sampling/sampling_logp_difference/mean": 1.3284605741500854, "step": 1299, "step_time": 10.970805862005363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.478481233119965, "epoch": 0.013, "grad_norm": 0.012553269043564796, "kl": 0.4052604204043746, "learning_rate": 9.999260611114098e-06, "loss": -0.0116, "step": 1300, "step_time": 6.182788722999248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1305.0, "completions/max_terminated_length": 1305.0, "completions/mean_length": 460.875, "completions/mean_terminated_length": 463.45159912109375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.301852166652679, "epoch": 0.01301, "frac_reward_zero_std": 0.0, "grad_norm": 0.009460928849875927, "kl": 0.17534253653138876, "learning_rate": 9.999259440770715e-06, "loss": -0.0203, "num_tokens": 29478149.0, "reward": 0.37704265117645264, "reward_std": 1.5782692432403564, "rewards/rollout_reward_func/mean": 0.37704265117645264, "rewards/rollout_reward_func/std": 1.6143122911453247, "sampling/importance_sampling_ratio/max": 0.553109884262085, "sampling/importance_sampling_ratio/mean": 0.07795606553554535, "sampling/importance_sampling_ratio/min": 8.361536463219466e-11, "sampling/sampling_logp_difference/max": 8.829025268554688, "sampling/sampling_logp_difference/mean": 1.543454885482788, "step": 1301, "step_time": 10.912888737995672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.29320764541626, "epoch": 0.01302, "grad_norm": 0.007361455820500851, "kl": 0.17436027899384499, "learning_rate": 9.999258269501912e-06, "loss": -0.0203, "step": 1302, "step_time": 6.012396670001181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1329.0, "completions/max_terminated_length": 1329.0, "completions/mean_length": 353.8125, "completions/mean_terminated_length": 353.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.398291856050491, "epoch": 0.01303, "frac_reward_zero_std": 0.25, "grad_norm": 0.059787947684526443, "kl": 0.5054425280541182, "learning_rate": 9.999257097307696e-06, "loss": -0.0113, "num_tokens": 29523420.0, "reward": 0.7841993570327759, "reward_std": 1.2622253894805908, "rewards/rollout_reward_func/mean": 0.7841993570327759, "rewards/rollout_reward_func/std": 1.6103897094726562, "sampling/importance_sampling_ratio/max": 0.5631797909736633, "sampling/importance_sampling_ratio/mean": 0.2775040566921234, "sampling/importance_sampling_ratio/min": 7.160937798289524e-08, "sampling/sampling_logp_difference/max": 3.110438585281372, "sampling/sampling_logp_difference/mean": 0.9731473922729492, "step": 1303, "step_time": 10.171990681999887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.401327699422836, "epoch": 0.01304, "grad_norm": 0.06278382241725922, "kl": 0.5051312446594238, "learning_rate": 9.999255924188063e-06, "loss": -0.0114, "step": 1304, "step_time": 5.587695911995979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1152.0, "completions/max_terminated_length": 1152.0, "completions/mean_length": 226.34375, "completions/mean_terminated_length": 233.1290283203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.974377274513245, "epoch": 0.01305, "frac_reward_zero_std": 0.25, "grad_norm": 0.0528084971010685, "kl": 0.3867399003356695, "learning_rate": 9.999254750143012e-06, "loss": -0.011, "num_tokens": 29565090.0, "reward": 0.5629861354827881, "reward_std": 1.0093584060668945, "rewards/rollout_reward_func/mean": 0.5629861354827881, "rewards/rollout_reward_func/std": 1.676814317703247, "sampling/importance_sampling_ratio/max": 0.5592296719551086, "sampling/importance_sampling_ratio/mean": 0.25211840867996216, "sampling/importance_sampling_ratio/min": 5.7600365449550495e-12, "sampling/sampling_logp_difference/max": 3.8727684020996094, "sampling/sampling_logp_difference/mean": 1.275923490524292, "step": 1305, "step_time": 9.075310075000743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 5.97655063867569, "epoch": 0.01306, "grad_norm": 0.006195693276822567, "kl": 0.38065222278237343, "learning_rate": 9.99925357517255e-06, "loss": -0.0111, "step": 1306, "step_time": 5.128797948003921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.0, "completions/max_length": 1052.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 340.46875, "completions/mean_terminated_length": 340.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.2350990772247314, "epoch": 0.01307, "frac_reward_zero_std": 0.25, "grad_norm": 0.032462943345308304, "kl": 0.4498656466603279, "learning_rate": 9.999252399276669e-06, "loss": -0.0146, "num_tokens": 29610353.0, "reward": 1.2964178323745728, "reward_std": 1.2758655548095703, "rewards/rollout_reward_func/mean": 1.2964178323745728, "rewards/rollout_reward_func/std": 1.5469260215759277, "sampling/importance_sampling_ratio/max": 0.5626118779182434, "sampling/importance_sampling_ratio/mean": 0.22110985219478607, "sampling/importance_sampling_ratio/min": 1.9074276380592892e-08, "sampling/sampling_logp_difference/max": 4.5936503410339355, "sampling/sampling_logp_difference/mean": 1.0822350978851318, "step": 1307, "step_time": 10.034920660997159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.231841057538986, "epoch": 0.01308, "grad_norm": 0.03335198387503624, "kl": 0.4492984674870968, "learning_rate": 9.999251222455376e-06, "loss": -0.0147, "step": 1308, "step_time": 5.056439358002535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 381.53125, "completions/mean_terminated_length": 381.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.168699324131012, "epoch": 0.01309, "frac_reward_zero_std": 0.0, "grad_norm": 0.03892932087182999, "kl": 0.33063175715506077, "learning_rate": 9.999250044708666e-06, "loss": -0.0142, "num_tokens": 29658411.0, "reward": 0.6464420557022095, "reward_std": 1.340714454650879, "rewards/rollout_reward_func/mean": 0.6464420557022095, "rewards/rollout_reward_func/std": 1.7533153295516968, "sampling/importance_sampling_ratio/max": 0.5515179634094238, "sampling/importance_sampling_ratio/mean": 0.14892393350601196, "sampling/importance_sampling_ratio/min": 1.0008775142011928e-08, "sampling/sampling_logp_difference/max": 4.220609664916992, "sampling/sampling_logp_difference/mean": 1.1874821186065674, "step": 1309, "step_time": 9.33142131600107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 6.175828158855438, "epoch": 0.0131, "grad_norm": 0.03961436077952385, "kl": 0.3293277397751808, "learning_rate": 9.999248866036543e-06, "loss": -0.0143, "step": 1310, "step_time": 4.856734672997845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 351.28125, "completions/mean_terminated_length": 351.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.530292749404907, "epoch": 0.01311, "frac_reward_zero_std": 0.0, "grad_norm": 0.030531518161296844, "kl": 0.35452288342639804, "learning_rate": 9.999247686439005e-06, "loss": -0.0179, "num_tokens": 29705494.0, "reward": 0.4057275950908661, "reward_std": 1.4522531032562256, "rewards/rollout_reward_func/mean": 0.4057275950908661, "rewards/rollout_reward_func/std": 1.517204999923706, "sampling/importance_sampling_ratio/max": 0.5585291981697083, "sampling/importance_sampling_ratio/mean": 0.14627856016159058, "sampling/importance_sampling_ratio/min": 7.15260302700492e-14, "sampling/sampling_logp_difference/max": 4.179675102233887, "sampling/sampling_logp_difference/mean": 1.3310258388519287, "step": 1311, "step_time": 9.035280727995996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 6.522859811782837, "epoch": 0.01312, "grad_norm": 0.01368675846606493, "kl": 0.3580013853497803, "learning_rate": 9.999246505916055e-06, "loss": -0.0179, "step": 1312, "step_time": 4.979310093993263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1364.0, "completions/max_terminated_length": 1364.0, "completions/mean_length": 252.96875, "completions/mean_terminated_length": 252.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.194057285785675, "epoch": 0.01313, "frac_reward_zero_std": 0.25, "grad_norm": 0.020025936886668205, "kl": 0.5188167113810778, "learning_rate": 9.99924532446769e-06, "loss": -0.0127, "num_tokens": 29747704.0, "reward": 1.0707229375839233, "reward_std": 1.0945115089416504, "rewards/rollout_reward_func/mean": 1.0707229375839233, "rewards/rollout_reward_func/std": 1.4056187868118286, "sampling/importance_sampling_ratio/max": 0.5595279932022095, "sampling/importance_sampling_ratio/mean": 0.26250138878822327, "sampling/importance_sampling_ratio/min": 3.009955633803858e-14, "sampling/sampling_logp_difference/max": 10.412665367126465, "sampling/sampling_logp_difference/mean": 1.0709840059280396, "step": 1313, "step_time": 10.902624228994682 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.024553571827709675, "entropy": 5.172252863645554, "epoch": 0.01314, "grad_norm": 0.017831901088356972, "kl": 0.5322351809591055, "learning_rate": 9.999244142093913e-06, "loss": -0.0128, "step": 1314, "step_time": 5.703473059995304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.03125, "completions/max_length": 1503.0, "completions/max_terminated_length": 1503.0, "completions/mean_length": 355.09375, "completions/mean_terminated_length": 352.2257995605469, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.678825616836548, "epoch": 0.01315, "frac_reward_zero_std": 0.25, "grad_norm": 0.05749954655766487, "kl": 0.5355205796658993, "learning_rate": 9.999242958794724e-06, "loss": -0.014, "num_tokens": 29793450.0, "reward": 0.817656397819519, "reward_std": 1.1227424144744873, "rewards/rollout_reward_func/mean": 0.817656397819519, "rewards/rollout_reward_func/std": 1.5051565170288086, "sampling/importance_sampling_ratio/max": 0.5596403479576111, "sampling/importance_sampling_ratio/mean": 0.22456954419612885, "sampling/importance_sampling_ratio/min": 6.694730103617985e-09, "sampling/sampling_logp_difference/max": 3.8686108589172363, "sampling/sampling_logp_difference/mean": 0.9581512212753296, "step": 1315, "step_time": 11.122622261995275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "entropy": 5.698349684476852, "epoch": 0.01316, "grad_norm": 0.05700928345322609, "kl": 0.5286512263119221, "learning_rate": 9.999241774570122e-06, "loss": -0.0141, "step": 1316, "step_time": 6.03267977800715 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 943.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 213.75, "completions/mean_terminated_length": 213.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.632903873920441, "epoch": 0.01317, "frac_reward_zero_std": 0.5, "grad_norm": 0.0024759978987276554, "kl": 0.6831431090831757, "learning_rate": 9.999240589420108e-06, "loss": -0.0089, "num_tokens": 29832954.0, "reward": 1.2241171598434448, "reward_std": 0.801517128944397, "rewards/rollout_reward_func/mean": 1.2241171598434448, "rewards/rollout_reward_func/std": 1.444940447807312, "sampling/importance_sampling_ratio/max": 0.5650482773780823, "sampling/importance_sampling_ratio/mean": 0.3296271860599518, "sampling/importance_sampling_ratio/min": 3.487745198071934e-05, "sampling/sampling_logp_difference/max": 2.5452818870544434, "sampling/sampling_logp_difference/mean": 0.7861796617507935, "step": 1317, "step_time": 8.406837661001191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.641632556915283, "epoch": 0.01318, "grad_norm": 0.002519424771890044, "kl": 0.682200513780117, "learning_rate": 9.999239403344681e-06, "loss": -0.0089, "step": 1318, "step_time": 4.687689253001736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 318.6875, "completions/mean_terminated_length": 318.2903137207031, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.505165457725525, "epoch": 0.01319, "frac_reward_zero_std": 0.0, "grad_norm": 0.05152762308716774, "kl": 0.3169608027674258, "learning_rate": 9.999238216343842e-06, "loss": -0.014, "num_tokens": 29878838.0, "reward": -0.1946699321269989, "reward_std": 1.2597202062606812, "rewards/rollout_reward_func/mean": -0.1946699321269989, "rewards/rollout_reward_func/std": 1.3700114488601685, "sampling/importance_sampling_ratio/max": 0.5486357808113098, "sampling/importance_sampling_ratio/mean": 0.07345125079154968, "sampling/importance_sampling_ratio/min": 8.494493072596902e-18, "sampling/sampling_logp_difference/max": 4.817383289337158, "sampling/sampling_logp_difference/mean": 1.6892906427383423, "step": 1319, "step_time": 9.67712932699942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "entropy": 7.511152386665344, "epoch": 0.0132, "grad_norm": 0.04743393138051033, "kl": 0.3187821679748595, "learning_rate": 9.999237028417591e-06, "loss": -0.0142, "step": 1320, "step_time": 4.728217104995565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1585.0, "completions/max_terminated_length": 1585.0, "completions/mean_length": 418.125, "completions/mean_terminated_length": 418.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.369553089141846, "epoch": 0.01321, "frac_reward_zero_std": 0.0, "grad_norm": 0.01071261614561081, "kl": 0.5290125589817762, "learning_rate": 9.999235839565933e-06, "loss": -0.015, "num_tokens": 29925948.0, "reward": 0.5189638733863831, "reward_std": 1.4772584438323975, "rewards/rollout_reward_func/mean": 0.5189638733863831, "rewards/rollout_reward_func/std": 1.762911319732666, "sampling/importance_sampling_ratio/max": 0.5601078271865845, "sampling/importance_sampling_ratio/mean": 0.25421154499053955, "sampling/importance_sampling_ratio/min": 2.0303454917325325e-08, "sampling/sampling_logp_difference/max": 4.15228271484375, "sampling/sampling_logp_difference/mean": 1.2910466194152832, "step": 1321, "step_time": 10.670398177000607 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 6.376424849033356, "epoch": 0.01322, "grad_norm": 0.008822798728942871, "kl": 0.5309217609465122, "learning_rate": 9.999234649788861e-06, "loss": -0.015, "step": 1322, "step_time": 6.085168809997413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 237.28125, "completions/mean_terminated_length": 237.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.286045104265213, "epoch": 0.01323, "frac_reward_zero_std": 0.0, "grad_norm": 0.01776299811899662, "kl": 0.5605928059667349, "learning_rate": 9.99923345908638e-06, "loss": -0.0267, "num_tokens": 29968674.0, "reward": 0.6567978262901306, "reward_std": 1.3542122840881348, "rewards/rollout_reward_func/mean": 0.6567978262901306, "rewards/rollout_reward_func/std": 1.685442566871643, "sampling/importance_sampling_ratio/max": 0.5593658685684204, "sampling/importance_sampling_ratio/mean": 0.2194601446390152, "sampling/importance_sampling_ratio/min": 6.55280341164792e-14, "sampling/sampling_logp_difference/max": 4.788801670074463, "sampling/sampling_logp_difference/mean": 1.1041233539581299, "step": 1323, "step_time": 8.698610530002043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.283619552850723, "epoch": 0.01324, "grad_norm": 0.015856178477406502, "kl": 0.5607088822871447, "learning_rate": 9.999232267458488e-06, "loss": -0.0267, "step": 1324, "step_time": 5.183343311993667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1193.0, "completions/max_terminated_length": 1193.0, "completions/mean_length": 559.125, "completions/mean_terminated_length": 559.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.162852764129639, "epoch": 0.01325, "frac_reward_zero_std": 0.0, "grad_norm": 0.056339021772146225, "kl": 0.31374229956418276, "learning_rate": 9.999231074905187e-06, "loss": -0.0177, "num_tokens": 30023090.0, "reward": 0.39943063259124756, "reward_std": 1.2606122493743896, "rewards/rollout_reward_func/mean": 0.39943063259124756, "rewards/rollout_reward_func/std": 1.260680079460144, "sampling/importance_sampling_ratio/max": 0.31222081184387207, "sampling/importance_sampling_ratio/mean": 0.06801389157772064, "sampling/importance_sampling_ratio/min": 9.289229406300378e-14, "sampling/sampling_logp_difference/max": 11.381604194641113, "sampling/sampling_logp_difference/mean": 1.514158010482788, "step": 1325, "step_time": 10.31958977999966 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 7.162359952926636, "epoch": 0.01326, "grad_norm": 0.015500151552259922, "kl": 0.31482497695833445, "learning_rate": 9.999229881426476e-06, "loss": -0.0178, "step": 1326, "step_time": 5.343683100003545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 149.8125, "completions/mean_terminated_length": 149.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.487463593482971, "epoch": 0.01327, "frac_reward_zero_std": 0.25, "grad_norm": 0.013327560387551785, "kl": 0.2775154043920338, "learning_rate": 9.999228687022356e-06, "loss": -0.0023, "num_tokens": 30061354.0, "reward": 0.593902587890625, "reward_std": 1.024120569229126, "rewards/rollout_reward_func/mean": 0.593902587890625, "rewards/rollout_reward_func/std": 1.3696191310882568, "sampling/importance_sampling_ratio/max": 0.5621481537818909, "sampling/importance_sampling_ratio/mean": 0.17054510116577148, "sampling/importance_sampling_ratio/min": 4.213152806187281e-06, "sampling/sampling_logp_difference/max": 4.022408485412598, "sampling/sampling_logp_difference/mean": 1.6169686317443848, "step": 1327, "step_time": 6.908135925001261 }, { "clip_ratio/high_max": 0.046875, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 7.4846895933151245, "epoch": 0.01328, "grad_norm": 0.009280262514948845, "kl": 0.2784002055414021, "learning_rate": 9.999227491692825e-06, "loss": -0.0023, "step": 1328, "step_time": 3.81401999299851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1181.0, "completions/max_terminated_length": 1181.0, "completions/mean_length": 541.1875, "completions/mean_terminated_length": 541.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.506100058555603, "epoch": 0.01329, "frac_reward_zero_std": 0.0, "grad_norm": 0.015641851350665092, "kl": 0.5502849770709872, "learning_rate": 9.999226295437887e-06, "loss": -0.0074, "num_tokens": 30114650.0, "reward": 0.5462354421615601, "reward_std": 1.3871734142303467, "rewards/rollout_reward_func/mean": 0.5462354421615601, "rewards/rollout_reward_func/std": 1.4470868110656738, "sampling/importance_sampling_ratio/max": 0.31067487597465515, "sampling/importance_sampling_ratio/mean": 0.10540604591369629, "sampling/importance_sampling_ratio/min": 1.7782826944312546e-06, "sampling/sampling_logp_difference/max": 2.6230862140655518, "sampling/sampling_logp_difference/mean": 1.2400012016296387, "step": 1329, "step_time": 9.757798745995387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.495138764381409, "epoch": 0.0133, "grad_norm": 0.015487562865018845, "kl": 0.5490897344425321, "learning_rate": 9.99922509825754e-06, "loss": -0.0074, "step": 1330, "step_time": 6.278189057993586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 192.75, "completions/mean_terminated_length": 192.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.651956677436829, "epoch": 0.01331, "frac_reward_zero_std": 0.0, "grad_norm": 0.05940724536776543, "kl": 0.41701131127774715, "learning_rate": 9.999223900151786e-06, "loss": -0.0091, "num_tokens": 30154810.0, "reward": 0.5819399356842041, "reward_std": 1.095302939414978, "rewards/rollout_reward_func/mean": 0.5819399356842041, "rewards/rollout_reward_func/std": 1.5871480703353882, "sampling/importance_sampling_ratio/max": 0.5092005133628845, "sampling/importance_sampling_ratio/mean": 0.09916863590478897, "sampling/importance_sampling_ratio/min": 2.4349408704438247e-05, "sampling/sampling_logp_difference/max": 2.916111469268799, "sampling/sampling_logp_difference/mean": 1.5659165382385254, "step": 1331, "step_time": 8.653989688002184 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 7.562846899032593, "epoch": 0.01332, "grad_norm": 0.034609634429216385, "kl": 0.43319375440478325, "learning_rate": 9.999222701120623e-06, "loss": -0.0092, "step": 1332, "step_time": 4.690999231996102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 249.09375, "completions/mean_terminated_length": 249.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.823534667491913, "epoch": 0.01333, "frac_reward_zero_std": 0.5, "grad_norm": 0.005698696244508028, "kl": 0.687855027616024, "learning_rate": 9.999221501164055e-06, "loss": -0.0109, "num_tokens": 30197174.0, "reward": 1.834851861000061, "reward_std": 0.7664155960083008, "rewards/rollout_reward_func/mean": 1.834851861000061, "rewards/rollout_reward_func/std": 1.0459696054458618, "sampling/importance_sampling_ratio/max": 0.5838515162467957, "sampling/importance_sampling_ratio/mean": 0.3310030400753021, "sampling/importance_sampling_ratio/min": 1.0585453310341109e-05, "sampling/sampling_logp_difference/max": 2.4842686653137207, "sampling/sampling_logp_difference/mean": 0.8980702757835388, "step": 1333, "step_time": 8.03254638600265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.791363954544067, "epoch": 0.01334, "grad_norm": 0.004896933678537607, "kl": 0.6893158536404371, "learning_rate": 9.999220300282077e-06, "loss": -0.0109, "step": 1334, "step_time": 4.589410674998362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1081.0, "completions/max_terminated_length": 1081.0, "completions/mean_length": 211.8125, "completions/mean_terminated_length": 211.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.179692804813385, "epoch": 0.01335, "frac_reward_zero_std": 0.5, "grad_norm": 0.014189891517162323, "kl": 0.6787576302886009, "learning_rate": 9.999219098474695e-06, "loss": -0.0104, "num_tokens": 30236773.0, "reward": 1.4318197965621948, "reward_std": 0.730856716632843, "rewards/rollout_reward_func/mean": 1.4318197965621948, "rewards/rollout_reward_func/std": 1.0729128122329712, "sampling/importance_sampling_ratio/max": 0.5642563700675964, "sampling/importance_sampling_ratio/mean": 0.31349900364875793, "sampling/importance_sampling_ratio/min": 1.4122627362667117e-05, "sampling/sampling_logp_difference/max": 2.4894208908081055, "sampling/sampling_logp_difference/mean": 0.9148309230804443, "step": 1335, "step_time": 8.855665974006115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.169957160949707, "epoch": 0.01336, "grad_norm": 0.01385414320975542, "kl": 0.6804771050810814, "learning_rate": 9.999217895741903e-06, "loss": -0.0104, "step": 1336, "step_time": 5.586341991005611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 1427.0, "completions/max_terminated_length": 1427.0, "completions/mean_length": 355.78125, "completions/mean_terminated_length": 355.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.839349925518036, "epoch": 0.01337, "frac_reward_zero_std": 0.0, "grad_norm": 0.03975924849510193, "kl": 0.313589621335268, "learning_rate": 9.999216692083707e-06, "loss": -0.0117, "num_tokens": 30284354.0, "reward": 0.24326777458190918, "reward_std": 1.4895930290222168, "rewards/rollout_reward_func/mean": 0.24326777458190918, "rewards/rollout_reward_func/std": 1.5006828308105469, "sampling/importance_sampling_ratio/max": 0.5533899664878845, "sampling/importance_sampling_ratio/mean": 0.11684764176607132, "sampling/importance_sampling_ratio/min": 1.498152435441824e-10, "sampling/sampling_logp_difference/max": 9.828606605529785, "sampling/sampling_logp_difference/mean": 1.4819523096084595, "step": 1337, "step_time": 10.005670652997651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.832589685916901, "epoch": 0.01338, "grad_norm": 0.03987662121653557, "kl": 0.31722511537373066, "learning_rate": 9.999215487500103e-06, "loss": -0.0118, "step": 1338, "step_time": 5.726836776997516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 844.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 153.09375, "completions/mean_terminated_length": 153.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.451527893543243, "epoch": 0.01339, "frac_reward_zero_std": 0.5, "grad_norm": 0.019019991159439087, "kl": 0.9706481769680977, "learning_rate": 9.999214281991096e-06, "loss": -0.0117, "num_tokens": 30320225.0, "reward": 1.0906072854995728, "reward_std": 0.7347856760025024, "rewards/rollout_reward_func/mean": 1.0906072854995728, "rewards/rollout_reward_func/std": 1.498895525932312, "sampling/importance_sampling_ratio/max": 0.556906521320343, "sampling/importance_sampling_ratio/mean": 0.340939998626709, "sampling/importance_sampling_ratio/min": 1.5119082404763638e-14, "sampling/sampling_logp_difference/max": 12.031784057617188, "sampling/sampling_logp_difference/mean": 1.0455509424209595, "step": 1339, "step_time": 7.917336759994214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.4568754732608795, "epoch": 0.0134, "grad_norm": 0.01954253949224949, "kl": 0.9735871851444244, "learning_rate": 9.999213075556682e-06, "loss": -0.0117, "step": 1340, "step_time": 4.3576427010011685 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 232.875, "completions/mean_terminated_length": 232.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.165960371494293, "epoch": 0.01341, "frac_reward_zero_std": 0.25, "grad_norm": 0.028111565858125687, "kl": 0.570180881768465, "learning_rate": 9.999211868196863e-06, "loss": -0.0069, "num_tokens": 30362243.0, "reward": 0.35228657722473145, "reward_std": 0.8247955441474915, "rewards/rollout_reward_func/mean": 0.35228657722473145, "rewards/rollout_reward_func/std": 1.3707518577575684, "sampling/importance_sampling_ratio/max": 0.564580500125885, "sampling/importance_sampling_ratio/mean": 0.2624841630458832, "sampling/importance_sampling_ratio/min": 6.126816032292481e-08, "sampling/sampling_logp_difference/max": 4.093819618225098, "sampling/sampling_logp_difference/mean": 1.2038404941558838, "step": 1341, "step_time": 8.940471812002215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 6.172475457191467, "epoch": 0.01342, "grad_norm": 0.030711421743035316, "kl": 0.5680591259151697, "learning_rate": 9.999210659911638e-06, "loss": -0.0069, "step": 1342, "step_time": 5.328330250002182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1108.0, "completions/max_terminated_length": 1108.0, "completions/mean_length": 182.40625, "completions/mean_terminated_length": 187.77418518066406, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.8934586346149445, "epoch": 0.01343, "frac_reward_zero_std": 0.0, "grad_norm": 0.17872364819049835, "kl": 0.5378544330596924, "learning_rate": 9.999209450701009e-06, "loss": -0.0071, "num_tokens": 30400663.0, "reward": 0.6553005576133728, "reward_std": 0.8331590890884399, "rewards/rollout_reward_func/mean": 0.6553005576133728, "rewards/rollout_reward_func/std": 1.308832049369812, "sampling/importance_sampling_ratio/max": 0.5615138411521912, "sampling/importance_sampling_ratio/mean": 0.27203184366226196, "sampling/importance_sampling_ratio/min": 1.2611286526794524e-13, "sampling/sampling_logp_difference/max": 4.4002180099487305, "sampling/sampling_logp_difference/mean": 1.2029352188110352, "step": 1343, "step_time": 8.607363639999676 }, { "clip_ratio/high_max": 0.15625, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 5.684485614299774, "epoch": 0.01344, "grad_norm": 0.07253173738718033, "kl": 0.6838444322347641, "learning_rate": 9.999208240564978e-06, "loss": -0.0074, "step": 1344, "step_time": 4.943036747998121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 345.21875, "completions/mean_terminated_length": 342.51611328125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.417831361293793, "epoch": 0.01345, "frac_reward_zero_std": 0.25, "grad_norm": 0.004554664716124535, "kl": 0.8763157343491912, "learning_rate": 9.999207029503541e-06, "loss": 0.0093, "num_tokens": 30447860.0, "reward": -0.09167065471410751, "reward_std": 0.6125798225402832, "rewards/rollout_reward_func/mean": -0.09167065471410751, "rewards/rollout_reward_func/std": 0.9486311674118042, "sampling/importance_sampling_ratio/max": 0.5536003112792969, "sampling/importance_sampling_ratio/mean": 0.23216237127780914, "sampling/importance_sampling_ratio/min": 2.625240302922549e-10, "sampling/sampling_logp_difference/max": 3.619985818862915, "sampling/sampling_logp_difference/mean": 1.0638450384140015, "step": 1345, "step_time": 8.684863873000722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.42333710193634, "epoch": 0.01346, "grad_norm": 0.004641991574317217, "kl": 0.8773982198908925, "learning_rate": 9.999205817516701e-06, "loss": 0.0093, "step": 1346, "step_time": 4.870364730999427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 155.34375, "completions/mean_terminated_length": 159.32257080078125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.323194146156311, "epoch": 0.01347, "frac_reward_zero_std": 0.25, "grad_norm": 0.024692820385098457, "kl": 0.5177321955561638, "learning_rate": 9.999204604604457e-06, "loss": -0.0047, "num_tokens": 30485148.0, "reward": 0.8537083864212036, "reward_std": 1.2288775444030762, "rewards/rollout_reward_func/mean": 0.8537083864212036, "rewards/rollout_reward_func/std": 1.572927474975586, "sampling/importance_sampling_ratio/max": 0.5592671632766724, "sampling/importance_sampling_ratio/mean": 0.2683679163455963, "sampling/importance_sampling_ratio/min": 2.7155015652476983e-32, "sampling/sampling_logp_difference/max": 10.895648956298828, "sampling/sampling_logp_difference/mean": 1.1875208616256714, "step": 1347, "step_time": 7.768281627995748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.338074803352356, "epoch": 0.01348, "grad_norm": 0.02500295639038086, "kl": 0.5163477845489979, "learning_rate": 9.999203390766811e-06, "loss": -0.0047, "step": 1348, "step_time": 4.129067149999173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1612.0, "completions/max_terminated_length": 1612.0, "completions/mean_length": 438.46875, "completions/mean_terminated_length": 438.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.174711346626282, "epoch": 0.01349, "frac_reward_zero_std": 0.25, "grad_norm": 0.04355859383940697, "kl": 0.4297270909883082, "learning_rate": 9.999202176003763e-06, "loss": -0.01, "num_tokens": 30534390.0, "reward": 0.7899848818778992, "reward_std": 0.8479366302490234, "rewards/rollout_reward_func/mean": 0.7899848818778992, "rewards/rollout_reward_func/std": 1.1763229370117188, "sampling/importance_sampling_ratio/max": 0.5586929321289062, "sampling/importance_sampling_ratio/mean": 0.19285619258880615, "sampling/importance_sampling_ratio/min": 1.9457215005846962e-10, "sampling/sampling_logp_difference/max": 9.12686538696289, "sampling/sampling_logp_difference/mean": 1.3526194095611572, "step": 1349, "step_time": 11.024446510004054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.185661733150482, "epoch": 0.0135, "grad_norm": 0.021418238058686256, "kl": 0.4305468634702265, "learning_rate": 9.999200960315312e-06, "loss": -0.01, "step": 1350, "step_time": 6.1796785190053924 } ], "logging_steps": 1.0, "max_steps": 200000, "num_input_tokens_seen": 30534390, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }