{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.003, "eval_steps": 500, "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1775.0, "completions/max_terminated_length": 1775.0, "completions/mean_length": 1624.96875, "completions/mean_terminated_length": 1624.96875, "completions/min_length": 1388.0, "completions/min_terminated_length": 1388.0, "entropy": 0.5600852482020855, "epoch": 2e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.3755558729171753, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0592, "num_tokens": 73247.0, "reward": -12.172411918640137, "reward_std": 7.601527214050293, "rewards/rollout_reward_func/mean": -12.172411918640137, "rewards/rollout_reward_func/std": 10.38169002532959, "sampling/importance_sampling_ratio/max": 1.408553123474121, "sampling/importance_sampling_ratio/mean": 0.9712058901786804, "sampling/importance_sampling_ratio/min": 0.6454448103904724, "sampling/sampling_logp_difference/max": 0.22739958763122559, "sampling/sampling_logp_difference/mean": 0.016150973737239838, "step": 1, "step_time": 36.755565460999605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5600852482020855, "epoch": 4e-05, "grad_norm": 1.3615893125534058, "kl": 0.0, "learning_rate": 2.8571428571428575e-07, "loss": -0.0592, "step": 2, "step_time": 5.746241367000948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1753.0, "completions/max_terminated_length": 1753.0, "completions/mean_length": 1628.96875, "completions/mean_terminated_length": 1628.96875, "completions/min_length": 1271.0, "completions/min_terminated_length": 1271.0, "entropy": 0.5380602143704891, "epoch": 6e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.4941445589065552, "kl": 0.0005305010126903653, "learning_rate": 5.714285714285715e-07, "loss": 0.006, "num_tokens": 146725.0, "reward": -8.265422821044922, "reward_std": 8.979022026062012, "rewards/rollout_reward_func/mean": -8.265422821044922, "rewards/rollout_reward_func/std": 13.061026573181152, "sampling/importance_sampling_ratio/max": 1.2190126180648804, "sampling/importance_sampling_ratio/mean": 0.9876266121864319, "sampling/importance_sampling_ratio/min": 0.5881595015525818, "sampling/sampling_logp_difference/max": 0.45802879333496094, "sampling/sampling_logp_difference/mean": 0.014619816094636917, "step": 3, "step_time": 36.527911828999095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5377197824418545, "epoch": 8e-05, "grad_norm": 1.4071228504180908, "kl": 0.0005172143501113169, "learning_rate": 8.571428571428572e-07, "loss": 0.0058, "step": 4, "step_time": 5.69982043300206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1754.0, "completions/max_terminated_length": 1754.0, "completions/mean_length": 1595.15625, "completions/mean_terminated_length": 1595.15625, "completions/min_length": 1299.0, "completions/min_terminated_length": 1299.0, "entropy": 0.539891816675663, "epoch": 0.0001, "frac_reward_zero_std": 0.0, "grad_norm": 1.4629162549972534, "kl": 0.0005519717960851267, "learning_rate": 1.142857142857143e-06, "loss": 0.0243, "num_tokens": 219002.0, "reward": -14.256836891174316, "reward_std": 9.0944185256958, "rewards/rollout_reward_func/mean": -14.256836891174316, "rewards/rollout_reward_func/std": 12.482532501220703, "sampling/importance_sampling_ratio/max": 1.6900306940078735, "sampling/importance_sampling_ratio/mean": 1.0195035934448242, "sampling/importance_sampling_ratio/min": 0.8020860552787781, "sampling/sampling_logp_difference/max": 0.25893688201904297, "sampling/sampling_logp_difference/mean": 0.016118371859192848, "step": 5, "step_time": 38.65034878200095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5395300313830376, "epoch": 0.00012, "grad_norm": 1.5035072565078735, "kl": 0.0006097570294514298, "learning_rate": 1.4285714285714286e-06, "loss": 0.0242, "step": 6, "step_time": 5.672908046001794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.0, "completions/max_length": 1782.0, "completions/max_terminated_length": 1782.0, "completions/mean_length": 1521.90625, "completions/mean_terminated_length": 1521.90625, "completions/min_length": 723.0, "completions/min_terminated_length": 723.0, "entropy": 0.5277910158038139, "epoch": 0.00014, "frac_reward_zero_std": 0.0, "grad_norm": 1.2564315795898438, "kl": 0.0006845891803095583, "learning_rate": 1.7142857142857145e-06, "loss": 0.0223, "num_tokens": 288633.0, "reward": -16.092445373535156, "reward_std": 8.75448989868164, "rewards/rollout_reward_func/mean": -16.092445373535156, "rewards/rollout_reward_func/std": 15.618288040161133, "sampling/importance_sampling_ratio/max": 1.5459660291671753, "sampling/importance_sampling_ratio/mean": 1.024022102355957, "sampling/importance_sampling_ratio/min": 0.7249171733856201, "sampling/sampling_logp_difference/max": 0.29637718200683594, "sampling/sampling_logp_difference/mean": 0.018320683389902115, "step": 7, "step_time": 32.748348556002384 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 0.5283261835575104, "epoch": 0.00016, "grad_norm": 1.2439873218536377, "kl": 0.0006405085659935139, "learning_rate": 2.0000000000000003e-06, "loss": 0.0223, "step": 8, "step_time": 5.7717726080009015 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1818.0, "completions/max_terminated_length": 1818.0, "completions/mean_length": 1608.375, "completions/mean_terminated_length": 1608.375, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "entropy": 0.5326173529028893, "epoch": 0.00018, "frac_reward_zero_std": 0.0, "grad_norm": 0.9701208472251892, "kl": 0.0007356673304457217, "learning_rate": 2.285714285714286e-06, "loss": 0.012, "num_tokens": 361374.0, "reward": -6.8430585861206055, "reward_std": 12.837440490722656, "rewards/rollout_reward_func/mean": -6.8430585861206055, "rewards/rollout_reward_func/std": 17.0405216217041, "sampling/importance_sampling_ratio/max": 1.2777214050292969, "sampling/importance_sampling_ratio/mean": 0.9900251626968384, "sampling/importance_sampling_ratio/min": 0.6748403310775757, "sampling/sampling_logp_difference/max": 0.3269679546356201, "sampling/sampling_logp_difference/mean": 0.0145448949187994, "step": 9, "step_time": 34.28418227700058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5334620848298073, "epoch": 0.0002, "grad_norm": 1.0108616352081299, "kl": 0.0005742738721892238, "learning_rate": 2.571428571428571e-06, "loss": 0.012, "step": 10, "step_time": 6.918311860000358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1797.0, "completions/max_terminated_length": 1797.0, "completions/mean_length": 1687.25, "completions/mean_terminated_length": 1687.25, "completions/min_length": 1423.0, "completions/min_terminated_length": 1423.0, "entropy": 0.5696082189679146, "epoch": 0.00022, "frac_reward_zero_std": 0.0, "grad_norm": 1.1935234069824219, "kl": 0.0007269367633853108, "learning_rate": 2.8571428571428573e-06, "loss": -0.0013, "num_tokens": 436067.0, "reward": -9.953402519226074, "reward_std": 9.885331153869629, "rewards/rollout_reward_func/mean": -9.953402519226074, "rewards/rollout_reward_func/std": 11.941234588623047, "sampling/importance_sampling_ratio/max": 1.3005088567733765, "sampling/importance_sampling_ratio/mean": 0.9863357543945312, "sampling/importance_sampling_ratio/min": 0.7671698927879333, "sampling/sampling_logp_difference/max": 0.1938610076904297, "sampling/sampling_logp_difference/mean": 0.016408588737249374, "step": 11, "step_time": 36.470574425999075 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 0.5690113827586174, "epoch": 0.00024, "grad_norm": 1.1512264013290405, "kl": 0.0009323725680587813, "learning_rate": 3.142857142857143e-06, "loss": -0.0002, "step": 12, "step_time": 5.875566556000194 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1832.0, "completions/max_terminated_length": 1832.0, "completions/mean_length": 1571.625, "completions/mean_terminated_length": 1571.625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.5204437598586082, "epoch": 0.00026, "frac_reward_zero_std": 0.0, "grad_norm": 1.1478338241577148, "kl": 0.0008145252213580534, "learning_rate": 3.428571428571429e-06, "loss": 0.0344, "num_tokens": 507640.0, "reward": -5.592676162719727, "reward_std": 11.350366592407227, "rewards/rollout_reward_func/mean": -5.592676162719727, "rewards/rollout_reward_func/std": 16.42201805114746, "sampling/importance_sampling_ratio/max": 1.401992917060852, "sampling/importance_sampling_ratio/mean": 1.043225884437561, "sampling/importance_sampling_ratio/min": 0.7300771474838257, "sampling/sampling_logp_difference/max": 0.24753212928771973, "sampling/sampling_logp_difference/mean": 0.016798537224531174, "step": 13, "step_time": 34.95688835600049 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.5211725942790508, "epoch": 0.00028, "grad_norm": 1.1339523792266846, "kl": 0.001084248440747615, "learning_rate": 3.7142857142857146e-06, "loss": 0.0358, "step": 14, "step_time": 5.896424438000395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1777.0, "completions/max_terminated_length": 1777.0, "completions/mean_length": 1681.0625, "completions/mean_terminated_length": 1681.0625, "completions/min_length": 1441.0, "completions/min_terminated_length": 1441.0, "entropy": 0.5041001103818417, "epoch": 0.0003, "frac_reward_zero_std": 0.0, "grad_norm": 1.4473488330841064, "kl": 0.0014922630070941523, "learning_rate": 4.000000000000001e-06, "loss": -0.0171, "num_tokens": 582735.0, "reward": -6.595297336578369, "reward_std": 7.1833062171936035, "rewards/rollout_reward_func/mean": -6.595297336578369, "rewards/rollout_reward_func/std": 9.555194854736328, "sampling/importance_sampling_ratio/max": 1.368825078010559, "sampling/importance_sampling_ratio/mean": 0.9549809098243713, "sampling/importance_sampling_ratio/min": 0.7357600331306458, "sampling/sampling_logp_difference/max": 0.22634148597717285, "sampling/sampling_logp_difference/mean": 0.015938639640808105, "step": 15, "step_time": 36.23906176099899 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 0.5047293417155743, "epoch": 0.00032, "grad_norm": 1.4626593589782715, "kl": 0.001966523894225247, "learning_rate": 4.2857142857142855e-06, "loss": -0.0206, "step": 16, "step_time": 7.021587903999716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1747.0, "completions/max_terminated_length": 1747.0, "completions/mean_length": 1580.9375, "completions/mean_terminated_length": 1580.9375, "completions/min_length": 903.0, "completions/min_terminated_length": 903.0, "entropy": 0.5249488092958927, "epoch": 0.00034, "frac_reward_zero_std": 0.0, "grad_norm": 1.4577637910842896, "kl": 0.003107522992650047, "learning_rate": 4.571428571428572e-06, "loss": -0.062, "num_tokens": 655209.0, "reward": -11.51949691772461, "reward_std": 5.783572673797607, "rewards/rollout_reward_func/mean": -11.51949691772461, "rewards/rollout_reward_func/std": 6.822617530822754, "sampling/importance_sampling_ratio/max": 1.3054317235946655, "sampling/importance_sampling_ratio/mean": 0.9661756753921509, "sampling/importance_sampling_ratio/min": 0.7799487709999084, "sampling/sampling_logp_difference/max": 0.15545654296875, "sampling/sampling_logp_difference/mean": 0.014764709398150444, "step": 17, "step_time": 35.02218110599915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5238873660564423, "epoch": 0.00036, "grad_norm": 1.4641653299331665, "kl": 0.0047977561771404, "learning_rate": 4.857142857142858e-06, "loss": -0.0645, "step": 18, "step_time": 5.746608606000336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1798.0, "completions/max_terminated_length": 1798.0, "completions/mean_length": 1634.25, "completions/mean_terminated_length": 1634.25, "completions/min_length": 1522.0, "completions/min_terminated_length": 1522.0, "entropy": 0.5075966455042362, "epoch": 0.00038, "frac_reward_zero_std": 0.0, "grad_norm": 1.6138638257980347, "kl": 0.007999939436558634, "learning_rate": 5.142857142857142e-06, "loss": -0.0053, "num_tokens": 728702.0, "reward": -7.502280235290527, "reward_std": 9.169681549072266, "rewards/rollout_reward_func/mean": -7.502280235290527, "rewards/rollout_reward_func/std": 9.848286628723145, "sampling/importance_sampling_ratio/max": 1.388090968132019, "sampling/importance_sampling_ratio/mean": 1.0349256992340088, "sampling/importance_sampling_ratio/min": 0.6338706612586975, "sampling/sampling_logp_difference/max": 0.2868894338607788, "sampling/sampling_logp_difference/mean": 0.021757658571004868, "step": 19, "step_time": 37.415181820002545 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.005800189450383186, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011659564450383186, "entropy": 0.5063299536705017, "epoch": 0.0004, "grad_norm": 1.244437336921692, "kl": 0.01308579370379448, "learning_rate": 5.428571428571429e-06, "loss": -0.0075, "step": 20, "step_time": 5.793050424000285 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1807.0, "completions/max_terminated_length": 1807.0, "completions/mean_length": 1655.15625, "completions/mean_terminated_length": 1655.15625, "completions/min_length": 1502.0, "completions/min_terminated_length": 1502.0, "entropy": 0.5319979190826416, "epoch": 0.00042, "frac_reward_zero_std": 0.0, "grad_norm": 1.1382571458816528, "kl": 0.014530768617987633, "learning_rate": 5.7142857142857145e-06, "loss": 0.0206, "num_tokens": 803086.0, "reward": 0.2018265724182129, "reward_std": 8.02773666381836, "rewards/rollout_reward_func/mean": 0.2018265724182129, "rewards/rollout_reward_func/std": 10.535411834716797, "sampling/importance_sampling_ratio/max": 1.9407941102981567, "sampling/importance_sampling_ratio/mean": 1.0456597805023193, "sampling/importance_sampling_ratio/min": 0.5120582580566406, "sampling/sampling_logp_difference/max": 0.3853168487548828, "sampling/sampling_logp_difference/mean": 0.033494722098112106, "step": 21, "step_time": 38.7354500320007 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.007753314450383186, "clip_ratio/low_mean": 0.0014880952658131719, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009241409716196358, "entropy": 0.5304564274847507, "epoch": 0.00044, "grad_norm": 1.0724774599075317, "kl": 0.02271496201865375, "learning_rate": 6e-06, "loss": 0.0207, "step": 22, "step_time": 5.822538338000413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1794.0, "completions/max_terminated_length": 1794.0, "completions/mean_length": 1641.09375, "completions/mean_terminated_length": 1641.09375, "completions/min_length": 1125.0, "completions/min_terminated_length": 1125.0, "entropy": 0.5019906461238861, "epoch": 0.00046, "frac_reward_zero_std": 0.0, "grad_norm": 1.1990811824798584, "kl": 0.03286108747124672, "learning_rate": 6.285714285714286e-06, "loss": -0.042, "num_tokens": 877020.0, "reward": -8.106513977050781, "reward_std": 8.252906799316406, "rewards/rollout_reward_func/mean": -8.106513977050781, "rewards/rollout_reward_func/std": 9.194578170776367, "sampling/importance_sampling_ratio/max": 1.5264556407928467, "sampling/importance_sampling_ratio/mean": 0.9783341884613037, "sampling/importance_sampling_ratio/min": 0.4424620270729065, "sampling/sampling_logp_difference/max": 0.4774820804595947, "sampling/sampling_logp_difference/mean": 0.046494003385305405, "step": 23, "step_time": 36.808606273000805 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.496415089815855, "epoch": 0.00048, "grad_norm": 1.1097930669784546, "kl": 0.04643937526270747, "learning_rate": 6.571428571428572e-06, "loss": -0.0443, "step": 24, "step_time": 5.7939676890000555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1778.0, "completions/max_terminated_length": 1778.0, "completions/mean_length": 1637.375, "completions/mean_terminated_length": 1637.375, "completions/min_length": 1411.0, "completions/min_terminated_length": 1411.0, "entropy": 0.49401185661554337, "epoch": 0.0005, "frac_reward_zero_std": 0.0, "grad_norm": 1.5811240673065186, "kl": 0.08443848416209221, "learning_rate": 6.857142857142858e-06, "loss": -0.2108, "num_tokens": 950862.0, "reward": -4.675426006317139, "reward_std": 7.909944534301758, "rewards/rollout_reward_func/mean": -4.675426006317139, "rewards/rollout_reward_func/std": 9.238933563232422, "sampling/importance_sampling_ratio/max": 2.070371150970459, "sampling/importance_sampling_ratio/mean": 0.9819083213806152, "sampling/importance_sampling_ratio/min": 0.25233596563339233, "sampling/sampling_logp_difference/max": 0.7732794284820557, "sampling/sampling_logp_difference/mean": 0.06470471620559692, "step": 25, "step_time": 38.14812202200119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009548611124046147, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009548611124046147, "entropy": 0.4837190806865692, "epoch": 0.00052, "grad_norm": 1.3750770092010498, "kl": 0.11863584071397781, "learning_rate": 7.1428571428571436e-06, "loss": -0.2169, "step": 26, "step_time": 5.773092350001207 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1693.0, "completions/max_terminated_length": 1693.0, "completions/mean_length": 1613.34375, "completions/mean_terminated_length": 1613.34375, "completions/min_length": 1434.0, "completions/min_terminated_length": 1434.0, "entropy": 0.45354875922203064, "epoch": 0.00054, "frac_reward_zero_std": 0.0, "grad_norm": 1.5507698059082031, "kl": 0.1627941089682281, "learning_rate": 7.428571428571429e-06, "loss": -0.1993, "num_tokens": 1023781.0, "reward": -7.355703353881836, "reward_std": 10.72867202758789, "rewards/rollout_reward_func/mean": -7.355703353881836, "rewards/rollout_reward_func/std": 12.46450138092041, "sampling/importance_sampling_ratio/max": 2.511967420578003, "sampling/importance_sampling_ratio/mean": 1.0184850692749023, "sampling/importance_sampling_ratio/min": 0.14674033224582672, "sampling/sampling_logp_difference/max": 1.2143032550811768, "sampling/sampling_logp_difference/mean": 0.07817384600639343, "step": 27, "step_time": 38.66307055499874 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.025390625, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.029296875, "entropy": 0.437204722315073, "epoch": 0.00056, "grad_norm": 1.2980964183807373, "kl": 0.25656731706112623, "learning_rate": 7.714285714285716e-06, "loss": -0.2037, "step": 28, "step_time": 5.5929295870000715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1793.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 1653.5, "completions/mean_terminated_length": 1653.5, "completions/min_length": 1433.0, "completions/min_terminated_length": 1433.0, "entropy": 0.40794313699007034, "epoch": 0.00058, "frac_reward_zero_std": 0.0, "grad_norm": 1.0819990634918213, "kl": 0.5643582288175821, "learning_rate": 8.000000000000001e-06, "loss": -0.2202, "num_tokens": 1098013.0, "reward": -6.507538318634033, "reward_std": 11.262900352478027, "rewards/rollout_reward_func/mean": -6.507538318634033, "rewards/rollout_reward_func/std": 15.167143821716309, "sampling/importance_sampling_ratio/max": 2.3346853256225586, "sampling/importance_sampling_ratio/mean": 0.7045407295227051, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.8893389701843262, "sampling/sampling_logp_difference/mean": 0.11059033870697021, "step": 29, "step_time": 36.8627199120001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.042909564450383186, "clip_ratio/low_min": 0.019412878900766373, "clip_ratio/region_mean": 0.042909564450383186, "entropy": 0.3947901092469692, "epoch": 0.0006, "grad_norm": 1.311099648475647, "kl": 0.829475361853838, "learning_rate": 8.285714285714287e-06, "loss": -0.2219, "step": 30, "step_time": 5.809630234999531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1791.0, "completions/max_terminated_length": 1791.0, "completions/mean_length": 1661.65625, "completions/mean_terminated_length": 1661.65625, "completions/min_length": 1507.0, "completions/min_terminated_length": 1507.0, "entropy": 0.3842233642935753, "epoch": 0.00062, "frac_reward_zero_std": 0.0, "grad_norm": 0.9492745995521545, "kl": 0.836035018786788, "learning_rate": 8.571428571428571e-06, "loss": -0.1847, "num_tokens": 1172333.0, "reward": -7.43505859375, "reward_std": 10.108196258544922, "rewards/rollout_reward_func/mean": -7.43505859375, "rewards/rollout_reward_func/std": 12.447552680969238, "sampling/importance_sampling_ratio/max": 1.9039174318313599, "sampling/importance_sampling_ratio/mean": 0.7630480527877808, "sampling/importance_sampling_ratio/min": 0.03481662645936012, "sampling/sampling_logp_difference/max": 2.102973699569702, "sampling/sampling_logp_difference/mean": 0.10207939893007278, "step": 31, "step_time": 37.01448380799866 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0078125, "clip_ratio/region_mean": 0.017578125, "entropy": 0.3787720203399658, "epoch": 0.00064, "grad_norm": 0.9088082313537598, "kl": 0.954752204939723, "learning_rate": 8.857142857142858e-06, "loss": -0.1848, "step": 32, "step_time": 6.941179774000375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1777.0, "completions/max_terminated_length": 1777.0, "completions/mean_length": 1584.28125, "completions/mean_terminated_length": 1584.28125, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "entropy": 0.34466781467199326, "epoch": 0.00066, "frac_reward_zero_std": 0.0, "grad_norm": 1.3309731483459473, "kl": 0.8169562551192939, "learning_rate": 9.142857142857144e-06, "loss": -0.0371, "num_tokens": 1244354.0, "reward": -5.68795108795166, "reward_std": 6.089047431945801, "rewards/rollout_reward_func/mean": -5.68795108795166, "rewards/rollout_reward_func/std": 7.458269119262695, "sampling/importance_sampling_ratio/max": 1.7896546125411987, "sampling/importance_sampling_ratio/mean": 0.8577574491500854, "sampling/importance_sampling_ratio/min": 0.03787967935204506, "sampling/sampling_logp_difference/max": 2.316878080368042, "sampling/sampling_logp_difference/mean": 0.08233191072940826, "step": 33, "step_time": 35.641360890001124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009765625, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.009765625, "entropy": 0.3404731787741184, "epoch": 0.00068, "grad_norm": 1.2554371356964111, "kl": 0.8635595235973597, "learning_rate": 9.42857142857143e-06, "loss": -0.0392, "step": 34, "step_time": 5.800049977002345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1793.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 1645.40625, "completions/mean_terminated_length": 1645.40625, "completions/min_length": 1436.0, "completions/min_terminated_length": 1436.0, "entropy": 0.34361691400408745, "epoch": 0.0007, "frac_reward_zero_std": 0.0, "grad_norm": 0.8878010511398315, "kl": 0.6783301420509815, "learning_rate": 9.714285714285715e-06, "loss": -0.0546, "num_tokens": 1318828.0, "reward": -6.509824752807617, "reward_std": 5.78992223739624, "rewards/rollout_reward_func/mean": -6.509824752807617, "rewards/rollout_reward_func/std": 7.799504280090332, "sampling/importance_sampling_ratio/max": 2.6033241748809814, "sampling/importance_sampling_ratio/mean": 0.7217234373092651, "sampling/importance_sampling_ratio/min": 0.03342561423778534, "sampling/sampling_logp_difference/max": 1.9652609825134277, "sampling/sampling_logp_difference/mean": 0.10379400849342346, "step": 35, "step_time": 38.887631579999834 }, { "clip_ratio/high_max": 0.01953125, "clip_ratio/high_mean": 0.009765625, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013671875, "entropy": 0.34295540675520897, "epoch": 0.00072, "grad_norm": 0.8096361756324768, "kl": 0.608342956751585, "learning_rate": 1e-05, "loss": -0.0557, "step": 36, "step_time": 5.798652657001185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1830.0, "completions/max_terminated_length": 1830.0, "completions/mean_length": 1669.1875, "completions/mean_terminated_length": 1669.1875, "completions/min_length": 1051.0, "completions/min_terminated_length": 1051.0, "entropy": 0.40553563460707664, "epoch": 0.00074, "frac_reward_zero_std": 0.0, "grad_norm": 1.1997655630111694, "kl": 0.6256659794598818, "learning_rate": 9.999999998148153e-06, "loss": -0.1523, "num_tokens": 1393543.0, "reward": -6.479705333709717, "reward_std": 5.127873420715332, "rewards/rollout_reward_func/mean": -6.479705333709717, "rewards/rollout_reward_func/std": 5.757119655609131, "sampling/importance_sampling_ratio/max": 2.8293187618255615, "sampling/importance_sampling_ratio/mean": 0.8761348724365234, "sampling/importance_sampling_ratio/min": 0.04236992821097374, "sampling/sampling_logp_difference/max": 1.9487248659133911, "sampling/sampling_logp_difference/mean": 0.10332974791526794, "step": 37, "step_time": 37.80114303599839 }, { "clip_ratio/high_max": 0.022248641354963183, "clip_ratio/high_mean": 0.011124320677481592, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011124320677481592, "entropy": 0.4083840139210224, "epoch": 0.00076, "grad_norm": 1.0707274675369263, "kl": 0.46542409248650074, "learning_rate": 9.999999992592613e-06, "loss": -0.1538, "step": 38, "step_time": 6.32874613399963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1798.0, "completions/max_terminated_length": 1798.0, "completions/mean_length": 1652.875, "completions/mean_terminated_length": 1652.875, "completions/min_length": 1438.0, "completions/min_terminated_length": 1438.0, "entropy": 0.3374646417796612, "epoch": 0.00078, "frac_reward_zero_std": 0.0, "grad_norm": 1.204039216041565, "kl": 0.31642488297075033, "learning_rate": 9.999999983333379e-06, "loss": -0.2975, "num_tokens": 1467743.0, "reward": -4.7108917236328125, "reward_std": 5.350179672241211, "rewards/rollout_reward_func/mean": -4.7108917236328125, "rewards/rollout_reward_func/std": 5.910353660583496, "sampling/importance_sampling_ratio/max": 2.4041433334350586, "sampling/importance_sampling_ratio/mean": 0.9955360889434814, "sampling/importance_sampling_ratio/min": 0.0774984359741211, "sampling/sampling_logp_difference/max": 1.3263565301895142, "sampling/sampling_logp_difference/mean": 0.08658263087272644, "step": 39, "step_time": 38.75331890299822 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.3382052704691887, "epoch": 0.0008, "grad_norm": 1.1924562454223633, "kl": 0.293441329151392, "learning_rate": 9.999999970370451e-06, "loss": -0.2999, "step": 40, "step_time": 5.813114761998804 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1829.0, "completions/max_terminated_length": 1829.0, "completions/mean_length": 1647.59375, "completions/mean_terminated_length": 1647.59375, "completions/min_length": 871.0, "completions/min_terminated_length": 871.0, "entropy": 0.3876206576824188, "epoch": 0.00082, "frac_reward_zero_std": 0.0, "grad_norm": 1.0214810371398926, "kl": 0.42729487270116806, "learning_rate": 9.99999995370383e-06, "loss": 0.0543, "num_tokens": 1542142.0, "reward": -3.3205041885375977, "reward_std": 4.228387832641602, "rewards/rollout_reward_func/mean": -3.3205041885375977, "rewards/rollout_reward_func/std": 8.351001739501953, "sampling/importance_sampling_ratio/max": 2.9104766845703125, "sampling/importance_sampling_ratio/mean": 0.9624049067497253, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.469689130783081, "sampling/sampling_logp_difference/mean": 0.09306588023900986, "step": 41, "step_time": 36.42707723900003 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 0.38661571592092514, "epoch": 0.00084, "grad_norm": 1.0359998941421509, "kl": 0.41935206204652786, "learning_rate": 9.999999933333514e-06, "loss": 0.0525, "step": 42, "step_time": 5.882109656998182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1772.0, "completions/max_terminated_length": 1772.0, "completions/mean_length": 1624.15625, "completions/mean_terminated_length": 1624.15625, "completions/min_length": 879.0, "completions/min_terminated_length": 879.0, "entropy": 0.3577045015990734, "epoch": 0.00086, "frac_reward_zero_std": 0.0, "grad_norm": 1.2550678253173828, "kl": 0.32372760958969593, "learning_rate": 9.999999909259504e-06, "loss": -0.0496, "num_tokens": 1615506.0, "reward": -5.2150774002075195, "reward_std": 6.197805404663086, "rewards/rollout_reward_func/mean": -5.2150774002075195, "rewards/rollout_reward_func/std": 8.147597312927246, "sampling/importance_sampling_ratio/max": 2.349579095840454, "sampling/importance_sampling_ratio/mean": 0.946481466293335, "sampling/importance_sampling_ratio/min": 0.06307531893253326, "sampling/sampling_logp_difference/max": 1.156632423400879, "sampling/sampling_logp_difference/mean": 0.07861532270908356, "step": 43, "step_time": 37.13630858600118 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.3559652045369148, "epoch": 0.00088, "grad_norm": 1.0217231512069702, "kl": 0.3223106600344181, "learning_rate": 9.9999998814818e-06, "loss": -0.0515, "step": 44, "step_time": 6.2407338969997 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1814.0, "completions/max_terminated_length": 1814.0, "completions/mean_length": 1681.125, "completions/mean_terminated_length": 1681.125, "completions/min_length": 1242.0, "completions/min_terminated_length": 1242.0, "entropy": 0.3759612925350666, "epoch": 0.0009, "frac_reward_zero_std": 0.0, "grad_norm": 1.528295636177063, "kl": 0.4129584180191159, "learning_rate": 9.999999850000403e-06, "loss": -0.0326, "num_tokens": 1690335.0, "reward": -2.4644203186035156, "reward_std": 10.457454681396484, "rewards/rollout_reward_func/mean": -2.4644203186035156, "rewards/rollout_reward_func/std": 15.407123565673828, "sampling/importance_sampling_ratio/max": 2.56207537651062, "sampling/importance_sampling_ratio/mean": 0.8990581035614014, "sampling/importance_sampling_ratio/min": 0.13844478130340576, "sampling/sampling_logp_difference/max": 1.2251713275909424, "sampling/sampling_logp_difference/mean": 0.07088702917098999, "step": 45, "step_time": 36.063344202000735 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.37699316069483757, "epoch": 0.00092, "grad_norm": 1.5766605138778687, "kl": 0.37809659354388714, "learning_rate": 9.999999814815314e-06, "loss": -0.0346, "step": 46, "step_time": 5.878628188998846 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1796.0, "completions/max_terminated_length": 1796.0, "completions/mean_length": 1684.96875, "completions/mean_terminated_length": 1684.96875, "completions/min_length": 1413.0, "completions/min_terminated_length": 1413.0, "entropy": 0.35037482157349586, "epoch": 0.00094, "frac_reward_zero_std": 0.0, "grad_norm": 0.7219093441963196, "kl": 0.3052864633500576, "learning_rate": 9.99999977592653e-06, "loss": -0.1454, "num_tokens": 1765997.0, "reward": -5.9493207931518555, "reward_std": 6.758513450622559, "rewards/rollout_reward_func/mean": -5.9493207931518555, "rewards/rollout_reward_func/std": 7.776234149932861, "sampling/importance_sampling_ratio/max": 2.8358778953552246, "sampling/importance_sampling_ratio/mean": 0.7998743653297424, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1099776029586792, "sampling/sampling_logp_difference/mean": 0.0953160896897316, "step": 47, "step_time": 37.005451356000776 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.34440867975354195, "epoch": 0.00096, "grad_norm": 0.6548582911491394, "kl": 0.3214886896312237, "learning_rate": 9.999999733334051e-06, "loss": -0.1452, "step": 48, "step_time": 5.856181479999577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1795.0, "completions/max_terminated_length": 1795.0, "completions/mean_length": 1652.40625, "completions/mean_terminated_length": 1652.40625, "completions/min_length": 1492.0, "completions/min_terminated_length": 1492.0, "entropy": 0.3001542203128338, "epoch": 0.00098, "frac_reward_zero_std": 0.0, "grad_norm": 1.1067109107971191, "kl": 0.20680313045158982, "learning_rate": 9.99999968703788e-06, "loss": -0.1033, "num_tokens": 1839890.0, "reward": -3.3683390617370605, "reward_std": 3.6531944274902344, "rewards/rollout_reward_func/mean": -3.3683390617370605, "rewards/rollout_reward_func/std": 7.470107078552246, "sampling/importance_sampling_ratio/max": 2.1421477794647217, "sampling/importance_sampling_ratio/mean": 1.1610007286071777, "sampling/importance_sampling_ratio/min": 0.263545960187912, "sampling/sampling_logp_difference/max": 0.9924228191375732, "sampling/sampling_logp_difference/mean": 0.05461367964744568, "step": 49, "step_time": 38.469305269999495 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.29572881013154984, "epoch": 0.001, "grad_norm": 0.9047728180885315, "kl": 0.22563101211562753, "learning_rate": 9.999999637038016e-06, "loss": -0.1081, "step": 50, "step_time": 5.818200681000235 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1796.0, "completions/max_terminated_length": 1796.0, "completions/mean_length": 1690.875, "completions/mean_terminated_length": 1690.875, "completions/min_length": 1463.0, "completions/min_terminated_length": 1463.0, "entropy": 0.31936580687761307, "epoch": 0.00102, "frac_reward_zero_std": 0.0, "grad_norm": 0.7223824262619019, "kl": 0.3224958088248968, "learning_rate": 9.999999583334458e-06, "loss": -0.1377, "num_tokens": 1915271.0, "reward": -3.352957248687744, "reward_std": 4.21926212310791, "rewards/rollout_reward_func/mean": -3.352957248687744, "rewards/rollout_reward_func/std": 6.320347309112549, "sampling/importance_sampling_ratio/max": 2.287932872772217, "sampling/importance_sampling_ratio/mean": 0.8120144605636597, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2639083862304688, "sampling/sampling_logp_difference/mean": 0.07481236755847931, "step": 51, "step_time": 37.81962620299964 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.3146309144794941, "epoch": 0.00104, "grad_norm": 0.7475705742835999, "kl": 0.33450872637331486, "learning_rate": 9.999999525927207e-06, "loss": -0.1398, "step": 52, "step_time": 5.822449180000149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1839.0, "completions/max_terminated_length": 1839.0, "completions/mean_length": 1702.75, "completions/mean_terminated_length": 1702.75, "completions/min_length": 1476.0, "completions/min_terminated_length": 1476.0, "entropy": 0.322977501899004, "epoch": 0.00106, "frac_reward_zero_std": 0.0, "grad_norm": 1.198279619216919, "kl": 0.2917388379573822, "learning_rate": 9.999999464816262e-06, "loss": -0.1064, "num_tokens": 1991060.0, "reward": -1.9325592517852783, "reward_std": 4.967609882354736, "rewards/rollout_reward_func/mean": -1.9325592517852783, "rewards/rollout_reward_func/std": 5.8073954582214355, "sampling/importance_sampling_ratio/max": 2.789353847503662, "sampling/importance_sampling_ratio/mean": 1.1503762006759644, "sampling/importance_sampling_ratio/min": 0.1404324173927307, "sampling/sampling_logp_difference/max": 1.199690580368042, "sampling/sampling_logp_difference/mean": 0.07725630700588226, "step": 53, "step_time": 36.42649295499905 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.31714847683906555, "epoch": 0.00108, "grad_norm": 1.1143569946289062, "kl": 0.3153774570673704, "learning_rate": 9.999999400001624e-06, "loss": -0.1103, "step": 54, "step_time": 6.59381660300005 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1815.0, "completions/max_terminated_length": 1815.0, "completions/mean_length": 1677.09375, "completions/mean_terminated_length": 1677.09375, "completions/min_length": 1520.0, "completions/min_terminated_length": 1520.0, "entropy": 0.2675260417163372, "epoch": 0.0011, "frac_reward_zero_std": 0.125, "grad_norm": 0.6082053780555725, "kl": 0.35145391430705786, "learning_rate": 9.999999331483293e-06, "loss": -0.0145, "num_tokens": 2065830.0, "reward": -7.571907997131348, "reward_std": 7.06196928024292, "rewards/rollout_reward_func/mean": -7.571907997131348, "rewards/rollout_reward_func/std": 10.322997093200684, "sampling/importance_sampling_ratio/max": 2.288501024246216, "sampling/importance_sampling_ratio/mean": 0.8737363815307617, "sampling/importance_sampling_ratio/min": 0.11803531646728516, "sampling/sampling_logp_difference/max": 1.27490234375, "sampling/sampling_logp_difference/mean": 0.07949512451887131, "step": 55, "step_time": 37.09443227400061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.00390625, "entropy": 0.26163551956415176, "epoch": 0.00112, "grad_norm": 0.5967329144477844, "kl": 0.3754094559699297, "learning_rate": 9.999999259261269e-06, "loss": -0.0158, "step": 56, "step_time": 5.860317817000578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1822.0, "completions/max_terminated_length": 1822.0, "completions/mean_length": 1653.90625, "completions/mean_terminated_length": 1653.90625, "completions/min_length": 1317.0, "completions/min_terminated_length": 1317.0, "entropy": 0.32117627188563347, "epoch": 0.00114, "frac_reward_zero_std": 0.0, "grad_norm": 0.9490989446640015, "kl": 0.3084982577711344, "learning_rate": 9.999999183335551e-06, "loss": -0.1246, "num_tokens": 2139937.0, "reward": -3.708850860595703, "reward_std": 6.1540679931640625, "rewards/rollout_reward_func/mean": -3.708850860595703, "rewards/rollout_reward_func/std": 7.61086893081665, "sampling/importance_sampling_ratio/max": 2.2644283771514893, "sampling/importance_sampling_ratio/mean": 0.9409332275390625, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2469470500946045, "sampling/sampling_logp_difference/mean": 0.08474647253751755, "step": 57, "step_time": 36.73273920400061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.31612110882997513, "epoch": 0.00116, "grad_norm": 0.9802373647689819, "kl": 0.3413949944078922, "learning_rate": 9.999999103706142e-06, "loss": -0.1259, "step": 58, "step_time": 5.886546145001375 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1837.0, "completions/max_terminated_length": 1837.0, "completions/mean_length": 1626.5625, "completions/mean_terminated_length": 1626.5625, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.30012011528015137, "epoch": 0.00118, "frac_reward_zero_std": 0.0, "grad_norm": 1.0481098890304565, "kl": 0.5502468682825565, "learning_rate": 9.999999020373038e-06, "loss": -0.1811, "num_tokens": 2213209.0, "reward": -5.389894485473633, "reward_std": 4.981201648712158, "rewards/rollout_reward_func/mean": -5.389894485473633, "rewards/rollout_reward_func/std": 6.769619941711426, "sampling/importance_sampling_ratio/max": 2.7308578491210938, "sampling/importance_sampling_ratio/mean": 0.9160431027412415, "sampling/importance_sampling_ratio/min": 0.08488596975803375, "sampling/sampling_logp_difference/max": 1.468017578125, "sampling/sampling_logp_difference/mean": 0.09639683365821838, "step": 59, "step_time": 37.19838971900026 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.2991691865026951, "epoch": 0.0012, "grad_norm": 0.8857253193855286, "kl": 0.5988470073789358, "learning_rate": 9.999998933336242e-06, "loss": -0.1844, "step": 60, "step_time": 6.367170782000358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1822.0, "completions/max_terminated_length": 1822.0, "completions/mean_length": 1680.0625, "completions/mean_terminated_length": 1680.0625, "completions/min_length": 1396.0, "completions/min_terminated_length": 1396.0, "entropy": 0.27592010982334614, "epoch": 0.00122, "frac_reward_zero_std": 0.125, "grad_norm": 1.353814721107483, "kl": 0.37851969711482525, "learning_rate": 9.999998842595754e-06, "loss": -0.0009, "num_tokens": 2288126.0, "reward": -4.979825496673584, "reward_std": 6.0550150871276855, "rewards/rollout_reward_func/mean": -4.979825496673584, "rewards/rollout_reward_func/std": 9.026530265808105, "sampling/importance_sampling_ratio/max": 1.9612035751342773, "sampling/importance_sampling_ratio/mean": 0.9742088317871094, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.270935535430908, "sampling/sampling_logp_difference/mean": 0.07654492557048798, "step": 61, "step_time": 37.15296104299978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.272587139159441, "epoch": 0.00124, "grad_norm": 1.322236180305481, "kl": 0.403486505150795, "learning_rate": 9.999998748151573e-06, "loss": -0.0003, "step": 62, "step_time": 5.884493231998022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1779.0, "completions/max_terminated_length": 1779.0, "completions/mean_length": 1606.4375, "completions/mean_terminated_length": 1606.4375, "completions/min_length": 1067.0, "completions/min_terminated_length": 1067.0, "entropy": 0.24129757285118103, "epoch": 0.00126, "frac_reward_zero_std": 0.125, "grad_norm": 0.8646777868270874, "kl": 0.3206884413957596, "learning_rate": 9.999998650003697e-06, "loss": -0.0263, "num_tokens": 2361057.0, "reward": -4.465510845184326, "reward_std": 6.844207286834717, "rewards/rollout_reward_func/mean": -4.465510845184326, "rewards/rollout_reward_func/std": 8.709650039672852, "sampling/importance_sampling_ratio/max": 2.8052289485931396, "sampling/importance_sampling_ratio/mean": 0.9944058656692505, "sampling/importance_sampling_ratio/min": 0.04083564504981041, "sampling/sampling_logp_difference/max": 1.8856086730957031, "sampling/sampling_logp_difference/mean": 0.07281368225812912, "step": 63, "step_time": 36.78395269500015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 0.23481187783181667, "epoch": 0.00128, "grad_norm": 0.843463659286499, "kl": 0.33893433026969433, "learning_rate": 9.999998548152132e-06, "loss": -0.027, "step": 64, "step_time": 5.761317184999825 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1810.0, "completions/max_terminated_length": 1810.0, "completions/mean_length": 1699.90625, "completions/mean_terminated_length": 1699.90625, "completions/min_length": 1413.0, "completions/min_terminated_length": 1413.0, "entropy": 0.2767509985715151, "epoch": 0.0013, "frac_reward_zero_std": 0.0, "grad_norm": 1.0217297077178955, "kl": 0.45753872115164995, "learning_rate": 9.999998442596872e-06, "loss": -0.0874, "num_tokens": 2437012.0, "reward": -3.256406307220459, "reward_std": 6.123032093048096, "rewards/rollout_reward_func/mean": -3.256406307220459, "rewards/rollout_reward_func/std": 6.840267181396484, "sampling/importance_sampling_ratio/max": 2.3409619331359863, "sampling/importance_sampling_ratio/mean": 0.8038904070854187, "sampling/importance_sampling_ratio/min": 0.022516217082738876, "sampling/sampling_logp_difference/max": 1.6859521865844727, "sampling/sampling_logp_difference/mean": 0.08254212141036987, "step": 65, "step_time": 36.94500934499865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 0.2692566681653261, "epoch": 0.00132, "grad_norm": 0.8563345670700073, "kl": 0.4730011150240898, "learning_rate": 9.999998333337923e-06, "loss": -0.0897, "step": 66, "step_time": 6.347890593002376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 1795.0, "completions/max_terminated_length": 1795.0, "completions/mean_length": 1669.125, "completions/mean_terminated_length": 1669.125, "completions/min_length": 641.0, "completions/min_terminated_length": 641.0, "entropy": 0.239725174382329, "epoch": 0.00134, "frac_reward_zero_std": 0.0, "grad_norm": 0.8270325660705566, "kl": 0.3960014134645462, "learning_rate": 9.99999822037528e-06, "loss": -0.107, "num_tokens": 2511757.0, "reward": -6.540927886962891, "reward_std": 7.1129469871521, "rewards/rollout_reward_func/mean": -6.540927886962891, "rewards/rollout_reward_func/std": 10.2684907913208, "sampling/importance_sampling_ratio/max": 2.9500951766967773, "sampling/importance_sampling_ratio/mean": 1.145231008529663, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.810743808746338, "sampling/sampling_logp_difference/mean": 0.07273144274950027, "step": 67, "step_time": 36.61473885000032 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.004557291744276881, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006510416744276881, "entropy": 0.23407302796840668, "epoch": 0.00136, "grad_norm": 0.7608462572097778, "kl": 0.389744964428246, "learning_rate": 9.999998103708944e-06, "loss": -0.1089, "step": 68, "step_time": 5.844826974000171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1819.0, "completions/max_terminated_length": 1819.0, "completions/mean_length": 1668.25, "completions/mean_terminated_length": 1668.25, "completions/min_length": 1502.0, "completions/min_terminated_length": 1502.0, "entropy": 0.2598415594547987, "epoch": 0.00138, "frac_reward_zero_std": 0.0, "grad_norm": 0.7419317960739136, "kl": 0.28893633373081684, "learning_rate": 9.999997983338918e-06, "loss": -0.0011, "num_tokens": 2586505.0, "reward": -7.137851238250732, "reward_std": 7.074878692626953, "rewards/rollout_reward_func/mean": -7.137851238250732, "rewards/rollout_reward_func/std": 10.856270790100098, "sampling/importance_sampling_ratio/max": 2.6760199069976807, "sampling/importance_sampling_ratio/mean": 0.8516594171524048, "sampling/importance_sampling_ratio/min": 0.13397420942783356, "sampling/sampling_logp_difference/max": 1.6180033683776855, "sampling/sampling_logp_difference/mean": 0.07290571928024292, "step": 69, "step_time": 37.49325247999877 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "entropy": 0.25334353744983673, "epoch": 0.0014, "grad_norm": 0.7260986566543579, "kl": 0.3038715925067663, "learning_rate": 9.999997859265198e-06, "loss": -0.0033, "step": 70, "step_time": 6.613173748000918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1825.0, "completions/max_terminated_length": 1825.0, "completions/mean_length": 1665.625, "completions/mean_terminated_length": 1665.625, "completions/min_length": 1103.0, "completions/min_terminated_length": 1103.0, "entropy": 0.1995892282575369, "epoch": 0.00142, "frac_reward_zero_std": 0.0, "grad_norm": 0.8726052641868591, "kl": 0.3035321347415447, "learning_rate": 9.999997731487788e-06, "loss": -0.1951, "num_tokens": 2661197.0, "reward": 0.01697838306427002, "reward_std": 5.1158766746521, "rewards/rollout_reward_func/mean": 0.01697838306427002, "rewards/rollout_reward_func/std": 10.660343170166016, "sampling/importance_sampling_ratio/max": 2.622579574584961, "sampling/importance_sampling_ratio/mean": 0.7724930047988892, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.3327304124832153, "sampling/sampling_logp_difference/mean": 0.068142369389534, "step": 71, "step_time": 35.71897001199886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.19654854200780392, "epoch": 0.00144, "grad_norm": 0.9153209328651428, "kl": 0.30595986917614937, "learning_rate": 9.999997600006685e-06, "loss": -0.1967, "step": 72, "step_time": 5.866601767999782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1808.0, "completions/max_terminated_length": 1808.0, "completions/mean_length": 1696.9375, "completions/mean_terminated_length": 1696.9375, "completions/min_length": 1599.0, "completions/min_terminated_length": 1599.0, "entropy": 0.21030581928789616, "epoch": 0.00146, "frac_reward_zero_std": 0.0, "grad_norm": 0.7310628890991211, "kl": 1.0501810098066926, "learning_rate": 9.999997464821892e-06, "loss": -0.2747, "num_tokens": 2736835.0, "reward": -7.002237319946289, "reward_std": 6.102571487426758, "rewards/rollout_reward_func/mean": -7.002237319946289, "rewards/rollout_reward_func/std": 10.413652420043945, "sampling/importance_sampling_ratio/max": 2.1960866451263428, "sampling/importance_sampling_ratio/mean": 0.8038663268089294, "sampling/importance_sampling_ratio/min": 0.046504825353622437, "sampling/sampling_logp_difference/max": 2.5245094299316406, "sampling/sampling_logp_difference/mean": 0.09849925339221954, "step": 73, "step_time": 38.96205426400138 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.20804075710475445, "epoch": 0.00148, "grad_norm": 0.6374855041503906, "kl": 0.9734249282628298, "learning_rate": 9.999997325933409e-06, "loss": -0.2766, "step": 74, "step_time": 5.84851037899989 }, { "clip_ratio/high_max": 0.0032051282469183207, "clip_ratio/high_mean": 0.0016025641234591603, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0035556891234591603, "completions/clipped_ratio": 0.0, "completions/max_length": 1783.0, "completions/max_terminated_length": 1783.0, "completions/mean_length": 1692.96875, "completions/mean_terminated_length": 1692.96875, "completions/min_length": 1394.0, "completions/min_terminated_length": 1394.0, "entropy": 0.19729920756071806, "epoch": 0.0015, "frac_reward_zero_std": 0.125, "grad_norm": 0.8172791600227356, "kl": 0.4751043822616339, "learning_rate": 9.999997183341233e-06, "loss": -0.142, "num_tokens": 2812279.0, "reward": -2.219168186187744, "reward_std": 9.400641441345215, "rewards/rollout_reward_func/mean": -2.219168186187744, "rewards/rollout_reward_func/std": 17.64300537109375, "sampling/importance_sampling_ratio/max": 2.173631429672241, "sampling/importance_sampling_ratio/mean": 0.8823361396789551, "sampling/importance_sampling_ratio/min": 0.03189156949520111, "sampling/sampling_logp_difference/max": 1.8222627639770508, "sampling/sampling_logp_difference/mean": 0.0679212361574173, "step": 75, "step_time": 35.097398169999906 }, { "clip_ratio/high_max": 0.007111378246918321, "clip_ratio/high_mean": 0.0035556891234591603, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00550881412345916, "entropy": 0.19708813540637493, "epoch": 0.00152, "grad_norm": 0.7900307774543762, "kl": 0.43427742179483175, "learning_rate": 9.999997037045365e-06, "loss": -0.1431, "step": 76, "step_time": 6.543489481999131 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1808.0, "completions/max_terminated_length": 1808.0, "completions/mean_length": 1717.5625, "completions/mean_terminated_length": 1717.5625, "completions/min_length": 1391.0, "completions/min_terminated_length": 1391.0, "entropy": 0.15479024220257998, "epoch": 0.00154, "frac_reward_zero_std": 0.125, "grad_norm": 0.6995255947113037, "kl": 0.30347975343465805, "learning_rate": 9.999996887045808e-06, "loss": 0.0408, "num_tokens": 2888657.0, "reward": -4.00355339050293, "reward_std": 4.775286674499512, "rewards/rollout_reward_func/mean": -4.00355339050293, "rewards/rollout_reward_func/std": 6.246252536773682, "sampling/importance_sampling_ratio/max": 1.650878667831421, "sampling/importance_sampling_ratio/mean": 1.0060396194458008, "sampling/importance_sampling_ratio/min": 0.047248467803001404, "sampling/sampling_logp_difference/max": 1.4376678466796875, "sampling/sampling_logp_difference/mean": 0.05947484076023102, "step": 77, "step_time": 38.8434166800007 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.15612738858908415, "epoch": 0.00156, "grad_norm": 0.6284993886947632, "kl": 0.2821835596114397, "learning_rate": 9.99999673334256e-06, "loss": 0.0397, "step": 78, "step_time": 5.918868127000678 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 1762.0, "completions/max_terminated_length": 1762.0, "completions/mean_length": 1664.59375, "completions/mean_terminated_length": 1664.59375, "completions/min_length": 1434.0, "completions/min_terminated_length": 1434.0, "entropy": 0.17359685897827148, "epoch": 0.00158, "frac_reward_zero_std": 0.0, "grad_norm": 0.977432370185852, "kl": 0.6586176492273808, "learning_rate": 9.99999657593562e-06, "loss": -0.2064, "num_tokens": 2963303.0, "reward": -4.805361270904541, "reward_std": 6.155376434326172, "rewards/rollout_reward_func/mean": -4.805361270904541, "rewards/rollout_reward_func/std": 7.987611293792725, "sampling/importance_sampling_ratio/max": 2.683112144470215, "sampling/importance_sampling_ratio/mean": 0.8393880128860474, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.071775197982788, "sampling/sampling_logp_difference/mean": 0.07382857799530029, "step": 79, "step_time": 37.2499062830002 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.1753261275589466, "epoch": 0.0016, "grad_norm": 0.7493237257003784, "kl": 0.522200190462172, "learning_rate": 9.99999641482499e-06, "loss": -0.2097, "step": 80, "step_time": 5.753503210998133 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1832.0, "completions/max_terminated_length": 1832.0, "completions/mean_length": 1708.34375, "completions/mean_terminated_length": 1708.34375, "completions/min_length": 1412.0, "completions/min_terminated_length": 1412.0, "entropy": 0.2533543687313795, "epoch": 0.00162, "frac_reward_zero_std": 0.0, "grad_norm": 0.9206402897834778, "kl": 0.49991538375616074, "learning_rate": 9.999996250010671e-06, "loss": -0.1446, "num_tokens": 3039686.0, "reward": -3.956460952758789, "reward_std": 7.495929718017578, "rewards/rollout_reward_func/mean": -3.956460952758789, "rewards/rollout_reward_func/std": 8.32241153717041, "sampling/importance_sampling_ratio/max": 2.4827659130096436, "sampling/importance_sampling_ratio/mean": 0.9107170701026917, "sampling/importance_sampling_ratio/min": 0.11611815541982651, "sampling/sampling_logp_difference/max": 1.4147658348083496, "sampling/sampling_logp_difference/mean": 0.07592972368001938, "step": 81, "step_time": 37.18665667199912 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.2543573584407568, "epoch": 0.00164, "grad_norm": 0.854416012763977, "kl": 0.47987215034663677, "learning_rate": 9.999996081492662e-06, "loss": -0.1459, "step": 82, "step_time": 5.884696772999632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1824.0, "completions/max_terminated_length": 1824.0, "completions/mean_length": 1733.9375, "completions/mean_terminated_length": 1733.9375, "completions/min_length": 1618.0, "completions/min_terminated_length": 1618.0, "entropy": 0.15933354571461678, "epoch": 0.00166, "frac_reward_zero_std": 0.0, "grad_norm": 0.8887842893600464, "kl": 0.3857283741235733, "learning_rate": 9.999995909270962e-06, "loss": -0.166, "num_tokens": 3116205.0, "reward": -4.7430243492126465, "reward_std": 5.144591808319092, "rewards/rollout_reward_func/mean": -4.7430243492126465, "rewards/rollout_reward_func/std": 8.597419738769531, "sampling/importance_sampling_ratio/max": 2.8380703926086426, "sampling/importance_sampling_ratio/mean": 1.0134353637695312, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.3160133361816406, "sampling/sampling_logp_difference/mean": 0.05554642528295517, "step": 83, "step_time": 37.96955776300001 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.15759241580963135, "epoch": 0.00168, "grad_norm": 0.837668240070343, "kl": 0.40188954304903746, "learning_rate": 9.999995733345573e-06, "loss": -0.1676, "step": 84, "step_time": 5.867369008998139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1805.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 1607.8125, "completions/mean_terminated_length": 1607.8125, "completions/min_length": 745.0, "completions/min_terminated_length": 745.0, "entropy": 0.2709789536893368, "epoch": 0.0017, "frac_reward_zero_std": 0.0, "grad_norm": 0.9693952798843384, "kl": 0.7671259762719274, "learning_rate": 9.999995553716494e-06, "loss": -0.0133, "num_tokens": 3189437.0, "reward": -6.506250381469727, "reward_std": 6.401736736297607, "rewards/rollout_reward_func/mean": -6.506250381469727, "rewards/rollout_reward_func/std": 11.671521186828613, "sampling/importance_sampling_ratio/max": 2.657285690307617, "sampling/importance_sampling_ratio/mean": 0.9502277374267578, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.1241543292999268, "sampling/sampling_logp_difference/mean": 0.06842806935310364, "step": 85, "step_time": 35.07483531900107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0047940341755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0047940341755747795, "entropy": 0.2695111036300659, "epoch": 0.00172, "grad_norm": 0.9897550344467163, "kl": 0.8880385467782617, "learning_rate": 9.999995370383725e-06, "loss": -0.0147, "step": 86, "step_time": 5.823899960000745 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1834.0, "completions/max_terminated_length": 1834.0, "completions/mean_length": 1718.625, "completions/mean_terminated_length": 1718.625, "completions/min_length": 1620.0, "completions/min_terminated_length": 1620.0, "entropy": 0.19925166107714176, "epoch": 0.00174, "frac_reward_zero_std": 0.0, "grad_norm": 0.8225500583648682, "kl": 0.40306959114968777, "learning_rate": 9.999995183347268e-06, "loss": -0.1216, "num_tokens": 3265817.0, "reward": -4.694127082824707, "reward_std": 7.164846897125244, "rewards/rollout_reward_func/mean": -4.694127082824707, "rewards/rollout_reward_func/std": 10.58739948272705, "sampling/importance_sampling_ratio/max": 2.249318838119507, "sampling/importance_sampling_ratio/mean": 0.8394811153411865, "sampling/importance_sampling_ratio/min": 0.12018804997205734, "sampling/sampling_logp_difference/max": 1.5835975408554077, "sampling/sampling_logp_difference/mean": 0.07420962303876877, "step": 87, "step_time": 38.998291893000896 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.1985994167625904, "epoch": 0.00176, "grad_norm": 0.7866024374961853, "kl": 0.4124698657542467, "learning_rate": 9.999994992607122e-06, "loss": -0.1228, "step": 88, "step_time": 6.345813049000753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005430640187114477, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005430640187114477, "completions/clipped_ratio": 0.0, "completions/max_length": 1791.0, "completions/max_terminated_length": 1791.0, "completions/mean_length": 1672.28125, "completions/mean_terminated_length": 1672.28125, "completions/min_length": 1437.0, "completions/min_terminated_length": 1437.0, "entropy": 0.15057391114532948, "epoch": 0.00178, "frac_reward_zero_std": 0.0, "grad_norm": 0.8857223987579346, "kl": 0.7324486412107944, "learning_rate": 9.999994798163286e-06, "loss": -0.1942, "num_tokens": 3340291.0, "reward": -4.911107540130615, "reward_std": 5.658788204193115, "rewards/rollout_reward_func/mean": -4.911107540130615, "rewards/rollout_reward_func/std": 9.124991416931152, "sampling/importance_sampling_ratio/max": 1.8904942274093628, "sampling/importance_sampling_ratio/mean": 0.8380607962608337, "sampling/importance_sampling_ratio/min": 0.02950156107544899, "sampling/sampling_logp_difference/max": 1.5451288223266602, "sampling/sampling_logp_difference/mean": 0.06332387030124664, "step": 89, "step_time": 37.11040565100211 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 0.15209791343659163, "epoch": 0.0018, "grad_norm": 0.7109376788139343, "kl": 0.6368166394531727, "learning_rate": 9.999994600015764e-06, "loss": -0.1955, "step": 90, "step_time": 5.825622919000125 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1821.0, "completions/max_terminated_length": 1821.0, "completions/mean_length": 1721.84375, "completions/mean_terminated_length": 1721.84375, "completions/min_length": 1530.0, "completions/min_terminated_length": 1530.0, "entropy": 0.16620426252484322, "epoch": 0.00182, "frac_reward_zero_std": 0.0, "grad_norm": 2.4581727981567383, "kl": 1.7564852200448513, "learning_rate": 9.99999439816455e-06, "loss": -0.0169, "num_tokens": 3416936.0, "reward": -4.905522346496582, "reward_std": 7.8199005126953125, "rewards/rollout_reward_func/mean": -4.905522346496582, "rewards/rollout_reward_func/std": 8.327784538269043, "sampling/importance_sampling_ratio/max": 1.7173593044281006, "sampling/importance_sampling_ratio/mean": 0.9146490693092346, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.112088203430176, "sampling/sampling_logp_difference/mean": 0.06157752498984337, "step": 91, "step_time": 37.20597630500015 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.17287507839500904, "epoch": 0.00184, "grad_norm": 1.492632269859314, "kl": 1.1148988083004951, "learning_rate": 9.999994192609649e-06, "loss": -0.023, "step": 92, "step_time": 5.926312706999852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1774.0, "completions/max_terminated_length": 1774.0, "completions/mean_length": 1653.09375, "completions/mean_terminated_length": 1653.09375, "completions/min_length": 1566.0, "completions/min_terminated_length": 1566.0, "entropy": 0.21760124899446964, "epoch": 0.00186, "frac_reward_zero_std": 0.0, "grad_norm": 0.9200534820556641, "kl": 0.26406371779739857, "learning_rate": 9.99999398335106e-06, "loss": -0.0031, "num_tokens": 3491478.0, "reward": -5.420025825500488, "reward_std": 5.013497829437256, "rewards/rollout_reward_func/mean": -5.420025825500488, "rewards/rollout_reward_func/std": 6.584102630615234, "sampling/importance_sampling_ratio/max": 2.963585376739502, "sampling/importance_sampling_ratio/mean": 0.8498660326004028, "sampling/importance_sampling_ratio/min": 0.2295318990945816, "sampling/sampling_logp_difference/max": 0.9062635898590088, "sampling/sampling_logp_difference/mean": 0.06188333407044411, "step": 93, "step_time": 38.3763018009995 }, { "clip_ratio/high_max": 0.01953125, "clip_ratio/high_mean": 0.01171875, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013671875, "entropy": 0.22959251329302788, "epoch": 0.00188, "grad_norm": 0.6035089492797852, "kl": 0.23167249467223883, "learning_rate": 9.999993770388785e-06, "loss": -0.0067, "step": 94, "step_time": 5.798366992999036 }, { "clip_ratio/high_max": 0.007694128900766373, "clip_ratio/high_mean": 0.0038470644503831863, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0038470644503831863, "completions/clipped_ratio": 0.0, "completions/max_length": 1753.0, "completions/max_terminated_length": 1753.0, "completions/mean_length": 1658.03125, "completions/mean_terminated_length": 1658.03125, "completions/min_length": 1455.0, "completions/min_terminated_length": 1455.0, "entropy": 0.20340878516435623, "epoch": 0.0019, "frac_reward_zero_std": 0.0, "grad_norm": 0.7779289484024048, "kl": 0.3094237130135298, "learning_rate": 9.99999355372282e-06, "loss": -0.1794, "num_tokens": 3565673.0, "reward": -5.6944780349731445, "reward_std": 3.949958562850952, "rewards/rollout_reward_func/mean": -5.6944780349731445, "rewards/rollout_reward_func/std": 5.5989580154418945, "sampling/importance_sampling_ratio/max": 1.688936471939087, "sampling/importance_sampling_ratio/mean": 0.9206300973892212, "sampling/importance_sampling_ratio/min": 0.1527530699968338, "sampling/sampling_logp_difference/max": 0.9441490173339844, "sampling/sampling_logp_difference/mean": 0.050117556005716324, "step": 95, "step_time": 37.41757664999841 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.2086612544953823, "epoch": 0.00192, "grad_norm": 0.7833954691886902, "kl": 0.29302166029810905, "learning_rate": 9.999993333353169e-06, "loss": -0.1799, "step": 96, "step_time": 5.725412834000053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "completions/clipped_ratio": 0.0, "completions/max_length": 1804.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 1496.625, "completions/mean_terminated_length": 1496.625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.2479400299489498, "epoch": 0.00194, "frac_reward_zero_std": 0.0, "grad_norm": 1.1320165395736694, "kl": 0.3989646164700389, "learning_rate": 9.999993109279829e-06, "loss": -0.0546, "num_tokens": 3635185.0, "reward": -5.023990154266357, "reward_std": 3.9877772331237793, "rewards/rollout_reward_func/mean": -5.023990154266357, "rewards/rollout_reward_func/std": 8.748732566833496, "sampling/importance_sampling_ratio/max": 2.214284896850586, "sampling/importance_sampling_ratio/mean": 0.9846020936965942, "sampling/importance_sampling_ratio/min": 0.30134767293930054, "sampling/sampling_logp_difference/max": 0.9754681587219238, "sampling/sampling_logp_difference/mean": 0.061219509690999985, "step": 97, "step_time": 36.602559436000774 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.008984375046566129, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010937500046566129, "entropy": 0.25142903439700603, "epoch": 0.00196, "grad_norm": 1.64491868019104, "kl": 0.39512724056839943, "learning_rate": 9.999992881502803e-06, "loss": -0.0574, "step": 98, "step_time": 6.578721888999098 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1808.0, "completions/max_terminated_length": 1808.0, "completions/mean_length": 1710.1875, "completions/mean_terminated_length": 1710.1875, "completions/min_length": 1435.0, "completions/min_terminated_length": 1435.0, "entropy": 0.23437649384140968, "epoch": 0.00198, "frac_reward_zero_std": 0.125, "grad_norm": 0.9414642453193665, "kl": 0.3236832795664668, "learning_rate": 9.999992650022092e-06, "loss": -0.1719, "num_tokens": 3711125.0, "reward": -2.0210158824920654, "reward_std": 4.779875755310059, "rewards/rollout_reward_func/mean": -2.0210158824920654, "rewards/rollout_reward_func/std": 6.4040093421936035, "sampling/importance_sampling_ratio/max": 2.3287320137023926, "sampling/importance_sampling_ratio/mean": 1.0207520723342896, "sampling/importance_sampling_ratio/min": 0.35892435908317566, "sampling/sampling_logp_difference/max": 1.0962285995483398, "sampling/sampling_logp_difference/mean": 0.044818222522735596, "step": 99, "step_time": 36.753581342999496 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.2348782755434513, "epoch": 0.002, "grad_norm": 0.9503746628761292, "kl": 0.3419312732294202, "learning_rate": 9.999992414837692e-06, "loss": -0.1746, "step": 100, "step_time": 5.861453168999105 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1808.0, "completions/max_terminated_length": 1808.0, "completions/mean_length": 1658.5, "completions/mean_terminated_length": 1658.5, "completions/min_length": 665.0, "completions/min_terminated_length": 665.0, "entropy": 0.2949713133275509, "epoch": 0.00202, "frac_reward_zero_std": 0.0, "grad_norm": 0.8095335960388184, "kl": 0.4149684626609087, "learning_rate": 9.999992175949606e-06, "loss": -0.1327, "num_tokens": 3785245.0, "reward": -1.9772329330444336, "reward_std": 6.376922607421875, "rewards/rollout_reward_func/mean": -1.9772329330444336, "rewards/rollout_reward_func/std": 9.654932022094727, "sampling/importance_sampling_ratio/max": 1.6300160884857178, "sampling/importance_sampling_ratio/mean": 0.9628180265426636, "sampling/importance_sampling_ratio/min": 0.14600704610347748, "sampling/sampling_logp_difference/max": 1.1694939136505127, "sampling/sampling_logp_difference/mean": 0.055236026644706726, "step": 101, "step_time": 36.26178865900147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2924080714583397, "epoch": 0.00204, "grad_norm": 0.8164064884185791, "kl": 0.48169367760419846, "learning_rate": 9.999991933357835e-06, "loss": -0.1325, "step": 102, "step_time": 5.8681000480000876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1814.0, "completions/max_terminated_length": 1814.0, "completions/mean_length": 1653.375, "completions/mean_terminated_length": 1653.375, "completions/min_length": 663.0, "completions/min_terminated_length": 663.0, "entropy": 0.2441821303218603, "epoch": 0.00206, "frac_reward_zero_std": 0.0, "grad_norm": 1.1374014616012573, "kl": 0.19944205041974783, "learning_rate": 9.999991687062379e-06, "loss": -0.1332, "num_tokens": 3859549.0, "reward": -3.1205766201019287, "reward_std": 6.89734411239624, "rewards/rollout_reward_func/mean": -3.1205766201019287, "rewards/rollout_reward_func/std": 8.786608695983887, "sampling/importance_sampling_ratio/max": 2.907442092895508, "sampling/importance_sampling_ratio/mean": 0.9543389678001404, "sampling/importance_sampling_ratio/min": 0.26082849502563477, "sampling/sampling_logp_difference/max": 1.416438102722168, "sampling/sampling_logp_difference/mean": 0.04272625967860222, "step": 103, "step_time": 35.06253007600026 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.006510416744276881, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008463541744276881, "entropy": 0.24291709996759892, "epoch": 0.00208, "grad_norm": 1.0697972774505615, "kl": 0.21361435670405626, "learning_rate": 9.999991437063234e-06, "loss": -0.1369, "step": 104, "step_time": 6.35797552799977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1813.0, "completions/max_terminated_length": 1813.0, "completions/mean_length": 1677.625, "completions/mean_terminated_length": 1677.625, "completions/min_length": 1535.0, "completions/min_terminated_length": 1535.0, "entropy": 0.2205460276454687, "epoch": 0.0021, "frac_reward_zero_std": 0.0, "grad_norm": 0.7834408283233643, "kl": 0.41426409501582384, "learning_rate": 9.999991183360406e-06, "loss": -0.1262, "num_tokens": 3934186.0, "reward": -1.3775752782821655, "reward_std": 6.934841156005859, "rewards/rollout_reward_func/mean": -1.3775752782821655, "rewards/rollout_reward_func/std": 9.078507423400879, "sampling/importance_sampling_ratio/max": 1.6336519718170166, "sampling/importance_sampling_ratio/mean": 0.9894572496414185, "sampling/importance_sampling_ratio/min": 0.09595068544149399, "sampling/sampling_logp_difference/max": 1.2380528450012207, "sampling/sampling_logp_difference/mean": 0.04348953068256378, "step": 105, "step_time": 37.434679850000975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 0.21956982091069221, "epoch": 0.00212, "grad_norm": 0.7701007723808289, "kl": 0.435552092269063, "learning_rate": 9.999990925953894e-06, "loss": -0.1276, "step": 106, "step_time": 5.842552839000746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1784.0, "completions/max_terminated_length": 1784.0, "completions/mean_length": 1683.90625, "completions/mean_terminated_length": 1683.90625, "completions/min_length": 1482.0, "completions/min_terminated_length": 1482.0, "entropy": 0.23681390658020973, "epoch": 0.00214, "frac_reward_zero_std": 0.0, "grad_norm": 0.7701250314712524, "kl": 0.675053995102644, "learning_rate": 9.999990664843696e-06, "loss": -0.2223, "num_tokens": 4009166.0, "reward": -3.5597405433654785, "reward_std": 6.297412395477295, "rewards/rollout_reward_func/mean": -3.5597405433654785, "rewards/rollout_reward_func/std": 8.763272285461426, "sampling/importance_sampling_ratio/max": 1.9215384721755981, "sampling/importance_sampling_ratio/mean": 0.8674914836883545, "sampling/importance_sampling_ratio/min": 0.052470579743385315, "sampling/sampling_logp_difference/max": 1.6477103233337402, "sampling/sampling_logp_difference/mean": 0.06225297600030899, "step": 107, "step_time": 38.8391734249999 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.23528996109962463, "epoch": 0.00216, "grad_norm": 0.7644266486167908, "kl": 0.6335257366299629, "learning_rate": 9.999990400029814e-06, "loss": -0.2237, "step": 108, "step_time": 5.8248516069998 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1759.0, "completions/max_terminated_length": 1759.0, "completions/mean_length": 1650.09375, "completions/mean_terminated_length": 1650.09375, "completions/min_length": 1436.0, "completions/min_terminated_length": 1436.0, "entropy": 0.2473286334425211, "epoch": 0.00218, "frac_reward_zero_std": 0.0, "grad_norm": 0.9451829791069031, "kl": 0.3451357875019312, "learning_rate": 9.999990131512245e-06, "loss": -0.1751, "num_tokens": 4083095.0, "reward": -3.8237268924713135, "reward_std": 5.033920764923096, "rewards/rollout_reward_func/mean": -3.8237268924713135, "rewards/rollout_reward_func/std": 6.137859344482422, "sampling/importance_sampling_ratio/max": 1.8476587533950806, "sampling/importance_sampling_ratio/mean": 1.0128724575042725, "sampling/importance_sampling_ratio/min": 0.1566690355539322, "sampling/sampling_logp_difference/max": 1.2108018398284912, "sampling/sampling_logp_difference/mean": 0.03772260248661041, "step": 109, "step_time": 37.583575454999846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.24676945246756077, "epoch": 0.0022, "grad_norm": 0.8795206546783447, "kl": 0.3198219258338213, "learning_rate": 9.999989859290995e-06, "loss": -0.1785, "step": 110, "step_time": 6.2266275209995 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1813.0, "completions/max_terminated_length": 1813.0, "completions/mean_length": 1680.4375, "completions/mean_terminated_length": 1680.4375, "completions/min_length": 1474.0, "completions/min_terminated_length": 1474.0, "entropy": 0.2194829098880291, "epoch": 0.00222, "frac_reward_zero_std": 0.0, "grad_norm": 0.7685462832450867, "kl": 0.49289337545633316, "learning_rate": 9.99998958336606e-06, "loss": -0.1256, "num_tokens": 4158181.0, "reward": -5.0853400230407715, "reward_std": 3.0313239097595215, "rewards/rollout_reward_func/mean": -5.0853400230407715, "rewards/rollout_reward_func/std": 4.776071548461914, "sampling/importance_sampling_ratio/max": 1.744321584701538, "sampling/importance_sampling_ratio/mean": 0.8627097606658936, "sampling/importance_sampling_ratio/min": 0.1638958901166916, "sampling/sampling_logp_difference/max": 1.4669370651245117, "sampling/sampling_logp_difference/mean": 0.04983227327466011, "step": 111, "step_time": 37.50297103200319 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.2189310658723116, "epoch": 0.00224, "grad_norm": 0.7485074400901794, "kl": 0.480657372623682, "learning_rate": 9.999989303737442e-06, "loss": -0.1253, "step": 112, "step_time": 5.845547954002541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1802.0, "completions/max_terminated_length": 1802.0, "completions/mean_length": 1632.78125, "completions/mean_terminated_length": 1632.78125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "entropy": 0.25709761306643486, "epoch": 0.00226, "frac_reward_zero_std": 0.0, "grad_norm": 1.0180292129516602, "kl": 0.28133167419582605, "learning_rate": 9.999989020405141e-06, "loss": -0.0265, "num_tokens": 4231873.0, "reward": -4.746943950653076, "reward_std": 5.858287811279297, "rewards/rollout_reward_func/mean": -4.746943950653076, "rewards/rollout_reward_func/std": 8.34781551361084, "sampling/importance_sampling_ratio/max": 1.501573920249939, "sampling/importance_sampling_ratio/mean": 0.8685078024864197, "sampling/importance_sampling_ratio/min": 0.08650124073028564, "sampling/sampling_logp_difference/max": 1.4456124305725098, "sampling/sampling_logp_difference/mean": 0.04686921089887619, "step": 113, "step_time": 35.9479068110013 }, { "clip_ratio/high_max": 0.014062500093132257, "clip_ratio/high_mean": 0.007031250046566129, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010937500046566129, "entropy": 0.2569599896669388, "epoch": 0.00228, "grad_norm": 0.9531951546669006, "kl": 0.2381299063563347, "learning_rate": 9.999988733369157e-06, "loss": -0.0309, "step": 114, "step_time": 6.549294945001748 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1823.0, "completions/max_terminated_length": 1823.0, "completions/mean_length": 1726.15625, "completions/mean_terminated_length": 1726.15625, "completions/min_length": 1611.0, "completions/min_terminated_length": 1611.0, "entropy": 0.23173769749701023, "epoch": 0.0023, "frac_reward_zero_std": 0.125, "grad_norm": 0.8843880891799927, "kl": 0.37245292216539383, "learning_rate": 9.999988442629489e-06, "loss": 0.0203, "num_tokens": 4308553.0, "reward": -4.190584182739258, "reward_std": 2.972388744354248, "rewards/rollout_reward_func/mean": -4.190584182739258, "rewards/rollout_reward_func/std": 5.20486307144165, "sampling/importance_sampling_ratio/max": 1.6944365501403809, "sampling/importance_sampling_ratio/mean": 0.9236411452293396, "sampling/importance_sampling_ratio/min": 0.13073918223381042, "sampling/sampling_logp_difference/max": 1.3462402820587158, "sampling/sampling_logp_difference/mean": 0.052857253700494766, "step": 115, "step_time": 38.24738468399846 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.23898235149681568, "epoch": 0.00232, "grad_norm": 0.7934396266937256, "kl": 0.37963598500937223, "learning_rate": 9.99998814818614e-06, "loss": 0.0198, "step": 116, "step_time": 5.872056240998063 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1798.0, "completions/max_terminated_length": 1798.0, "completions/mean_length": 1668.34375, "completions/mean_terminated_length": 1668.34375, "completions/min_length": 1463.0, "completions/min_terminated_length": 1463.0, "entropy": 0.2294948324561119, "epoch": 0.00234, "frac_reward_zero_std": 0.0, "grad_norm": 1.2376160621643066, "kl": 0.2936890870332718, "learning_rate": 9.999987850039108e-06, "loss": -0.1774, "num_tokens": 4383556.0, "reward": -3.607532501220703, "reward_std": 5.6845526695251465, "rewards/rollout_reward_func/mean": -3.607532501220703, "rewards/rollout_reward_func/std": 8.885178565979004, "sampling/importance_sampling_ratio/max": 2.275712251663208, "sampling/importance_sampling_ratio/mean": 1.1071913242340088, "sampling/importance_sampling_ratio/min": 0.16783574223518372, "sampling/sampling_logp_difference/max": 1.2079877853393555, "sampling/sampling_logp_difference/mean": 0.04779823124408722, "step": 117, "step_time": 37.413396432997615 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "entropy": 0.23724446073174477, "epoch": 0.00236, "grad_norm": 1.1334284543991089, "kl": 0.257572659291327, "learning_rate": 9.999987548188395e-06, "loss": -0.1829, "step": 118, "step_time": 5.831991403998472 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1806.0, "completions/max_terminated_length": 1806.0, "completions/mean_length": 1690.75, "completions/mean_terminated_length": 1690.75, "completions/min_length": 1531.0, "completions/min_terminated_length": 1531.0, "entropy": 0.27842383086681366, "epoch": 0.00238, "frac_reward_zero_std": 0.0, "grad_norm": 0.7958987355232239, "kl": 0.3204949628561735, "learning_rate": 9.999987242634e-06, "loss": -0.104, "num_tokens": 4458969.0, "reward": -8.779083251953125, "reward_std": 6.466405868530273, "rewards/rollout_reward_func/mean": -8.779083251953125, "rewards/rollout_reward_func/std": 12.54110336303711, "sampling/importance_sampling_ratio/max": 1.7512174844741821, "sampling/importance_sampling_ratio/mean": 0.9728891849517822, "sampling/importance_sampling_ratio/min": 0.23163382709026337, "sampling/sampling_logp_difference/max": 1.429762363433838, "sampling/sampling_logp_difference/mean": 0.059581462293863297, "step": 119, "step_time": 37.53063563500109 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "entropy": 0.2830961886793375, "epoch": 0.0024, "grad_norm": 0.7229918837547302, "kl": 0.32604870945215225, "learning_rate": 9.999986933375924e-06, "loss": -0.107, "step": 120, "step_time": 6.557443951995083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1802.0, "completions/max_terminated_length": 1802.0, "completions/mean_length": 1673.0625, "completions/mean_terminated_length": 1669.4515380859375, "completions/min_length": 1072.0, "completions/min_terminated_length": 1072.0, "entropy": 0.3067500479519367, "epoch": 0.00242, "frac_reward_zero_std": 0.0, "grad_norm": 1.1562621593475342, "kl": 0.21205937396734953, "learning_rate": 9.999986620414169e-06, "loss": -0.1476, "num_tokens": 4533539.0, "reward": -5.477802276611328, "reward_std": 3.7002973556518555, "rewards/rollout_reward_func/mean": -5.477802276611328, "rewards/rollout_reward_func/std": 4.9684367179870605, "sampling/importance_sampling_ratio/max": 2.1120123863220215, "sampling/importance_sampling_ratio/mean": 1.002963900566101, "sampling/importance_sampling_ratio/min": 0.1644321084022522, "sampling/sampling_logp_difference/max": 0.7201070785522461, "sampling/sampling_logp_difference/mean": 0.044744931161403656, "step": 121, "step_time": 37.94513329599977 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 0.30317614786326885, "epoch": 0.00244, "grad_norm": 1.1647439002990723, "kl": 0.2262652236968279, "learning_rate": 9.999986303748731e-06, "loss": -0.1508, "step": 122, "step_time": 5.857959121001841 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1785.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 1429.46875, "completions/mean_terminated_length": 1429.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.22994763404130936, "epoch": 0.00246, "frac_reward_zero_std": 0.0, "grad_norm": 1.0889474153518677, "kl": 0.7471896391361952, "learning_rate": 9.999985983379614e-06, "loss": -0.1179, "num_tokens": 4600314.0, "reward": -1.682168960571289, "reward_std": 8.262513160705566, "rewards/rollout_reward_func/mean": -1.682168960571289, "rewards/rollout_reward_func/std": 16.95815658569336, "sampling/importance_sampling_ratio/max": 2.0507054328918457, "sampling/importance_sampling_ratio/mean": 0.9804076552391052, "sampling/importance_sampling_ratio/min": 0.13134591281414032, "sampling/sampling_logp_difference/max": 1.8472480773925781, "sampling/sampling_logp_difference/mean": 0.06341268122196198, "step": 123, "step_time": 33.21367415700297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.22646107524633408, "epoch": 0.00248, "grad_norm": 1.079579472541809, "kl": 0.8753251153975725, "learning_rate": 9.999985659306817e-06, "loss": -0.121, "step": 124, "step_time": 5.792652656999053 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1804.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 1706.4375, "completions/mean_terminated_length": 1706.4375, "completions/min_length": 1559.0, "completions/min_terminated_length": 1559.0, "entropy": 0.282099112868309, "epoch": 0.0025, "frac_reward_zero_std": 0.0, "grad_norm": 1.3554356098175049, "kl": 0.2657380551099777, "learning_rate": 9.999985331530339e-06, "loss": -0.0685, "num_tokens": 4676158.0, "reward": 1.4572546482086182, "reward_std": 7.827357292175293, "rewards/rollout_reward_func/mean": 1.4572546482086182, "rewards/rollout_reward_func/std": 8.701656341552734, "sampling/importance_sampling_ratio/max": 2.6973798274993896, "sampling/importance_sampling_ratio/mean": 0.9617570638656616, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.366655945777893, "sampling/sampling_logp_difference/mean": 0.06495144963264465, "step": 125, "step_time": 38.350035333998676 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.2868986092507839, "epoch": 0.00252, "grad_norm": 1.1578820943832397, "kl": 0.2646348997950554, "learning_rate": 9.999985000050181e-06, "loss": -0.0723, "step": 126, "step_time": 6.268096281999533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1826.0, "completions/max_terminated_length": 1826.0, "completions/mean_length": 1739.8125, "completions/mean_terminated_length": 1739.8125, "completions/min_length": 1644.0, "completions/min_terminated_length": 1644.0, "entropy": 0.26093689538538456, "epoch": 0.00254, "frac_reward_zero_std": 0.0, "grad_norm": 1.2453805208206177, "kl": 0.4399577025324106, "learning_rate": 9.999984664866347e-06, "loss": -0.0086, "num_tokens": 4753406.0, "reward": -2.7315988540649414, "reward_std": 4.536945343017578, "rewards/rollout_reward_func/mean": -2.7315988540649414, "rewards/rollout_reward_func/std": 7.6850104331970215, "sampling/importance_sampling_ratio/max": 2.3371880054473877, "sampling/importance_sampling_ratio/mean": 1.0783873796463013, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2284293174743652, "sampling/sampling_logp_difference/mean": 0.04868567734956741, "step": 127, "step_time": 37.24148940499981 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "entropy": 0.26575652323663235, "epoch": 0.00256, "grad_norm": 1.235435962677002, "kl": 0.4320835890248418, "learning_rate": 9.999984325978833e-06, "loss": -0.0116, "step": 128, "step_time": 5.902758816999267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1805.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 1686.875, "completions/mean_terminated_length": 1686.875, "completions/min_length": 1159.0, "completions/min_terminated_length": 1159.0, "entropy": 0.3018810376524925, "epoch": 0.00258, "frac_reward_zero_std": 0.0, "grad_norm": 0.9894475936889648, "kl": 0.32896456494927406, "learning_rate": 9.99998398338764e-06, "loss": -0.089, "num_tokens": 4828478.0, "reward": -1.9806309938430786, "reward_std": 5.783495903015137, "rewards/rollout_reward_func/mean": -1.9806309938430786, "rewards/rollout_reward_func/std": 9.691821098327637, "sampling/importance_sampling_ratio/max": 2.419085741043091, "sampling/importance_sampling_ratio/mean": 0.9663759469985962, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.6888089179992676, "sampling/sampling_logp_difference/mean": 0.06528542190790176, "step": 129, "step_time": 37.874985575997925 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.3036986291408539, "epoch": 0.0026, "grad_norm": 0.9928931593894958, "kl": 0.33502755127847195, "learning_rate": 9.99998363709277e-06, "loss": -0.0895, "step": 130, "step_time": 5.84719717599728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1772.0, "completions/max_terminated_length": 1772.0, "completions/mean_length": 1671.9375, "completions/mean_terminated_length": 1671.9375, "completions/min_length": 1508.0, "completions/min_terminated_length": 1508.0, "entropy": 0.276044437661767, "epoch": 0.00262, "frac_reward_zero_std": 0.125, "grad_norm": 0.8475321531295776, "kl": 0.3156882934272289, "learning_rate": 9.999983287094222e-06, "loss": -0.0246, "num_tokens": 4903229.0, "reward": -4.237326145172119, "reward_std": 7.174188613891602, "rewards/rollout_reward_func/mean": -4.237326145172119, "rewards/rollout_reward_func/std": 13.78705883026123, "sampling/importance_sampling_ratio/max": 2.0592706203460693, "sampling/importance_sampling_ratio/mean": 0.9671538472175598, "sampling/importance_sampling_ratio/min": 0.17537109553813934, "sampling/sampling_logp_difference/max": 1.267343282699585, "sampling/sampling_logp_difference/mean": 0.05497532710433006, "step": 131, "step_time": 38.15398663500309 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.27523134648799896, "epoch": 0.00264, "grad_norm": 0.8275483250617981, "kl": 0.3333571758121252, "learning_rate": 9.999982933391998e-06, "loss": -0.0265, "step": 132, "step_time": 6.213580482999532 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1787.0, "completions/max_terminated_length": 1787.0, "completions/mean_length": 1690.6875, "completions/mean_terminated_length": 1690.6875, "completions/min_length": 1552.0, "completions/min_terminated_length": 1552.0, "entropy": 0.31048087403178215, "epoch": 0.00266, "frac_reward_zero_std": 0.0, "grad_norm": 0.9148619771003723, "kl": 0.5922599658370018, "learning_rate": 9.999982575986095e-06, "loss": -0.1689, "num_tokens": 4978651.0, "reward": -7.106402397155762, "reward_std": 7.335752487182617, "rewards/rollout_reward_func/mean": -7.106402397155762, "rewards/rollout_reward_func/std": 10.287908554077148, "sampling/importance_sampling_ratio/max": 2.353391170501709, "sampling/importance_sampling_ratio/mean": 0.7480576038360596, "sampling/importance_sampling_ratio/min": 0.10871558636426926, "sampling/sampling_logp_difference/max": 1.7691650390625, "sampling/sampling_logp_difference/mean": 0.08270461857318878, "step": 133, "step_time": 37.87937480399705 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3103002533316612, "epoch": 0.00268, "grad_norm": 0.8720380067825317, "kl": 0.596416313201189, "learning_rate": 9.999982214876516e-06, "loss": -0.1711, "step": 134, "step_time": 5.838397514999087 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1810.0, "completions/max_terminated_length": 1810.0, "completions/mean_length": 1734.5, "completions/mean_terminated_length": 1734.5, "completions/min_length": 1659.0, "completions/min_terminated_length": 1659.0, "entropy": 0.296902384608984, "epoch": 0.0027, "frac_reward_zero_std": 0.0, "grad_norm": 1.5514373779296875, "kl": 0.32854650542140007, "learning_rate": 9.999981850063262e-06, "loss": -0.2692, "num_tokens": 5055484.0, "reward": 2.0809240341186523, "reward_std": 5.269416809082031, "rewards/rollout_reward_func/mean": 2.0809240341186523, "rewards/rollout_reward_func/std": 7.5051188468933105, "sampling/importance_sampling_ratio/max": 2.7047743797302246, "sampling/importance_sampling_ratio/mean": 1.0847183465957642, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.3921051025390625, "sampling/sampling_logp_difference/mean": 0.07780618220567703, "step": 135, "step_time": 37.11384174500017 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2912406101822853, "epoch": 0.00272, "grad_norm": 1.203852653503418, "kl": 0.34848837181925774, "learning_rate": 9.99998148154633e-06, "loss": -0.2722, "step": 136, "step_time": 6.586184759000389 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1788.0, "completions/max_terminated_length": 1788.0, "completions/mean_length": 1674.96875, "completions/mean_terminated_length": 1674.96875, "completions/min_length": 1567.0, "completions/min_terminated_length": 1567.0, "entropy": 0.2682835068553686, "epoch": 0.00274, "frac_reward_zero_std": 0.0, "grad_norm": 1.322084903717041, "kl": 0.4422433190047741, "learning_rate": 9.999981109325725e-06, "loss": 0.0099, "num_tokens": 5130363.0, "reward": -0.8909265995025635, "reward_std": 6.19950008392334, "rewards/rollout_reward_func/mean": -0.8909265995025635, "rewards/rollout_reward_func/std": 11.643590927124023, "sampling/importance_sampling_ratio/max": 2.353513717651367, "sampling/importance_sampling_ratio/mean": 1.027718186378479, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9235548973083496, "sampling/sampling_logp_difference/mean": 0.05995417386293411, "step": 137, "step_time": 37.823591190001025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2614995054900646, "epoch": 0.00276, "grad_norm": 1.342252254486084, "kl": 0.4471647199243307, "learning_rate": 9.999980733401442e-06, "loss": 0.0087, "step": 138, "step_time": 5.804548355996303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1774.0, "completions/max_terminated_length": 1774.0, "completions/mean_length": 1635.40625, "completions/mean_terminated_length": 1635.40625, "completions/min_length": 637.0, "completions/min_terminated_length": 637.0, "entropy": 0.28644070588052273, "epoch": 0.00278, "frac_reward_zero_std": 0.0, "grad_norm": 1.5117998123168945, "kl": 0.37019682209938765, "learning_rate": 9.999980353773486e-06, "loss": -0.0649, "num_tokens": 5204087.0, "reward": -2.6386919021606445, "reward_std": 7.861667156219482, "rewards/rollout_reward_func/mean": -2.6386919021606445, "rewards/rollout_reward_func/std": 9.34830093383789, "sampling/importance_sampling_ratio/max": 2.392774820327759, "sampling/importance_sampling_ratio/mean": 0.8888267278671265, "sampling/importance_sampling_ratio/min": 0.06766009330749512, "sampling/sampling_logp_difference/max": 0.964139461517334, "sampling/sampling_logp_difference/mean": 0.07269679009914398, "step": 139, "step_time": 36.273601793001944 }, { "clip_ratio/high_max": 0.02734375, "clip_ratio/high_mean": 0.013671875, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2876611240208149, "epoch": 0.0028, "grad_norm": 1.0806161165237427, "kl": 0.3748003738000989, "learning_rate": 9.999979970441856e-06, "loss": -0.066, "step": 140, "step_time": 5.793748694000897 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1778.0, "completions/max_terminated_length": 1778.0, "completions/mean_length": 1630.625, "completions/mean_terminated_length": 1630.625, "completions/min_length": 1075.0, "completions/min_terminated_length": 1075.0, "entropy": 0.2753357030451298, "epoch": 0.00282, "frac_reward_zero_std": 0.0, "grad_norm": 1.2243664264678955, "kl": 0.3961847685277462, "learning_rate": 9.999979583406551e-06, "loss": -0.1401, "num_tokens": 5277342.0, "reward": -0.9446412324905396, "reward_std": 6.179128646850586, "rewards/rollout_reward_func/mean": -0.9446412324905396, "rewards/rollout_reward_func/std": 7.662261009216309, "sampling/importance_sampling_ratio/max": 2.07578706741333, "sampling/importance_sampling_ratio/mean": 0.9215522408485413, "sampling/importance_sampling_ratio/min": 0.17828358709812164, "sampling/sampling_logp_difference/max": 1.095733880996704, "sampling/sampling_logp_difference/mean": 0.06696178764104843, "step": 141, "step_time": 37.386809142004495 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.2797367610037327, "epoch": 0.00284, "grad_norm": 1.1719934940338135, "kl": 0.36871890537440777, "learning_rate": 9.999979192667574e-06, "loss": -0.1444, "step": 142, "step_time": 6.457391539001037 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1805.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 1678.625, "completions/mean_terminated_length": 1678.625, "completions/min_length": 1457.0, "completions/min_terminated_length": 1457.0, "entropy": 0.2741607278585434, "epoch": 0.00286, "frac_reward_zero_std": 0.0, "grad_norm": 0.8713260293006897, "kl": 0.43569475039839745, "learning_rate": 9.999978798224922e-06, "loss": -0.138, "num_tokens": 5352366.0, "reward": 0.8101233839988708, "reward_std": 3.6327834129333496, "rewards/rollout_reward_func/mean": 0.8101233839988708, "rewards/rollout_reward_func/std": 7.983520030975342, "sampling/importance_sampling_ratio/max": 2.63127064704895, "sampling/importance_sampling_ratio/mean": 0.9456866979598999, "sampling/importance_sampling_ratio/min": 0.17168530821800232, "sampling/sampling_logp_difference/max": 1.6172382831573486, "sampling/sampling_logp_difference/mean": 0.05936865881085396, "step": 143, "step_time": 38.01681525400272 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 0.28014545887708664, "epoch": 0.00288, "grad_norm": 0.8923928737640381, "kl": 0.40905678272247314, "learning_rate": 9.999978400078598e-06, "loss": -0.1408, "step": 144, "step_time": 5.900416173997655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1773.0, "completions/max_terminated_length": 1773.0, "completions/mean_length": 1679.46875, "completions/mean_terminated_length": 1679.46875, "completions/min_length": 1426.0, "completions/min_terminated_length": 1426.0, "entropy": 0.28977033123373985, "epoch": 0.0029, "frac_reward_zero_std": 0.0, "grad_norm": 1.2021753787994385, "kl": 0.9492019787430763, "learning_rate": 9.9999779982286e-06, "loss": -0.1518, "num_tokens": 5427370.0, "reward": -1.4256936311721802, "reward_std": 5.131044387817383, "rewards/rollout_reward_func/mean": -1.4256936311721802, "rewards/rollout_reward_func/std": 6.864547252655029, "sampling/importance_sampling_ratio/max": 2.061087131500244, "sampling/importance_sampling_ratio/mean": 0.7186421155929565, "sampling/importance_sampling_ratio/min": 0.03768601268529892, "sampling/sampling_logp_difference/max": 1.7941226959228516, "sampling/sampling_logp_difference/mean": 0.0829053670167923, "step": 145, "step_time": 35.9784139159965 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.2919359765946865, "epoch": 0.00292, "grad_norm": 0.8570596575737, "kl": 0.9193199034780264, "learning_rate": 9.999977592674933e-06, "loss": -0.1533, "step": 146, "step_time": 5.8281254610010365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1779.0, "completions/max_terminated_length": 1779.0, "completions/mean_length": 1666.84375, "completions/mean_terminated_length": 1666.84375, "completions/min_length": 1414.0, "completions/min_terminated_length": 1414.0, "entropy": 0.28393640369176865, "epoch": 0.00294, "frac_reward_zero_std": 0.0, "grad_norm": 0.9220053553581238, "kl": 0.4149657338857651, "learning_rate": 9.999977183417593e-06, "loss": -0.1243, "num_tokens": 5502317.0, "reward": -4.950237274169922, "reward_std": 7.430306434631348, "rewards/rollout_reward_func/mean": -4.950237274169922, "rewards/rollout_reward_func/std": 8.644596099853516, "sampling/importance_sampling_ratio/max": 1.6269994974136353, "sampling/importance_sampling_ratio/mean": 0.8315407633781433, "sampling/importance_sampling_ratio/min": 0.07847892493009567, "sampling/sampling_logp_difference/max": 1.8345155715942383, "sampling/sampling_logp_difference/mean": 0.07923141121864319, "step": 147, "step_time": 37.406879453998044 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.28766966238617897, "epoch": 0.00296, "grad_norm": 0.9152698516845703, "kl": 0.3896927610039711, "learning_rate": 9.999976770456581e-06, "loss": -0.126, "step": 148, "step_time": 6.265031434000775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1810.0, "completions/max_terminated_length": 1810.0, "completions/mean_length": 1691.28125, "completions/mean_terminated_length": 1691.28125, "completions/min_length": 1595.0, "completions/min_terminated_length": 1595.0, "entropy": 0.251856479793787, "epoch": 0.00298, "frac_reward_zero_std": 0.0, "grad_norm": 0.9210507869720459, "kl": 0.4119174964725971, "learning_rate": 9.999976353791898e-06, "loss": -0.1814, "num_tokens": 5577778.0, "reward": -1.1243976354599, "reward_std": 5.285574436187744, "rewards/rollout_reward_func/mean": -1.1243976354599, "rewards/rollout_reward_func/std": 7.979835510253906, "sampling/importance_sampling_ratio/max": 2.8175506591796875, "sampling/importance_sampling_ratio/mean": 1.1608731746673584, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2149560451507568, "sampling/sampling_logp_difference/mean": 0.06952120363712311, "step": 149, "step_time": 37.22757341200122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.25096623599529266, "epoch": 0.003, "grad_norm": 0.9395397305488586, "kl": 0.4342615343630314, "learning_rate": 9.999975933423546e-06, "loss": -0.184, "step": 150, "step_time": 5.87532146200283 } ], "logging_steps": 1.0, "max_steps": 100000, "num_input_tokens_seen": 5577778, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }