| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.003, |
| "eval_steps": 500, |
| "global_step": 150, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1775.0, |
| "completions/max_terminated_length": 1775.0, |
| "completions/mean_length": 1624.96875, |
| "completions/mean_terminated_length": 1624.96875, |
| "completions/min_length": 1388.0, |
| "completions/min_terminated_length": 1388.0, |
| "entropy": 0.5600852482020855, |
| "epoch": 2e-05, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3755558729171753, |
| "kl": 0.0, |
| "learning_rate": 0.0, |
| "loss": -0.0592, |
| "num_tokens": 73247.0, |
| "reward": -12.172411918640137, |
| "reward_std": 7.601527214050293, |
| "rewards/rollout_reward_func/mean": -12.172411918640137, |
| "rewards/rollout_reward_func/std": 10.38169002532959, |
| "sampling/importance_sampling_ratio/max": 1.408553123474121, |
| "sampling/importance_sampling_ratio/mean": 0.9712058901786804, |
| "sampling/importance_sampling_ratio/min": 0.6454448103904724, |
| "sampling/sampling_logp_difference/max": 0.22739958763122559, |
| "sampling/sampling_logp_difference/mean": 0.016150973737239838, |
| "step": 1, |
| "step_time": 36.755565460999605 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5600852482020855, |
| "epoch": 4e-05, |
| "grad_norm": 1.3615893125534058, |
| "kl": 0.0, |
| "learning_rate": 2.8571428571428575e-07, |
| "loss": -0.0592, |
| "step": 2, |
| "step_time": 5.746241367000948 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1753.0, |
| "completions/max_terminated_length": 1753.0, |
| "completions/mean_length": 1628.96875, |
| "completions/mean_terminated_length": 1628.96875, |
| "completions/min_length": 1271.0, |
| "completions/min_terminated_length": 1271.0, |
| "entropy": 0.5380602143704891, |
| "epoch": 6e-05, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.4941445589065552, |
| "kl": 0.0005305010126903653, |
| "learning_rate": 5.714285714285715e-07, |
| "loss": 0.006, |
| "num_tokens": 146725.0, |
| "reward": -8.265422821044922, |
| "reward_std": 8.979022026062012, |
| "rewards/rollout_reward_func/mean": -8.265422821044922, |
| "rewards/rollout_reward_func/std": 13.061026573181152, |
| "sampling/importance_sampling_ratio/max": 1.2190126180648804, |
| "sampling/importance_sampling_ratio/mean": 0.9876266121864319, |
| "sampling/importance_sampling_ratio/min": 0.5881595015525818, |
| "sampling/sampling_logp_difference/max": 0.45802879333496094, |
| "sampling/sampling_logp_difference/mean": 0.014619816094636917, |
| "step": 3, |
| "step_time": 36.527911828999095 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5377197824418545, |
| "epoch": 8e-05, |
| "grad_norm": 1.4071228504180908, |
| "kl": 0.0005172143501113169, |
| "learning_rate": 8.571428571428572e-07, |
| "loss": 0.0058, |
| "step": 4, |
| "step_time": 5.69982043300206 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1754.0, |
| "completions/max_terminated_length": 1754.0, |
| "completions/mean_length": 1595.15625, |
| "completions/mean_terminated_length": 1595.15625, |
| "completions/min_length": 1299.0, |
| "completions/min_terminated_length": 1299.0, |
| "entropy": 0.539891816675663, |
| "epoch": 0.0001, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.4629162549972534, |
| "kl": 0.0005519717960851267, |
| "learning_rate": 1.142857142857143e-06, |
| "loss": 0.0243, |
| "num_tokens": 219002.0, |
| "reward": -14.256836891174316, |
| "reward_std": 9.0944185256958, |
| "rewards/rollout_reward_func/mean": -14.256836891174316, |
| "rewards/rollout_reward_func/std": 12.482532501220703, |
| "sampling/importance_sampling_ratio/max": 1.6900306940078735, |
| "sampling/importance_sampling_ratio/mean": 1.0195035934448242, |
| "sampling/importance_sampling_ratio/min": 0.8020860552787781, |
| "sampling/sampling_logp_difference/max": 0.25893688201904297, |
| "sampling/sampling_logp_difference/mean": 0.016118371859192848, |
| "step": 5, |
| "step_time": 38.65034878200095 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5395300313830376, |
| "epoch": 0.00012, |
| "grad_norm": 1.5035072565078735, |
| "kl": 0.0006097570294514298, |
| "learning_rate": 1.4285714285714286e-06, |
| "loss": 0.0242, |
| "step": 6, |
| "step_time": 5.672908046001794 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0031250000465661287, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0031250000465661287, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1782.0, |
| "completions/max_terminated_length": 1782.0, |
| "completions/mean_length": 1521.90625, |
| "completions/mean_terminated_length": 1521.90625, |
| "completions/min_length": 723.0, |
| "completions/min_terminated_length": 723.0, |
| "entropy": 0.5277910158038139, |
| "epoch": 0.00014, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2564315795898438, |
| "kl": 0.0006845891803095583, |
| "learning_rate": 1.7142857142857145e-06, |
| "loss": 0.0223, |
| "num_tokens": 288633.0, |
| "reward": -16.092445373535156, |
| "reward_std": 8.75448989868164, |
| "rewards/rollout_reward_func/mean": -16.092445373535156, |
| "rewards/rollout_reward_func/std": 15.618288040161133, |
| "sampling/importance_sampling_ratio/max": 1.5459660291671753, |
| "sampling/importance_sampling_ratio/mean": 1.024022102355957, |
| "sampling/importance_sampling_ratio/min": 0.7249171733856201, |
| "sampling/sampling_logp_difference/max": 0.29637718200683594, |
| "sampling/sampling_logp_difference/mean": 0.018320683389902115, |
| "step": 7, |
| "step_time": 32.748348556002384 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "entropy": 0.5283261835575104, |
| "epoch": 0.00016, |
| "grad_norm": 1.2439873218536377, |
| "kl": 0.0006405085659935139, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 0.0223, |
| "step": 8, |
| "step_time": 5.7717726080009015 |
| }, |
| { |
| "clip_ratio/high_max": 0.0078125, |
| "clip_ratio/high_mean": 0.00390625, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00390625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1818.0, |
| "completions/max_terminated_length": 1818.0, |
| "completions/mean_length": 1608.375, |
| "completions/mean_terminated_length": 1608.375, |
| "completions/min_length": 305.0, |
| "completions/min_terminated_length": 305.0, |
| "entropy": 0.5326173529028893, |
| "epoch": 0.00018, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9701208472251892, |
| "kl": 0.0007356673304457217, |
| "learning_rate": 2.285714285714286e-06, |
| "loss": 0.012, |
| "num_tokens": 361374.0, |
| "reward": -6.8430585861206055, |
| "reward_std": 12.837440490722656, |
| "rewards/rollout_reward_func/mean": -6.8430585861206055, |
| "rewards/rollout_reward_func/std": 17.0405216217041, |
| "sampling/importance_sampling_ratio/max": 1.2777214050292969, |
| "sampling/importance_sampling_ratio/mean": 0.9900251626968384, |
| "sampling/importance_sampling_ratio/min": 0.6748403310775757, |
| "sampling/sampling_logp_difference/max": 0.3269679546356201, |
| "sampling/sampling_logp_difference/mean": 0.0145448949187994, |
| "step": 9, |
| "step_time": 34.28418227700058 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5334620848298073, |
| "epoch": 0.0002, |
| "grad_norm": 1.0108616352081299, |
| "kl": 0.0005742738721892238, |
| "learning_rate": 2.571428571428571e-06, |
| "loss": 0.012, |
| "step": 10, |
| "step_time": 6.918311860000358 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1797.0, |
| "completions/max_terminated_length": 1797.0, |
| "completions/mean_length": 1687.25, |
| "completions/mean_terminated_length": 1687.25, |
| "completions/min_length": 1423.0, |
| "completions/min_terminated_length": 1423.0, |
| "entropy": 0.5696082189679146, |
| "epoch": 0.00022, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1935234069824219, |
| "kl": 0.0007269367633853108, |
| "learning_rate": 2.8571428571428573e-06, |
| "loss": -0.0013, |
| "num_tokens": 436067.0, |
| "reward": -9.953402519226074, |
| "reward_std": 9.885331153869629, |
| "rewards/rollout_reward_func/mean": -9.953402519226074, |
| "rewards/rollout_reward_func/std": 11.941234588623047, |
| "sampling/importance_sampling_ratio/max": 1.3005088567733765, |
| "sampling/importance_sampling_ratio/mean": 0.9863357543945312, |
| "sampling/importance_sampling_ratio/min": 0.7671698927879333, |
| "sampling/sampling_logp_difference/max": 0.1938610076904297, |
| "sampling/sampling_logp_difference/mean": 0.016408588737249374, |
| "step": 11, |
| "step_time": 36.470574425999075 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "entropy": 0.5690113827586174, |
| "epoch": 0.00024, |
| "grad_norm": 1.1512264013290405, |
| "kl": 0.0009323725680587813, |
| "learning_rate": 3.142857142857143e-06, |
| "loss": -0.0002, |
| "step": 12, |
| "step_time": 5.875566556000194 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1832.0, |
| "completions/max_terminated_length": 1832.0, |
| "completions/mean_length": 1571.625, |
| "completions/mean_terminated_length": 1571.625, |
| "completions/min_length": 262.0, |
| "completions/min_terminated_length": 262.0, |
| "entropy": 0.5204437598586082, |
| "epoch": 0.00026, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1478338241577148, |
| "kl": 0.0008145252213580534, |
| "learning_rate": 3.428571428571429e-06, |
| "loss": 0.0344, |
| "num_tokens": 507640.0, |
| "reward": -5.592676162719727, |
| "reward_std": 11.350366592407227, |
| "rewards/rollout_reward_func/mean": -5.592676162719727, |
| "rewards/rollout_reward_func/std": 16.42201805114746, |
| "sampling/importance_sampling_ratio/max": 1.401992917060852, |
| "sampling/importance_sampling_ratio/mean": 1.043225884437561, |
| "sampling/importance_sampling_ratio/min": 0.7300771474838257, |
| "sampling/sampling_logp_difference/max": 0.24753212928771973, |
| "sampling/sampling_logp_difference/mean": 0.016798537224531174, |
| "step": 13, |
| "step_time": 34.95688835600049 |
| }, |
| { |
| "clip_ratio/high_max": 0.0078125, |
| "clip_ratio/high_mean": 0.00390625, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00390625, |
| "entropy": 0.5211725942790508, |
| "epoch": 0.00028, |
| "grad_norm": 1.1339523792266846, |
| "kl": 0.001084248440747615, |
| "learning_rate": 3.7142857142857146e-06, |
| "loss": 0.0358, |
| "step": 14, |
| "step_time": 5.896424438000395 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1777.0, |
| "completions/max_terminated_length": 1777.0, |
| "completions/mean_length": 1681.0625, |
| "completions/mean_terminated_length": 1681.0625, |
| "completions/min_length": 1441.0, |
| "completions/min_terminated_length": 1441.0, |
| "entropy": 0.5041001103818417, |
| "epoch": 0.0003, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.4473488330841064, |
| "kl": 0.0014922630070941523, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": -0.0171, |
| "num_tokens": 582735.0, |
| "reward": -6.595297336578369, |
| "reward_std": 7.1833062171936035, |
| "rewards/rollout_reward_func/mean": -6.595297336578369, |
| "rewards/rollout_reward_func/std": 9.555194854736328, |
| "sampling/importance_sampling_ratio/max": 1.368825078010559, |
| "sampling/importance_sampling_ratio/mean": 0.9549809098243713, |
| "sampling/importance_sampling_ratio/min": 0.7357600331306458, |
| "sampling/sampling_logp_difference/max": 0.22634148597717285, |
| "sampling/sampling_logp_difference/mean": 0.015938639640808105, |
| "step": 15, |
| "step_time": 36.23906176099899 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "entropy": 0.5047293417155743, |
| "epoch": 0.00032, |
| "grad_norm": 1.4626593589782715, |
| "kl": 0.001966523894225247, |
| "learning_rate": 4.2857142857142855e-06, |
| "loss": -0.0206, |
| "step": 16, |
| "step_time": 7.021587903999716 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1747.0, |
| "completions/max_terminated_length": 1747.0, |
| "completions/mean_length": 1580.9375, |
| "completions/mean_terminated_length": 1580.9375, |
| "completions/min_length": 903.0, |
| "completions/min_terminated_length": 903.0, |
| "entropy": 0.5249488092958927, |
| "epoch": 0.00034, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.4577637910842896, |
| "kl": 0.003107522992650047, |
| "learning_rate": 4.571428571428572e-06, |
| "loss": -0.062, |
| "num_tokens": 655209.0, |
| "reward": -11.51949691772461, |
| "reward_std": 5.783572673797607, |
| "rewards/rollout_reward_func/mean": -11.51949691772461, |
| "rewards/rollout_reward_func/std": 6.822617530822754, |
| "sampling/importance_sampling_ratio/max": 1.3054317235946655, |
| "sampling/importance_sampling_ratio/mean": 0.9661756753921509, |
| "sampling/importance_sampling_ratio/min": 0.7799487709999084, |
| "sampling/sampling_logp_difference/max": 0.15545654296875, |
| "sampling/sampling_logp_difference/mean": 0.014764709398150444, |
| "step": 17, |
| "step_time": 35.02218110599915 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5238873660564423, |
| "epoch": 0.00036, |
| "grad_norm": 1.4641653299331665, |
| "kl": 0.0047977561771404, |
| "learning_rate": 4.857142857142858e-06, |
| "loss": -0.0645, |
| "step": 18, |
| "step_time": 5.746608606000336 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1798.0, |
| "completions/max_terminated_length": 1798.0, |
| "completions/mean_length": 1634.25, |
| "completions/mean_terminated_length": 1634.25, |
| "completions/min_length": 1522.0, |
| "completions/min_terminated_length": 1522.0, |
| "entropy": 0.5075966455042362, |
| "epoch": 0.00038, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.6138638257980347, |
| "kl": 0.007999939436558634, |
| "learning_rate": 5.142857142857142e-06, |
| "loss": -0.0053, |
| "num_tokens": 728702.0, |
| "reward": -7.502280235290527, |
| "reward_std": 9.169681549072266, |
| "rewards/rollout_reward_func/mean": -7.502280235290527, |
| "rewards/rollout_reward_func/std": 9.848286628723145, |
| "sampling/importance_sampling_ratio/max": 1.388090968132019, |
| "sampling/importance_sampling_ratio/mean": 1.0349256992340088, |
| "sampling/importance_sampling_ratio/min": 0.6338706612586975, |
| "sampling/sampling_logp_difference/max": 0.2868894338607788, |
| "sampling/sampling_logp_difference/mean": 0.021757658571004868, |
| "step": 19, |
| "step_time": 37.415181820002545 |
| }, |
| { |
| "clip_ratio/high_max": 0.01171875, |
| "clip_ratio/high_mean": 0.005859375, |
| "clip_ratio/low_mean": 0.005800189450383186, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.011659564450383186, |
| "entropy": 0.5063299536705017, |
| "epoch": 0.0004, |
| "grad_norm": 1.244437336921692, |
| "kl": 0.01308579370379448, |
| "learning_rate": 5.428571428571429e-06, |
| "loss": -0.0075, |
| "step": 20, |
| "step_time": 5.793050424000285 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1807.0, |
| "completions/max_terminated_length": 1807.0, |
| "completions/mean_length": 1655.15625, |
| "completions/mean_terminated_length": 1655.15625, |
| "completions/min_length": 1502.0, |
| "completions/min_terminated_length": 1502.0, |
| "entropy": 0.5319979190826416, |
| "epoch": 0.00042, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1382571458816528, |
| "kl": 0.014530768617987633, |
| "learning_rate": 5.7142857142857145e-06, |
| "loss": 0.0206, |
| "num_tokens": 803086.0, |
| "reward": 0.2018265724182129, |
| "reward_std": 8.02773666381836, |
| "rewards/rollout_reward_func/mean": 0.2018265724182129, |
| "rewards/rollout_reward_func/std": 10.535411834716797, |
| "sampling/importance_sampling_ratio/max": 1.9407941102981567, |
| "sampling/importance_sampling_ratio/mean": 1.0456597805023193, |
| "sampling/importance_sampling_ratio/min": 0.5120582580566406, |
| "sampling/sampling_logp_difference/max": 0.3853168487548828, |
| "sampling/sampling_logp_difference/mean": 0.033494722098112106, |
| "step": 21, |
| "step_time": 38.7354500320007 |
| }, |
| { |
| "clip_ratio/high_max": 0.01171875, |
| "clip_ratio/high_mean": 0.007753314450383186, |
| "clip_ratio/low_mean": 0.0014880952658131719, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.009241409716196358, |
| "entropy": 0.5304564274847507, |
| "epoch": 0.00044, |
| "grad_norm": 1.0724774599075317, |
| "kl": 0.02271496201865375, |
| "learning_rate": 6e-06, |
| "loss": 0.0207, |
| "step": 22, |
| "step_time": 5.822538338000413 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1794.0, |
| "completions/max_terminated_length": 1794.0, |
| "completions/mean_length": 1641.09375, |
| "completions/mean_terminated_length": 1641.09375, |
| "completions/min_length": 1125.0, |
| "completions/min_terminated_length": 1125.0, |
| "entropy": 0.5019906461238861, |
| "epoch": 0.00046, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1990811824798584, |
| "kl": 0.03286108747124672, |
| "learning_rate": 6.285714285714286e-06, |
| "loss": -0.042, |
| "num_tokens": 877020.0, |
| "reward": -8.106513977050781, |
| "reward_std": 8.252906799316406, |
| "rewards/rollout_reward_func/mean": -8.106513977050781, |
| "rewards/rollout_reward_func/std": 9.194578170776367, |
| "sampling/importance_sampling_ratio/max": 1.5264556407928467, |
| "sampling/importance_sampling_ratio/mean": 0.9783341884613037, |
| "sampling/importance_sampling_ratio/min": 0.4424620270729065, |
| "sampling/sampling_logp_difference/max": 0.4774820804595947, |
| "sampling/sampling_logp_difference/mean": 0.046494003385305405, |
| "step": 23, |
| "step_time": 36.808606273000805 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00390625, |
| "entropy": 0.496415089815855, |
| "epoch": 0.00048, |
| "grad_norm": 1.1097930669784546, |
| "kl": 0.04643937526270747, |
| "learning_rate": 6.571428571428572e-06, |
| "loss": -0.0443, |
| "step": 24, |
| "step_time": 5.7939676890000555 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1778.0, |
| "completions/max_terminated_length": 1778.0, |
| "completions/mean_length": 1637.375, |
| "completions/mean_terminated_length": 1637.375, |
| "completions/min_length": 1411.0, |
| "completions/min_terminated_length": 1411.0, |
| "entropy": 0.49401185661554337, |
| "epoch": 0.0005, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.5811240673065186, |
| "kl": 0.08443848416209221, |
| "learning_rate": 6.857142857142858e-06, |
| "loss": -0.2108, |
| "num_tokens": 950862.0, |
| "reward": -4.675426006317139, |
| "reward_std": 7.909944534301758, |
| "rewards/rollout_reward_func/mean": -4.675426006317139, |
| "rewards/rollout_reward_func/std": 9.238933563232422, |
| "sampling/importance_sampling_ratio/max": 2.070371150970459, |
| "sampling/importance_sampling_ratio/mean": 0.9819083213806152, |
| "sampling/importance_sampling_ratio/min": 0.25233596563339233, |
| "sampling/sampling_logp_difference/max": 0.7732794284820557, |
| "sampling/sampling_logp_difference/mean": 0.06470471620559692, |
| "step": 25, |
| "step_time": 38.14812202200119 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.009548611124046147, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.009548611124046147, |
| "entropy": 0.4837190806865692, |
| "epoch": 0.00052, |
| "grad_norm": 1.3750770092010498, |
| "kl": 0.11863584071397781, |
| "learning_rate": 7.1428571428571436e-06, |
| "loss": -0.2169, |
| "step": 26, |
| "step_time": 5.773092350001207 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1693.0, |
| "completions/max_terminated_length": 1693.0, |
| "completions/mean_length": 1613.34375, |
| "completions/mean_terminated_length": 1613.34375, |
| "completions/min_length": 1434.0, |
| "completions/min_terminated_length": 1434.0, |
| "entropy": 0.45354875922203064, |
| "epoch": 0.00054, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.5507698059082031, |
| "kl": 0.1627941089682281, |
| "learning_rate": 7.428571428571429e-06, |
| "loss": -0.1993, |
| "num_tokens": 1023781.0, |
| "reward": -7.355703353881836, |
| "reward_std": 10.72867202758789, |
| "rewards/rollout_reward_func/mean": -7.355703353881836, |
| "rewards/rollout_reward_func/std": 12.46450138092041, |
| "sampling/importance_sampling_ratio/max": 2.511967420578003, |
| "sampling/importance_sampling_ratio/mean": 1.0184850692749023, |
| "sampling/importance_sampling_ratio/min": 0.14674033224582672, |
| "sampling/sampling_logp_difference/max": 1.2143032550811768, |
| "sampling/sampling_logp_difference/mean": 0.07817384600639343, |
| "step": 27, |
| "step_time": 38.66307055499874 |
| }, |
| { |
| "clip_ratio/high_max": 0.0078125, |
| "clip_ratio/high_mean": 0.00390625, |
| "clip_ratio/low_mean": 0.025390625, |
| "clip_ratio/low_min": 0.00390625, |
| "clip_ratio/region_mean": 0.029296875, |
| "entropy": 0.437204722315073, |
| "epoch": 0.00056, |
| "grad_norm": 1.2980964183807373, |
| "kl": 0.25656731706112623, |
| "learning_rate": 7.714285714285716e-06, |
| "loss": -0.2037, |
| "step": 28, |
| "step_time": 5.5929295870000715 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1793.0, |
| "completions/max_terminated_length": 1793.0, |
| "completions/mean_length": 1653.5, |
| "completions/mean_terminated_length": 1653.5, |
| "completions/min_length": 1433.0, |
| "completions/min_terminated_length": 1433.0, |
| "entropy": 0.40794313699007034, |
| "epoch": 0.00058, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.0819990634918213, |
| "kl": 0.5643582288175821, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": -0.2202, |
| "num_tokens": 1098013.0, |
| "reward": -6.507538318634033, |
| "reward_std": 11.262900352478027, |
| "rewards/rollout_reward_func/mean": -6.507538318634033, |
| "rewards/rollout_reward_func/std": 15.167143821716309, |
| "sampling/importance_sampling_ratio/max": 2.3346853256225586, |
| "sampling/importance_sampling_ratio/mean": 0.7045407295227051, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 1.8893389701843262, |
| "sampling/sampling_logp_difference/mean": 0.11059033870697021, |
| "step": 29, |
| "step_time": 36.8627199120001 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.042909564450383186, |
| "clip_ratio/low_min": 0.019412878900766373, |
| "clip_ratio/region_mean": 0.042909564450383186, |
| "entropy": 0.3947901092469692, |
| "epoch": 0.0006, |
| "grad_norm": 1.311099648475647, |
| "kl": 0.829475361853838, |
| "learning_rate": 8.285714285714287e-06, |
| "loss": -0.2219, |
| "step": 30, |
| "step_time": 5.809630234999531 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1791.0, |
| "completions/max_terminated_length": 1791.0, |
| "completions/mean_length": 1661.65625, |
| "completions/mean_terminated_length": 1661.65625, |
| "completions/min_length": 1507.0, |
| "completions/min_terminated_length": 1507.0, |
| "entropy": 0.3842233642935753, |
| "epoch": 0.00062, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9492745995521545, |
| "kl": 0.836035018786788, |
| "learning_rate": 8.571428571428571e-06, |
| "loss": -0.1847, |
| "num_tokens": 1172333.0, |
| "reward": -7.43505859375, |
| "reward_std": 10.108196258544922, |
| "rewards/rollout_reward_func/mean": -7.43505859375, |
| "rewards/rollout_reward_func/std": 12.447552680969238, |
| "sampling/importance_sampling_ratio/max": 1.9039174318313599, |
| "sampling/importance_sampling_ratio/mean": 0.7630480527877808, |
| "sampling/importance_sampling_ratio/min": 0.03481662645936012, |
| "sampling/sampling_logp_difference/max": 2.102973699569702, |
| "sampling/sampling_logp_difference/mean": 0.10207939893007278, |
| "step": 31, |
| "step_time": 37.01448380799866 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.015625, |
| "clip_ratio/low_min": 0.0078125, |
| "clip_ratio/region_mean": 0.017578125, |
| "entropy": 0.3787720203399658, |
| "epoch": 0.00064, |
| "grad_norm": 0.9088082313537598, |
| "kl": 0.954752204939723, |
| "learning_rate": 8.857142857142858e-06, |
| "loss": -0.1848, |
| "step": 32, |
| "step_time": 6.941179774000375 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1777.0, |
| "completions/max_terminated_length": 1777.0, |
| "completions/mean_length": 1584.28125, |
| "completions/mean_terminated_length": 1584.28125, |
| "completions/min_length": 274.0, |
| "completions/min_terminated_length": 274.0, |
| "entropy": 0.34466781467199326, |
| "epoch": 0.00066, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3309731483459473, |
| "kl": 0.8169562551192939, |
| "learning_rate": 9.142857142857144e-06, |
| "loss": -0.0371, |
| "num_tokens": 1244354.0, |
| "reward": -5.68795108795166, |
| "reward_std": 6.089047431945801, |
| "rewards/rollout_reward_func/mean": -5.68795108795166, |
| "rewards/rollout_reward_func/std": 7.458269119262695, |
| "sampling/importance_sampling_ratio/max": 1.7896546125411987, |
| "sampling/importance_sampling_ratio/mean": 0.8577574491500854, |
| "sampling/importance_sampling_ratio/min": 0.03787967935204506, |
| "sampling/sampling_logp_difference/max": 2.316878080368042, |
| "sampling/sampling_logp_difference/mean": 0.08233191072940826, |
| "step": 33, |
| "step_time": 35.641360890001124 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.009765625, |
| "clip_ratio/low_min": 0.00390625, |
| "clip_ratio/region_mean": 0.009765625, |
| "entropy": 0.3404731787741184, |
| "epoch": 0.00068, |
| "grad_norm": 1.2554371356964111, |
| "kl": 0.8635595235973597, |
| "learning_rate": 9.42857142857143e-06, |
| "loss": -0.0392, |
| "step": 34, |
| "step_time": 5.800049977002345 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1793.0, |
| "completions/max_terminated_length": 1793.0, |
| "completions/mean_length": 1645.40625, |
| "completions/mean_terminated_length": 1645.40625, |
| "completions/min_length": 1436.0, |
| "completions/min_terminated_length": 1436.0, |
| "entropy": 0.34361691400408745, |
| "epoch": 0.0007, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8878010511398315, |
| "kl": 0.6783301420509815, |
| "learning_rate": 9.714285714285715e-06, |
| "loss": -0.0546, |
| "num_tokens": 1318828.0, |
| "reward": -6.509824752807617, |
| "reward_std": 5.78992223739624, |
| "rewards/rollout_reward_func/mean": -6.509824752807617, |
| "rewards/rollout_reward_func/std": 7.799504280090332, |
| "sampling/importance_sampling_ratio/max": 2.6033241748809814, |
| "sampling/importance_sampling_ratio/mean": 0.7217234373092651, |
| "sampling/importance_sampling_ratio/min": 0.03342561423778534, |
| "sampling/sampling_logp_difference/max": 1.9652609825134277, |
| "sampling/sampling_logp_difference/mean": 0.10379400849342346, |
| "step": 35, |
| "step_time": 38.887631579999834 |
| }, |
| { |
| "clip_ratio/high_max": 0.01953125, |
| "clip_ratio/high_mean": 0.009765625, |
| "clip_ratio/low_mean": 0.00390625, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.013671875, |
| "entropy": 0.34295540675520897, |
| "epoch": 0.00072, |
| "grad_norm": 0.8096361756324768, |
| "kl": 0.608342956751585, |
| "learning_rate": 1e-05, |
| "loss": -0.0557, |
| "step": 36, |
| "step_time": 5.798652657001185 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1830.0, |
| "completions/max_terminated_length": 1830.0, |
| "completions/mean_length": 1669.1875, |
| "completions/mean_terminated_length": 1669.1875, |
| "completions/min_length": 1051.0, |
| "completions/min_terminated_length": 1051.0, |
| "entropy": 0.40553563460707664, |
| "epoch": 0.00074, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1997655630111694, |
| "kl": 0.6256659794598818, |
| "learning_rate": 9.999999998148153e-06, |
| "loss": -0.1523, |
| "num_tokens": 1393543.0, |
| "reward": -6.479705333709717, |
| "reward_std": 5.127873420715332, |
| "rewards/rollout_reward_func/mean": -6.479705333709717, |
| "rewards/rollout_reward_func/std": 5.757119655609131, |
| "sampling/importance_sampling_ratio/max": 2.8293187618255615, |
| "sampling/importance_sampling_ratio/mean": 0.8761348724365234, |
| "sampling/importance_sampling_ratio/min": 0.04236992821097374, |
| "sampling/sampling_logp_difference/max": 1.9487248659133911, |
| "sampling/sampling_logp_difference/mean": 0.10332974791526794, |
| "step": 37, |
| "step_time": 37.80114303599839 |
| }, |
| { |
| "clip_ratio/high_max": 0.022248641354963183, |
| "clip_ratio/high_mean": 0.011124320677481592, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.011124320677481592, |
| "entropy": 0.4083840139210224, |
| "epoch": 0.00076, |
| "grad_norm": 1.0707274675369263, |
| "kl": 0.46542409248650074, |
| "learning_rate": 9.999999992592613e-06, |
| "loss": -0.1538, |
| "step": 38, |
| "step_time": 6.32874613399963 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1798.0, |
| "completions/max_terminated_length": 1798.0, |
| "completions/mean_length": 1652.875, |
| "completions/mean_terminated_length": 1652.875, |
| "completions/min_length": 1438.0, |
| "completions/min_terminated_length": 1438.0, |
| "entropy": 0.3374646417796612, |
| "epoch": 0.00078, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.204039216041565, |
| "kl": 0.31642488297075033, |
| "learning_rate": 9.999999983333379e-06, |
| "loss": -0.2975, |
| "num_tokens": 1467743.0, |
| "reward": -4.7108917236328125, |
| "reward_std": 5.350179672241211, |
| "rewards/rollout_reward_func/mean": -4.7108917236328125, |
| "rewards/rollout_reward_func/std": 5.910353660583496, |
| "sampling/importance_sampling_ratio/max": 2.4041433334350586, |
| "sampling/importance_sampling_ratio/mean": 0.9955360889434814, |
| "sampling/importance_sampling_ratio/min": 0.0774984359741211, |
| "sampling/sampling_logp_difference/max": 1.3263565301895142, |
| "sampling/sampling_logp_difference/mean": 0.08658263087272644, |
| "step": 39, |
| "step_time": 38.75331890299822 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00390625, |
| "entropy": 0.3382052704691887, |
| "epoch": 0.0008, |
| "grad_norm": 1.1924562454223633, |
| "kl": 0.293441329151392, |
| "learning_rate": 9.999999970370451e-06, |
| "loss": -0.2999, |
| "step": 40, |
| "step_time": 5.813114761998804 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1829.0, |
| "completions/max_terminated_length": 1829.0, |
| "completions/mean_length": 1647.59375, |
| "completions/mean_terminated_length": 1647.59375, |
| "completions/min_length": 871.0, |
| "completions/min_terminated_length": 871.0, |
| "entropy": 0.3876206576824188, |
| "epoch": 0.00082, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.0214810371398926, |
| "kl": 0.42729487270116806, |
| "learning_rate": 9.99999995370383e-06, |
| "loss": 0.0543, |
| "num_tokens": 1542142.0, |
| "reward": -3.3205041885375977, |
| "reward_std": 4.228387832641602, |
| "rewards/rollout_reward_func/mean": -3.3205041885375977, |
| "rewards/rollout_reward_func/std": 8.351001739501953, |
| "sampling/importance_sampling_ratio/max": 2.9104766845703125, |
| "sampling/importance_sampling_ratio/mean": 0.9624049067497253, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 1.469689130783081, |
| "sampling/sampling_logp_difference/mean": 0.09306588023900986, |
| "step": 41, |
| "step_time": 36.42707723900003 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "entropy": 0.38661571592092514, |
| "epoch": 0.00084, |
| "grad_norm": 1.0359998941421509, |
| "kl": 0.41935206204652786, |
| "learning_rate": 9.999999933333514e-06, |
| "loss": 0.0525, |
| "step": 42, |
| "step_time": 5.882109656998182 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1772.0, |
| "completions/max_terminated_length": 1772.0, |
| "completions/mean_length": 1624.15625, |
| "completions/mean_terminated_length": 1624.15625, |
| "completions/min_length": 879.0, |
| "completions/min_terminated_length": 879.0, |
| "entropy": 0.3577045015990734, |
| "epoch": 0.00086, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2550678253173828, |
| "kl": 0.32372760958969593, |
| "learning_rate": 9.999999909259504e-06, |
| "loss": -0.0496, |
| "num_tokens": 1615506.0, |
| "reward": -5.2150774002075195, |
| "reward_std": 6.197805404663086, |
| "rewards/rollout_reward_func/mean": -5.2150774002075195, |
| "rewards/rollout_reward_func/std": 8.147597312927246, |
| "sampling/importance_sampling_ratio/max": 2.349579095840454, |
| "sampling/importance_sampling_ratio/mean": 0.946481466293335, |
| "sampling/importance_sampling_ratio/min": 0.06307531893253326, |
| "sampling/sampling_logp_difference/max": 1.156632423400879, |
| "sampling/sampling_logp_difference/mean": 0.07861532270908356, |
| "step": 43, |
| "step_time": 37.13630858600118 |
| }, |
| { |
| "clip_ratio/high_max": 0.0078125, |
| "clip_ratio/high_mean": 0.00390625, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005859375, |
| "entropy": 0.3559652045369148, |
| "epoch": 0.00088, |
| "grad_norm": 1.0217231512069702, |
| "kl": 0.3223106600344181, |
| "learning_rate": 9.9999998814818e-06, |
| "loss": -0.0515, |
| "step": 44, |
| "step_time": 6.2407338969997 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1814.0, |
| "completions/max_terminated_length": 1814.0, |
| "completions/mean_length": 1681.125, |
| "completions/mean_terminated_length": 1681.125, |
| "completions/min_length": 1242.0, |
| "completions/min_terminated_length": 1242.0, |
| "entropy": 0.3759612925350666, |
| "epoch": 0.0009, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.528295636177063, |
| "kl": 0.4129584180191159, |
| "learning_rate": 9.999999850000403e-06, |
| "loss": -0.0326, |
| "num_tokens": 1690335.0, |
| "reward": -2.4644203186035156, |
| "reward_std": 10.457454681396484, |
| "rewards/rollout_reward_func/mean": -2.4644203186035156, |
| "rewards/rollout_reward_func/std": 15.407123565673828, |
| "sampling/importance_sampling_ratio/max": 2.56207537651062, |
| "sampling/importance_sampling_ratio/mean": 0.8990581035614014, |
| "sampling/importance_sampling_ratio/min": 0.13844478130340576, |
| "sampling/sampling_logp_difference/max": 1.2251713275909424, |
| "sampling/sampling_logp_difference/mean": 0.07088702917098999, |
| "step": 45, |
| "step_time": 36.063344202000735 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00390625, |
| "entropy": 0.37699316069483757, |
| "epoch": 0.00092, |
| "grad_norm": 1.5766605138778687, |
| "kl": 0.37809659354388714, |
| "learning_rate": 9.999999814815314e-06, |
| "loss": -0.0346, |
| "step": 46, |
| "step_time": 5.878628188998846 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1796.0, |
| "completions/max_terminated_length": 1796.0, |
| "completions/mean_length": 1684.96875, |
| "completions/mean_terminated_length": 1684.96875, |
| "completions/min_length": 1413.0, |
| "completions/min_terminated_length": 1413.0, |
| "entropy": 0.35037482157349586, |
| "epoch": 0.00094, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7219093441963196, |
| "kl": 0.3052864633500576, |
| "learning_rate": 9.99999977592653e-06, |
| "loss": -0.1454, |
| "num_tokens": 1765997.0, |
| "reward": -5.9493207931518555, |
| "reward_std": 6.758513450622559, |
| "rewards/rollout_reward_func/mean": -5.9493207931518555, |
| "rewards/rollout_reward_func/std": 7.776234149932861, |
| "sampling/importance_sampling_ratio/max": 2.8358778953552246, |
| "sampling/importance_sampling_ratio/mean": 0.7998743653297424, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 1.1099776029586792, |
| "sampling/sampling_logp_difference/mean": 0.0953160896897316, |
| "step": 47, |
| "step_time": 37.005451356000776 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.00390625, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005859375, |
| "entropy": 0.34440867975354195, |
| "epoch": 0.00096, |
| "grad_norm": 0.6548582911491394, |
| "kl": 0.3214886896312237, |
| "learning_rate": 9.999999733334051e-06, |
| "loss": -0.1452, |
| "step": 48, |
| "step_time": 5.856181479999577 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1795.0, |
| "completions/max_terminated_length": 1795.0, |
| "completions/mean_length": 1652.40625, |
| "completions/mean_terminated_length": 1652.40625, |
| "completions/min_length": 1492.0, |
| "completions/min_terminated_length": 1492.0, |
| "entropy": 0.3001542203128338, |
| "epoch": 0.00098, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1067109107971191, |
| "kl": 0.20680313045158982, |
| "learning_rate": 9.99999968703788e-06, |
| "loss": -0.1033, |
| "num_tokens": 1839890.0, |
| "reward": -3.3683390617370605, |
| "reward_std": 3.6531944274902344, |
| "rewards/rollout_reward_func/mean": -3.3683390617370605, |
| "rewards/rollout_reward_func/std": 7.470107078552246, |
| "sampling/importance_sampling_ratio/max": 2.1421477794647217, |
| "sampling/importance_sampling_ratio/mean": 1.1610007286071777, |
| "sampling/importance_sampling_ratio/min": 0.263545960187912, |
| "sampling/sampling_logp_difference/max": 0.9924228191375732, |
| "sampling/sampling_logp_difference/mean": 0.05461367964744568, |
| "step": 49, |
| "step_time": 38.469305269999495 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00390625, |
| "entropy": 0.29572881013154984, |
| "epoch": 0.001, |
| "grad_norm": 0.9047728180885315, |
| "kl": 0.22563101211562753, |
| "learning_rate": 9.999999637038016e-06, |
| "loss": -0.1081, |
| "step": 50, |
| "step_time": 5.818200681000235 |
| }, |
| { |
| "clip_ratio/high_max": 0.0078125, |
| "clip_ratio/high_mean": 0.00390625, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00390625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1796.0, |
| "completions/max_terminated_length": 1796.0, |
| "completions/mean_length": 1690.875, |
| "completions/mean_terminated_length": 1690.875, |
| "completions/min_length": 1463.0, |
| "completions/min_terminated_length": 1463.0, |
| "entropy": 0.31936580687761307, |
| "epoch": 0.00102, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7223824262619019, |
| "kl": 0.3224958088248968, |
| "learning_rate": 9.999999583334458e-06, |
| "loss": -0.1377, |
| "num_tokens": 1915271.0, |
| "reward": -3.352957248687744, |
| "reward_std": 4.21926212310791, |
| "rewards/rollout_reward_func/mean": -3.352957248687744, |
| "rewards/rollout_reward_func/std": 6.320347309112549, |
| "sampling/importance_sampling_ratio/max": 2.287932872772217, |
| "sampling/importance_sampling_ratio/mean": 0.8120144605636597, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 1.2639083862304688, |
| "sampling/sampling_logp_difference/mean": 0.07481236755847931, |
| "step": 51, |
| "step_time": 37.81962620299964 |
| }, |
| { |
| "clip_ratio/high_max": 0.0078125, |
| "clip_ratio/high_mean": 0.00390625, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005859375, |
| "entropy": 0.3146309144794941, |
| "epoch": 0.00104, |
| "grad_norm": 0.7475705742835999, |
| "kl": 0.33450872637331486, |
| "learning_rate": 9.999999525927207e-06, |
| "loss": -0.1398, |
| "step": 52, |
| "step_time": 5.822449180000149 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1839.0, |
| "completions/max_terminated_length": 1839.0, |
| "completions/mean_length": 1702.75, |
| "completions/mean_terminated_length": 1702.75, |
| "completions/min_length": 1476.0, |
| "completions/min_terminated_length": 1476.0, |
| "entropy": 0.322977501899004, |
| "epoch": 0.00106, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.198279619216919, |
| "kl": 0.2917388379573822, |
| "learning_rate": 9.999999464816262e-06, |
| "loss": -0.1064, |
| "num_tokens": 1991060.0, |
| "reward": -1.9325592517852783, |
| "reward_std": 4.967609882354736, |
| "rewards/rollout_reward_func/mean": -1.9325592517852783, |
| "rewards/rollout_reward_func/std": 5.8073954582214355, |
| "sampling/importance_sampling_ratio/max": 2.789353847503662, |
| "sampling/importance_sampling_ratio/mean": 1.1503762006759644, |
| "sampling/importance_sampling_ratio/min": 0.1404324173927307, |
| "sampling/sampling_logp_difference/max": 1.199690580368042, |
| "sampling/sampling_logp_difference/mean": 0.07725630700588226, |
| "step": 53, |
| "step_time": 36.42649295499905 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00390625, |
| "entropy": 0.31714847683906555, |
| "epoch": 0.00108, |
| "grad_norm": 1.1143569946289062, |
| "kl": 0.3153774570673704, |
| "learning_rate": 9.999999400001624e-06, |
| "loss": -0.1103, |
| "step": 54, |
| "step_time": 6.59381660300005 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1815.0, |
| "completions/max_terminated_length": 1815.0, |
| "completions/mean_length": 1677.09375, |
| "completions/mean_terminated_length": 1677.09375, |
| "completions/min_length": 1520.0, |
| "completions/min_terminated_length": 1520.0, |
| "entropy": 0.2675260417163372, |
| "epoch": 0.0011, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 0.6082053780555725, |
| "kl": 0.35145391430705786, |
| "learning_rate": 9.999999331483293e-06, |
| "loss": -0.0145, |
| "num_tokens": 2065830.0, |
| "reward": -7.571907997131348, |
| "reward_std": 7.06196928024292, |
| "rewards/rollout_reward_func/mean": -7.571907997131348, |
| "rewards/rollout_reward_func/std": 10.322997093200684, |
| "sampling/importance_sampling_ratio/max": 2.288501024246216, |
| "sampling/importance_sampling_ratio/mean": 0.8737363815307617, |
| "sampling/importance_sampling_ratio/min": 0.11803531646728516, |
| "sampling/sampling_logp_difference/max": 1.27490234375, |
| "sampling/sampling_logp_difference/mean": 0.07949512451887131, |
| "step": 55, |
| "step_time": 37.09443227400061 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.00390625, |
| "clip_ratio/low_min": 0.00390625, |
| "clip_ratio/region_mean": 0.00390625, |
| "entropy": 0.26163551956415176, |
| "epoch": 0.00112, |
| "grad_norm": 0.5967329144477844, |
| "kl": 0.3754094559699297, |
| "learning_rate": 9.999999259261269e-06, |
| "loss": -0.0158, |
| "step": 56, |
| "step_time": 5.860317817000578 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1822.0, |
| "completions/max_terminated_length": 1822.0, |
| "completions/mean_length": 1653.90625, |
| "completions/mean_terminated_length": 1653.90625, |
| "completions/min_length": 1317.0, |
| "completions/min_terminated_length": 1317.0, |
| "entropy": 0.32117627188563347, |
| "epoch": 0.00114, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9490989446640015, |
| "kl": 0.3084982577711344, |
| "learning_rate": 9.999999183335551e-06, |
| "loss": -0.1246, |
| "num_tokens": 2139937.0, |
| "reward": -3.708850860595703, |
| "reward_std": 6.1540679931640625, |
| "rewards/rollout_reward_func/mean": -3.708850860595703, |
| "rewards/rollout_reward_func/std": 7.61086893081665, |
| "sampling/importance_sampling_ratio/max": 2.2644283771514893, |
| "sampling/importance_sampling_ratio/mean": 0.9409332275390625, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 1.2469470500946045, |
| "sampling/sampling_logp_difference/mean": 0.08474647253751755, |
| "step": 57, |
| "step_time": 36.73273920400061 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.005859375, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005859375, |
| "entropy": 0.31612110882997513, |
| "epoch": 0.00116, |
| "grad_norm": 0.9802373647689819, |
| "kl": 0.3413949944078922, |
| "learning_rate": 9.999999103706142e-06, |
| "loss": -0.1259, |
| "step": 58, |
| "step_time": 5.886546145001375 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1837.0, |
| "completions/max_terminated_length": 1837.0, |
| "completions/mean_length": 1626.5625, |
| "completions/mean_terminated_length": 1626.5625, |
| "completions/min_length": 267.0, |
| "completions/min_terminated_length": 267.0, |
| "entropy": 0.30012011528015137, |
| "epoch": 0.00118, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.0481098890304565, |
| "kl": 0.5502468682825565, |
| "learning_rate": 9.999999020373038e-06, |
| "loss": -0.1811, |
| "num_tokens": 2213209.0, |
| "reward": -5.389894485473633, |
| "reward_std": 4.981201648712158, |
| "rewards/rollout_reward_func/mean": -5.389894485473633, |
| "rewards/rollout_reward_func/std": 6.769619941711426, |
| "sampling/importance_sampling_ratio/max": 2.7308578491210938, |
| "sampling/importance_sampling_ratio/mean": 0.9160431027412415, |
| "sampling/importance_sampling_ratio/min": 0.08488596975803375, |
| "sampling/sampling_logp_difference/max": 1.468017578125, |
| "sampling/sampling_logp_difference/mean": 0.09639683365821838, |
| "step": 59, |
| "step_time": 37.19838971900026 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.00390625, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005859375, |
| "entropy": 0.2991691865026951, |
| "epoch": 0.0012, |
| "grad_norm": 0.8857253193855286, |
| "kl": 0.5988470073789358, |
| "learning_rate": 9.999998933336242e-06, |
| "loss": -0.1844, |
| "step": 60, |
| "step_time": 6.367170782000358 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1822.0, |
| "completions/max_terminated_length": 1822.0, |
| "completions/mean_length": 1680.0625, |
| "completions/mean_terminated_length": 1680.0625, |
| "completions/min_length": 1396.0, |
| "completions/min_terminated_length": 1396.0, |
| "entropy": 0.27592010982334614, |
| "epoch": 0.00122, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 1.353814721107483, |
| "kl": 0.37851969711482525, |
| "learning_rate": 9.999998842595754e-06, |
| "loss": -0.0009, |
| "num_tokens": 2288126.0, |
| "reward": -4.979825496673584, |
| "reward_std": 6.0550150871276855, |
| "rewards/rollout_reward_func/mean": -4.979825496673584, |
| "rewards/rollout_reward_func/std": 9.026530265808105, |
| "sampling/importance_sampling_ratio/max": 1.9612035751342773, |
| "sampling/importance_sampling_ratio/mean": 0.9742088317871094, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 2.270935535430908, |
| "sampling/sampling_logp_difference/mean": 0.07654492557048798, |
| "step": 61, |
| "step_time": 37.15296104299978 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.272587139159441, |
| "epoch": 0.00124, |
| "grad_norm": 1.322236180305481, |
| "kl": 0.403486505150795, |
| "learning_rate": 9.999998748151573e-06, |
| "loss": -0.0003, |
| "step": 62, |
| "step_time": 5.884493231998022 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1779.0, |
| "completions/max_terminated_length": 1779.0, |
| "completions/mean_length": 1606.4375, |
| "completions/mean_terminated_length": 1606.4375, |
| "completions/min_length": 1067.0, |
| "completions/min_terminated_length": 1067.0, |
| "entropy": 0.24129757285118103, |
| "epoch": 0.00126, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 0.8646777868270874, |
| "kl": 0.3206884413957596, |
| "learning_rate": 9.999998650003697e-06, |
| "loss": -0.0263, |
| "num_tokens": 2361057.0, |
| "reward": -4.465510845184326, |
| "reward_std": 6.844207286834717, |
| "rewards/rollout_reward_func/mean": -4.465510845184326, |
| "rewards/rollout_reward_func/std": 8.709650039672852, |
| "sampling/importance_sampling_ratio/max": 2.8052289485931396, |
| "sampling/importance_sampling_ratio/mean": 0.9944058656692505, |
| "sampling/importance_sampling_ratio/min": 0.04083564504981041, |
| "sampling/sampling_logp_difference/max": 1.8856086730957031, |
| "sampling/sampling_logp_difference/mean": 0.07281368225812912, |
| "step": 63, |
| "step_time": 36.78395269500015 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "entropy": 0.23481187783181667, |
| "epoch": 0.00128, |
| "grad_norm": 0.843463659286499, |
| "kl": 0.33893433026969433, |
| "learning_rate": 9.999998548152132e-06, |
| "loss": -0.027, |
| "step": 64, |
| "step_time": 5.761317184999825 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1810.0, |
| "completions/max_terminated_length": 1810.0, |
| "completions/mean_length": 1699.90625, |
| "completions/mean_terminated_length": 1699.90625, |
| "completions/min_length": 1413.0, |
| "completions/min_terminated_length": 1413.0, |
| "entropy": 0.2767509985715151, |
| "epoch": 0.0013, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.0217297077178955, |
| "kl": 0.45753872115164995, |
| "learning_rate": 9.999998442596872e-06, |
| "loss": -0.0874, |
| "num_tokens": 2437012.0, |
| "reward": -3.256406307220459, |
| "reward_std": 6.123032093048096, |
| "rewards/rollout_reward_func/mean": -3.256406307220459, |
| "rewards/rollout_reward_func/std": 6.840267181396484, |
| "sampling/importance_sampling_ratio/max": 2.3409619331359863, |
| "sampling/importance_sampling_ratio/mean": 0.8038904070854187, |
| "sampling/importance_sampling_ratio/min": 0.022516217082738876, |
| "sampling/sampling_logp_difference/max": 1.6859521865844727, |
| "sampling/sampling_logp_difference/mean": 0.08254212141036987, |
| "step": 65, |
| "step_time": 36.94500934499865 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "entropy": 0.2692566681653261, |
| "epoch": 0.00132, |
| "grad_norm": 0.8563345670700073, |
| "kl": 0.4730011150240898, |
| "learning_rate": 9.999998333337923e-06, |
| "loss": -0.0897, |
| "step": 66, |
| "step_time": 6.347890593002376 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0026041667442768812, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0026041667442768812, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1795.0, |
| "completions/max_terminated_length": 1795.0, |
| "completions/mean_length": 1669.125, |
| "completions/mean_terminated_length": 1669.125, |
| "completions/min_length": 641.0, |
| "completions/min_terminated_length": 641.0, |
| "entropy": 0.239725174382329, |
| "epoch": 0.00134, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8270325660705566, |
| "kl": 0.3960014134645462, |
| "learning_rate": 9.99999822037528e-06, |
| "loss": -0.107, |
| "num_tokens": 2511757.0, |
| "reward": -6.540927886962891, |
| "reward_std": 7.1129469871521, |
| "rewards/rollout_reward_func/mean": -6.540927886962891, |
| "rewards/rollout_reward_func/std": 10.2684907913208, |
| "sampling/importance_sampling_ratio/max": 2.9500951766967773, |
| "sampling/importance_sampling_ratio/mean": 1.145231008529663, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 1.810743808746338, |
| "sampling/sampling_logp_difference/mean": 0.07273144274950027, |
| "step": 67, |
| "step_time": 36.61473885000032 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.004557291744276881, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006510416744276881, |
| "entropy": 0.23407302796840668, |
| "epoch": 0.00136, |
| "grad_norm": 0.7608462572097778, |
| "kl": 0.389744964428246, |
| "learning_rate": 9.999998103708944e-06, |
| "loss": -0.1089, |
| "step": 68, |
| "step_time": 5.844826974000171 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1819.0, |
| "completions/max_terminated_length": 1819.0, |
| "completions/mean_length": 1668.25, |
| "completions/mean_terminated_length": 1668.25, |
| "completions/min_length": 1502.0, |
| "completions/min_terminated_length": 1502.0, |
| "entropy": 0.2598415594547987, |
| "epoch": 0.00138, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7419317960739136, |
| "kl": 0.28893633373081684, |
| "learning_rate": 9.999997983338918e-06, |
| "loss": -0.0011, |
| "num_tokens": 2586505.0, |
| "reward": -7.137851238250732, |
| "reward_std": 7.074878692626953, |
| "rewards/rollout_reward_func/mean": -7.137851238250732, |
| "rewards/rollout_reward_func/std": 10.856270790100098, |
| "sampling/importance_sampling_ratio/max": 2.6760199069976807, |
| "sampling/importance_sampling_ratio/mean": 0.8516594171524048, |
| "sampling/importance_sampling_ratio/min": 0.13397420942783356, |
| "sampling/sampling_logp_difference/max": 1.6180033683776855, |
| "sampling/sampling_logp_difference/mean": 0.07290571928024292, |
| "step": 69, |
| "step_time": 37.49325247999877 |
| }, |
| { |
| "clip_ratio/high_max": 0.0078125, |
| "clip_ratio/high_mean": 0.00390625, |
| "clip_ratio/low_mean": 0.005859375, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.009765625, |
| "entropy": 0.25334353744983673, |
| "epoch": 0.0014, |
| "grad_norm": 0.7260986566543579, |
| "kl": 0.3038715925067663, |
| "learning_rate": 9.999997859265198e-06, |
| "loss": -0.0033, |
| "step": 70, |
| "step_time": 6.613173748000918 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.00390625, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00390625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1825.0, |
| "completions/max_terminated_length": 1825.0, |
| "completions/mean_length": 1665.625, |
| "completions/mean_terminated_length": 1665.625, |
| "completions/min_length": 1103.0, |
| "completions/min_terminated_length": 1103.0, |
| "entropy": 0.1995892282575369, |
| "epoch": 0.00142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8726052641868591, |
| "kl": 0.3035321347415447, |
| "learning_rate": 9.999997731487788e-06, |
| "loss": -0.1951, |
| "num_tokens": 2661197.0, |
| "reward": 0.01697838306427002, |
| "reward_std": 5.1158766746521, |
| "rewards/rollout_reward_func/mean": 0.01697838306427002, |
| "rewards/rollout_reward_func/std": 10.660343170166016, |
| "sampling/importance_sampling_ratio/max": 2.622579574584961, |
| "sampling/importance_sampling_ratio/mean": 0.7724930047988892, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 1.3327304124832153, |
| "sampling/sampling_logp_difference/mean": 0.068142369389534, |
| "step": 71, |
| "step_time": 35.71897001199886 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.00390625, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00390625, |
| "entropy": 0.19654854200780392, |
| "epoch": 0.00144, |
| "grad_norm": 0.9153209328651428, |
| "kl": 0.30595986917614937, |
| "learning_rate": 9.999997600006685e-06, |
| "loss": -0.1967, |
| "step": 72, |
| "step_time": 5.866601767999782 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1808.0, |
| "completions/max_terminated_length": 1808.0, |
| "completions/mean_length": 1696.9375, |
| "completions/mean_terminated_length": 1696.9375, |
| "completions/min_length": 1599.0, |
| "completions/min_terminated_length": 1599.0, |
| "entropy": 0.21030581928789616, |
| "epoch": 0.00146, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7310628890991211, |
| "kl": 1.0501810098066926, |
| "learning_rate": 9.999997464821892e-06, |
| "loss": -0.2747, |
| "num_tokens": 2736835.0, |
| "reward": -7.002237319946289, |
| "reward_std": 6.102571487426758, |
| "rewards/rollout_reward_func/mean": -7.002237319946289, |
| "rewards/rollout_reward_func/std": 10.413652420043945, |
| "sampling/importance_sampling_ratio/max": 2.1960866451263428, |
| "sampling/importance_sampling_ratio/mean": 0.8038663268089294, |
| "sampling/importance_sampling_ratio/min": 0.046504825353622437, |
| "sampling/sampling_logp_difference/max": 2.5245094299316406, |
| "sampling/sampling_logp_difference/mean": 0.09849925339221954, |
| "step": 73, |
| "step_time": 38.96205426400138 |
| }, |
| { |
| "clip_ratio/high_max": 0.0078125, |
| "clip_ratio/high_mean": 0.00390625, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005859375, |
| "entropy": 0.20804075710475445, |
| "epoch": 0.00148, |
| "grad_norm": 0.6374855041503906, |
| "kl": 0.9734249282628298, |
| "learning_rate": 9.999997325933409e-06, |
| "loss": -0.2766, |
| "step": 74, |
| "step_time": 5.84851037899989 |
| }, |
| { |
| "clip_ratio/high_max": 0.0032051282469183207, |
| "clip_ratio/high_mean": 0.0016025641234591603, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0035556891234591603, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1783.0, |
| "completions/max_terminated_length": 1783.0, |
| "completions/mean_length": 1692.96875, |
| "completions/mean_terminated_length": 1692.96875, |
| "completions/min_length": 1394.0, |
| "completions/min_terminated_length": 1394.0, |
| "entropy": 0.19729920756071806, |
| "epoch": 0.0015, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 0.8172791600227356, |
| "kl": 0.4751043822616339, |
| "learning_rate": 9.999997183341233e-06, |
| "loss": -0.142, |
| "num_tokens": 2812279.0, |
| "reward": -2.219168186187744, |
| "reward_std": 9.400641441345215, |
| "rewards/rollout_reward_func/mean": -2.219168186187744, |
| "rewards/rollout_reward_func/std": 17.64300537109375, |
| "sampling/importance_sampling_ratio/max": 2.173631429672241, |
| "sampling/importance_sampling_ratio/mean": 0.8823361396789551, |
| "sampling/importance_sampling_ratio/min": 0.03189156949520111, |
| "sampling/sampling_logp_difference/max": 1.8222627639770508, |
| "sampling/sampling_logp_difference/mean": 0.0679212361574173, |
| "step": 75, |
| "step_time": 35.097398169999906 |
| }, |
| { |
| "clip_ratio/high_max": 0.007111378246918321, |
| "clip_ratio/high_mean": 0.0035556891234591603, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00550881412345916, |
| "entropy": 0.19708813540637493, |
| "epoch": 0.00152, |
| "grad_norm": 0.7900307774543762, |
| "kl": 0.43427742179483175, |
| "learning_rate": 9.999997037045365e-06, |
| "loss": -0.1431, |
| "step": 76, |
| "step_time": 6.543489481999131 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1808.0, |
| "completions/max_terminated_length": 1808.0, |
| "completions/mean_length": 1717.5625, |
| "completions/mean_terminated_length": 1717.5625, |
| "completions/min_length": 1391.0, |
| "completions/min_terminated_length": 1391.0, |
| "entropy": 0.15479024220257998, |
| "epoch": 0.00154, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 0.6995255947113037, |
| "kl": 0.30347975343465805, |
| "learning_rate": 9.999996887045808e-06, |
| "loss": 0.0408, |
| "num_tokens": 2888657.0, |
| "reward": -4.00355339050293, |
| "reward_std": 4.775286674499512, |
| "rewards/rollout_reward_func/mean": -4.00355339050293, |
| "rewards/rollout_reward_func/std": 6.246252536773682, |
| "sampling/importance_sampling_ratio/max": 1.650878667831421, |
| "sampling/importance_sampling_ratio/mean": 1.0060396194458008, |
| "sampling/importance_sampling_ratio/min": 0.047248467803001404, |
| "sampling/sampling_logp_difference/max": 1.4376678466796875, |
| "sampling/sampling_logp_difference/mean": 0.05947484076023102, |
| "step": 77, |
| "step_time": 38.8434166800007 |
| }, |
| { |
| "clip_ratio/high_max": 0.01171875, |
| "clip_ratio/high_mean": 0.005859375, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005859375, |
| "entropy": 0.15612738858908415, |
| "epoch": 0.00156, |
| "grad_norm": 0.6284993886947632, |
| "kl": 0.2821835596114397, |
| "learning_rate": 9.99999673334256e-06, |
| "loss": 0.0397, |
| "step": 78, |
| "step_time": 5.918868127000678 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.00390625, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005859375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1762.0, |
| "completions/max_terminated_length": 1762.0, |
| "completions/mean_length": 1664.59375, |
| "completions/mean_terminated_length": 1664.59375, |
| "completions/min_length": 1434.0, |
| "completions/min_terminated_length": 1434.0, |
| "entropy": 0.17359685897827148, |
| "epoch": 0.00158, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.977432370185852, |
| "kl": 0.6586176492273808, |
| "learning_rate": 9.99999657593562e-06, |
| "loss": -0.2064, |
| "num_tokens": 2963303.0, |
| "reward": -4.805361270904541, |
| "reward_std": 6.155376434326172, |
| "rewards/rollout_reward_func/mean": -4.805361270904541, |
| "rewards/rollout_reward_func/std": 7.987611293792725, |
| "sampling/importance_sampling_ratio/max": 2.683112144470215, |
| "sampling/importance_sampling_ratio/mean": 0.8393880128860474, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 2.071775197982788, |
| "sampling/sampling_logp_difference/mean": 0.07382857799530029, |
| "step": 79, |
| "step_time": 37.2499062830002 |
| }, |
| { |
| "clip_ratio/high_max": 0.01171875, |
| "clip_ratio/high_mean": 0.005859375, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005859375, |
| "entropy": 0.1753261275589466, |
| "epoch": 0.0016, |
| "grad_norm": 0.7493237257003784, |
| "kl": 0.522200190462172, |
| "learning_rate": 9.99999641482499e-06, |
| "loss": -0.2097, |
| "step": 80, |
| "step_time": 5.753503210998133 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00390625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1832.0, |
| "completions/max_terminated_length": 1832.0, |
| "completions/mean_length": 1708.34375, |
| "completions/mean_terminated_length": 1708.34375, |
| "completions/min_length": 1412.0, |
| "completions/min_terminated_length": 1412.0, |
| "entropy": 0.2533543687313795, |
| "epoch": 0.00162, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9206402897834778, |
| "kl": 0.49991538375616074, |
| "learning_rate": 9.999996250010671e-06, |
| "loss": -0.1446, |
| "num_tokens": 3039686.0, |
| "reward": -3.956460952758789, |
| "reward_std": 7.495929718017578, |
| "rewards/rollout_reward_func/mean": -3.956460952758789, |
| "rewards/rollout_reward_func/std": 8.32241153717041, |
| "sampling/importance_sampling_ratio/max": 2.4827659130096436, |
| "sampling/importance_sampling_ratio/mean": 0.9107170701026917, |
| "sampling/importance_sampling_ratio/min": 0.11611815541982651, |
| "sampling/sampling_logp_difference/max": 1.4147658348083496, |
| "sampling/sampling_logp_difference/mean": 0.07592972368001938, |
| "step": 81, |
| "step_time": 37.18665667199912 |
| }, |
| { |
| "clip_ratio/high_max": 0.0078125, |
| "clip_ratio/high_mean": 0.00390625, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005859375, |
| "entropy": 0.2543573584407568, |
| "epoch": 0.00164, |
| "grad_norm": 0.854416012763977, |
| "kl": 0.47987215034663677, |
| "learning_rate": 9.999996081492662e-06, |
| "loss": -0.1459, |
| "step": 82, |
| "step_time": 5.884696772999632 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.00390625, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00390625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1824.0, |
| "completions/max_terminated_length": 1824.0, |
| "completions/mean_length": 1733.9375, |
| "completions/mean_terminated_length": 1733.9375, |
| "completions/min_length": 1618.0, |
| "completions/min_terminated_length": 1618.0, |
| "entropy": 0.15933354571461678, |
| "epoch": 0.00166, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8887842893600464, |
| "kl": 0.3857283741235733, |
| "learning_rate": 9.999995909270962e-06, |
| "loss": -0.166, |
| "num_tokens": 3116205.0, |
| "reward": -4.7430243492126465, |
| "reward_std": 5.144591808319092, |
| "rewards/rollout_reward_func/mean": -4.7430243492126465, |
| "rewards/rollout_reward_func/std": 8.597419738769531, |
| "sampling/importance_sampling_ratio/max": 2.8380703926086426, |
| "sampling/importance_sampling_ratio/mean": 1.0134353637695312, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 1.3160133361816406, |
| "sampling/sampling_logp_difference/mean": 0.05554642528295517, |
| "step": 83, |
| "step_time": 37.96955776300001 |
| }, |
| { |
| "clip_ratio/high_max": 0.0078125, |
| "clip_ratio/high_mean": 0.00390625, |
| "clip_ratio/low_mean": 0.00390625, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0078125, |
| "entropy": 0.15759241580963135, |
| "epoch": 0.00168, |
| "grad_norm": 0.837668240070343, |
| "kl": 0.40188954304903746, |
| "learning_rate": 9.999995733345573e-06, |
| "loss": -0.1676, |
| "step": 84, |
| "step_time": 5.867369008998139 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.00390625, |
| "clip_ratio/low_min": 0.00390625, |
| "clip_ratio/region_mean": 0.00390625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1805.0, |
| "completions/max_terminated_length": 1805.0, |
| "completions/mean_length": 1607.8125, |
| "completions/mean_terminated_length": 1607.8125, |
| "completions/min_length": 745.0, |
| "completions/min_terminated_length": 745.0, |
| "entropy": 0.2709789536893368, |
| "epoch": 0.0017, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9693952798843384, |
| "kl": 0.7671259762719274, |
| "learning_rate": 9.999995553716494e-06, |
| "loss": -0.0133, |
| "num_tokens": 3189437.0, |
| "reward": -6.506250381469727, |
| "reward_std": 6.401736736297607, |
| "rewards/rollout_reward_func/mean": -6.506250381469727, |
| "rewards/rollout_reward_func/std": 11.671521186828613, |
| "sampling/importance_sampling_ratio/max": 2.657285690307617, |
| "sampling/importance_sampling_ratio/mean": 0.9502277374267578, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 2.1241543292999268, |
| "sampling/sampling_logp_difference/mean": 0.06842806935310364, |
| "step": 85, |
| "step_time": 35.07483531900107 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0047940341755747795, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0047940341755747795, |
| "entropy": 0.2695111036300659, |
| "epoch": 0.00172, |
| "grad_norm": 0.9897550344467163, |
| "kl": 0.8880385467782617, |
| "learning_rate": 9.999995370383725e-06, |
| "loss": -0.0147, |
| "step": 86, |
| "step_time": 5.823899960000745 |
| }, |
| { |
| "clip_ratio/high_max": 0.0078125, |
| "clip_ratio/high_mean": 0.00390625, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00390625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1834.0, |
| "completions/max_terminated_length": 1834.0, |
| "completions/mean_length": 1718.625, |
| "completions/mean_terminated_length": 1718.625, |
| "completions/min_length": 1620.0, |
| "completions/min_terminated_length": 1620.0, |
| "entropy": 0.19925166107714176, |
| "epoch": 0.00174, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8225500583648682, |
| "kl": 0.40306959114968777, |
| "learning_rate": 9.999995183347268e-06, |
| "loss": -0.1216, |
| "num_tokens": 3265817.0, |
| "reward": -4.694127082824707, |
| "reward_std": 7.164846897125244, |
| "rewards/rollout_reward_func/mean": -4.694127082824707, |
| "rewards/rollout_reward_func/std": 10.58739948272705, |
| "sampling/importance_sampling_ratio/max": 2.249318838119507, |
| "sampling/importance_sampling_ratio/mean": 0.8394811153411865, |
| "sampling/importance_sampling_ratio/min": 0.12018804997205734, |
| "sampling/sampling_logp_difference/max": 1.5835975408554077, |
| "sampling/sampling_logp_difference/mean": 0.07420962303876877, |
| "step": 87, |
| "step_time": 38.998291893000896 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.005859375, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0078125, |
| "entropy": 0.1985994167625904, |
| "epoch": 0.00176, |
| "grad_norm": 0.7866024374961853, |
| "kl": 0.4124698657542467, |
| "learning_rate": 9.999994992607122e-06, |
| "loss": -0.1228, |
| "step": 88, |
| "step_time": 6.345813049000753 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.005430640187114477, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005430640187114477, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1791.0, |
| "completions/max_terminated_length": 1791.0, |
| "completions/mean_length": 1672.28125, |
| "completions/mean_terminated_length": 1672.28125, |
| "completions/min_length": 1437.0, |
| "completions/min_terminated_length": 1437.0, |
| "entropy": 0.15057391114532948, |
| "epoch": 0.00178, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8857223987579346, |
| "kl": 0.7324486412107944, |
| "learning_rate": 9.999994798163286e-06, |
| "loss": -0.1942, |
| "num_tokens": 3340291.0, |
| "reward": -4.911107540130615, |
| "reward_std": 5.658788204193115, |
| "rewards/rollout_reward_func/mean": -4.911107540130615, |
| "rewards/rollout_reward_func/std": 9.124991416931152, |
| "sampling/importance_sampling_ratio/max": 1.8904942274093628, |
| "sampling/importance_sampling_ratio/mean": 0.8380607962608337, |
| "sampling/importance_sampling_ratio/min": 0.02950156107544899, |
| "sampling/sampling_logp_difference/max": 1.5451288223266602, |
| "sampling/sampling_logp_difference/mean": 0.06332387030124664, |
| "step": 89, |
| "step_time": 37.11040565100211 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "entropy": 0.15209791343659163, |
| "epoch": 0.0018, |
| "grad_norm": 0.7109376788139343, |
| "kl": 0.6368166394531727, |
| "learning_rate": 9.999994600015764e-06, |
| "loss": -0.1955, |
| "step": 90, |
| "step_time": 5.825622919000125 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00390625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1821.0, |
| "completions/max_terminated_length": 1821.0, |
| "completions/mean_length": 1721.84375, |
| "completions/mean_terminated_length": 1721.84375, |
| "completions/min_length": 1530.0, |
| "completions/min_terminated_length": 1530.0, |
| "entropy": 0.16620426252484322, |
| "epoch": 0.00182, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.4581727981567383, |
| "kl": 1.7564852200448513, |
| "learning_rate": 9.99999439816455e-06, |
| "loss": -0.0169, |
| "num_tokens": 3416936.0, |
| "reward": -4.905522346496582, |
| "reward_std": 7.8199005126953125, |
| "rewards/rollout_reward_func/mean": -4.905522346496582, |
| "rewards/rollout_reward_func/std": 8.327784538269043, |
| "sampling/importance_sampling_ratio/max": 1.7173593044281006, |
| "sampling/importance_sampling_ratio/mean": 0.9146490693092346, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 2.112088203430176, |
| "sampling/sampling_logp_difference/mean": 0.06157752498984337, |
| "step": 91, |
| "step_time": 37.20597630500015 |
| }, |
| { |
| "clip_ratio/high_max": 0.0078125, |
| "clip_ratio/high_mean": 0.00390625, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00390625, |
| "entropy": 0.17287507839500904, |
| "epoch": 0.00184, |
| "grad_norm": 1.492632269859314, |
| "kl": 1.1148988083004951, |
| "learning_rate": 9.999994192609649e-06, |
| "loss": -0.023, |
| "step": 92, |
| "step_time": 5.926312706999852 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1774.0, |
| "completions/max_terminated_length": 1774.0, |
| "completions/mean_length": 1653.09375, |
| "completions/mean_terminated_length": 1653.09375, |
| "completions/min_length": 1566.0, |
| "completions/min_terminated_length": 1566.0, |
| "entropy": 0.21760124899446964, |
| "epoch": 0.00186, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9200534820556641, |
| "kl": 0.26406371779739857, |
| "learning_rate": 9.99999398335106e-06, |
| "loss": -0.0031, |
| "num_tokens": 3491478.0, |
| "reward": -5.420025825500488, |
| "reward_std": 5.013497829437256, |
| "rewards/rollout_reward_func/mean": -5.420025825500488, |
| "rewards/rollout_reward_func/std": 6.584102630615234, |
| "sampling/importance_sampling_ratio/max": 2.963585376739502, |
| "sampling/importance_sampling_ratio/mean": 0.8498660326004028, |
| "sampling/importance_sampling_ratio/min": 0.2295318990945816, |
| "sampling/sampling_logp_difference/max": 0.9062635898590088, |
| "sampling/sampling_logp_difference/mean": 0.06188333407044411, |
| "step": 93, |
| "step_time": 38.3763018009995 |
| }, |
| { |
| "clip_ratio/high_max": 0.01953125, |
| "clip_ratio/high_mean": 0.01171875, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.013671875, |
| "entropy": 0.22959251329302788, |
| "epoch": 0.00188, |
| "grad_norm": 0.6035089492797852, |
| "kl": 0.23167249467223883, |
| "learning_rate": 9.999993770388785e-06, |
| "loss": -0.0067, |
| "step": 94, |
| "step_time": 5.798366992999036 |
| }, |
| { |
| "clip_ratio/high_max": 0.007694128900766373, |
| "clip_ratio/high_mean": 0.0038470644503831863, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0038470644503831863, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1753.0, |
| "completions/max_terminated_length": 1753.0, |
| "completions/mean_length": 1658.03125, |
| "completions/mean_terminated_length": 1658.03125, |
| "completions/min_length": 1455.0, |
| "completions/min_terminated_length": 1455.0, |
| "entropy": 0.20340878516435623, |
| "epoch": 0.0019, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7779289484024048, |
| "kl": 0.3094237130135298, |
| "learning_rate": 9.99999355372282e-06, |
| "loss": -0.1794, |
| "num_tokens": 3565673.0, |
| "reward": -5.6944780349731445, |
| "reward_std": 3.949958562850952, |
| "rewards/rollout_reward_func/mean": -5.6944780349731445, |
| "rewards/rollout_reward_func/std": 5.5989580154418945, |
| "sampling/importance_sampling_ratio/max": 1.688936471939087, |
| "sampling/importance_sampling_ratio/mean": 0.9206300973892212, |
| "sampling/importance_sampling_ratio/min": 0.1527530699968338, |
| "sampling/sampling_logp_difference/max": 0.9441490173339844, |
| "sampling/sampling_logp_difference/mean": 0.050117556005716324, |
| "step": 95, |
| "step_time": 37.41757664999841 |
| }, |
| { |
| "clip_ratio/high_max": 0.01171875, |
| "clip_ratio/high_mean": 0.005859375, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005859375, |
| "entropy": 0.2086612544953823, |
| "epoch": 0.00192, |
| "grad_norm": 0.7833954691886902, |
| "kl": 0.29302166029810905, |
| "learning_rate": 9.999993333353169e-06, |
| "loss": -0.1799, |
| "step": 96, |
| "step_time": 5.725412834000053 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0028409091755747795, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0028409091755747795, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1804.0, |
| "completions/max_terminated_length": 1804.0, |
| "completions/mean_length": 1496.625, |
| "completions/mean_terminated_length": 1496.625, |
| "completions/min_length": 185.0, |
| "completions/min_terminated_length": 185.0, |
| "entropy": 0.2479400299489498, |
| "epoch": 0.00194, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1320165395736694, |
| "kl": 0.3989646164700389, |
| "learning_rate": 9.999993109279829e-06, |
| "loss": -0.0546, |
| "num_tokens": 3635185.0, |
| "reward": -5.023990154266357, |
| "reward_std": 3.9877772331237793, |
| "rewards/rollout_reward_func/mean": -5.023990154266357, |
| "rewards/rollout_reward_func/std": 8.748732566833496, |
| "sampling/importance_sampling_ratio/max": 2.214284896850586, |
| "sampling/importance_sampling_ratio/mean": 0.9846020936965942, |
| "sampling/importance_sampling_ratio/min": 0.30134767293930054, |
| "sampling/sampling_logp_difference/max": 0.9754681587219238, |
| "sampling/sampling_logp_difference/mean": 0.061219509690999985, |
| "step": 97, |
| "step_time": 36.602559436000774 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.008984375046566129, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.010937500046566129, |
| "entropy": 0.25142903439700603, |
| "epoch": 0.00196, |
| "grad_norm": 1.64491868019104, |
| "kl": 0.39512724056839943, |
| "learning_rate": 9.999992881502803e-06, |
| "loss": -0.0574, |
| "step": 98, |
| "step_time": 6.578721888999098 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1808.0, |
| "completions/max_terminated_length": 1808.0, |
| "completions/mean_length": 1710.1875, |
| "completions/mean_terminated_length": 1710.1875, |
| "completions/min_length": 1435.0, |
| "completions/min_terminated_length": 1435.0, |
| "entropy": 0.23437649384140968, |
| "epoch": 0.00198, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 0.9414642453193665, |
| "kl": 0.3236832795664668, |
| "learning_rate": 9.999992650022092e-06, |
| "loss": -0.1719, |
| "num_tokens": 3711125.0, |
| "reward": -2.0210158824920654, |
| "reward_std": 4.779875755310059, |
| "rewards/rollout_reward_func/mean": -2.0210158824920654, |
| "rewards/rollout_reward_func/std": 6.4040093421936035, |
| "sampling/importance_sampling_ratio/max": 2.3287320137023926, |
| "sampling/importance_sampling_ratio/mean": 1.0207520723342896, |
| "sampling/importance_sampling_ratio/min": 0.35892435908317566, |
| "sampling/sampling_logp_difference/max": 1.0962285995483398, |
| "sampling/sampling_logp_difference/mean": 0.044818222522735596, |
| "step": 99, |
| "step_time": 36.753581342999496 |
| }, |
| { |
| "clip_ratio/high_max": 0.01171875, |
| "clip_ratio/high_mean": 0.005859375, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005859375, |
| "entropy": 0.2348782755434513, |
| "epoch": 0.002, |
| "grad_norm": 0.9503746628761292, |
| "kl": 0.3419312732294202, |
| "learning_rate": 9.999992414837692e-06, |
| "loss": -0.1746, |
| "step": 100, |
| "step_time": 5.861453168999105 |
| }, |
| { |
| "clip_ratio/high_max": 0.0078125, |
| "clip_ratio/high_mean": 0.00390625, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00390625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1808.0, |
| "completions/max_terminated_length": 1808.0, |
| "completions/mean_length": 1658.5, |
| "completions/mean_terminated_length": 1658.5, |
| "completions/min_length": 665.0, |
| "completions/min_terminated_length": 665.0, |
| "entropy": 0.2949713133275509, |
| "epoch": 0.00202, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8095335960388184, |
| "kl": 0.4149684626609087, |
| "learning_rate": 9.999992175949606e-06, |
| "loss": -0.1327, |
| "num_tokens": 3785245.0, |
| "reward": -1.9772329330444336, |
| "reward_std": 6.376922607421875, |
| "rewards/rollout_reward_func/mean": -1.9772329330444336, |
| "rewards/rollout_reward_func/std": 9.654932022094727, |
| "sampling/importance_sampling_ratio/max": 1.6300160884857178, |
| "sampling/importance_sampling_ratio/mean": 0.9628180265426636, |
| "sampling/importance_sampling_ratio/min": 0.14600704610347748, |
| "sampling/sampling_logp_difference/max": 1.1694939136505127, |
| "sampling/sampling_logp_difference/mean": 0.055236026644706726, |
| "step": 101, |
| "step_time": 36.26178865900147 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.2924080714583397, |
| "epoch": 0.00204, |
| "grad_norm": 0.8164064884185791, |
| "kl": 0.48169367760419846, |
| "learning_rate": 9.999991933357835e-06, |
| "loss": -0.1325, |
| "step": 102, |
| "step_time": 5.8681000480000876 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1814.0, |
| "completions/max_terminated_length": 1814.0, |
| "completions/mean_length": 1653.375, |
| "completions/mean_terminated_length": 1653.375, |
| "completions/min_length": 663.0, |
| "completions/min_terminated_length": 663.0, |
| "entropy": 0.2441821303218603, |
| "epoch": 0.00206, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1374014616012573, |
| "kl": 0.19944205041974783, |
| "learning_rate": 9.999991687062379e-06, |
| "loss": -0.1332, |
| "num_tokens": 3859549.0, |
| "reward": -3.1205766201019287, |
| "reward_std": 6.89734411239624, |
| "rewards/rollout_reward_func/mean": -3.1205766201019287, |
| "rewards/rollout_reward_func/std": 8.786608695983887, |
| "sampling/importance_sampling_ratio/max": 2.907442092895508, |
| "sampling/importance_sampling_ratio/mean": 0.9543389678001404, |
| "sampling/importance_sampling_ratio/min": 0.26082849502563477, |
| "sampling/sampling_logp_difference/max": 1.416438102722168, |
| "sampling/sampling_logp_difference/mean": 0.04272625967860222, |
| "step": 103, |
| "step_time": 35.06253007600026 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.006510416744276881, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.008463541744276881, |
| "entropy": 0.24291709996759892, |
| "epoch": 0.00208, |
| "grad_norm": 1.0697972774505615, |
| "kl": 0.21361435670405626, |
| "learning_rate": 9.999991437063234e-06, |
| "loss": -0.1369, |
| "step": 104, |
| "step_time": 6.35797552799977 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1813.0, |
| "completions/max_terminated_length": 1813.0, |
| "completions/mean_length": 1677.625, |
| "completions/mean_terminated_length": 1677.625, |
| "completions/min_length": 1535.0, |
| "completions/min_terminated_length": 1535.0, |
| "entropy": 0.2205460276454687, |
| "epoch": 0.0021, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7834408283233643, |
| "kl": 0.41426409501582384, |
| "learning_rate": 9.999991183360406e-06, |
| "loss": -0.1262, |
| "num_tokens": 3934186.0, |
| "reward": -1.3775752782821655, |
| "reward_std": 6.934841156005859, |
| "rewards/rollout_reward_func/mean": -1.3775752782821655, |
| "rewards/rollout_reward_func/std": 9.078507423400879, |
| "sampling/importance_sampling_ratio/max": 1.6336519718170166, |
| "sampling/importance_sampling_ratio/mean": 0.9894572496414185, |
| "sampling/importance_sampling_ratio/min": 0.09595068544149399, |
| "sampling/sampling_logp_difference/max": 1.2380528450012207, |
| "sampling/sampling_logp_difference/mean": 0.04348953068256378, |
| "step": 105, |
| "step_time": 37.434679850000975 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "entropy": 0.21956982091069221, |
| "epoch": 0.00212, |
| "grad_norm": 0.7701007723808289, |
| "kl": 0.435552092269063, |
| "learning_rate": 9.999990925953894e-06, |
| "loss": -0.1276, |
| "step": 106, |
| "step_time": 5.842552839000746 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1784.0, |
| "completions/max_terminated_length": 1784.0, |
| "completions/mean_length": 1683.90625, |
| "completions/mean_terminated_length": 1683.90625, |
| "completions/min_length": 1482.0, |
| "completions/min_terminated_length": 1482.0, |
| "entropy": 0.23681390658020973, |
| "epoch": 0.00214, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7701250314712524, |
| "kl": 0.675053995102644, |
| "learning_rate": 9.999990664843696e-06, |
| "loss": -0.2223, |
| "num_tokens": 4009166.0, |
| "reward": -3.5597405433654785, |
| "reward_std": 6.297412395477295, |
| "rewards/rollout_reward_func/mean": -3.5597405433654785, |
| "rewards/rollout_reward_func/std": 8.763272285461426, |
| "sampling/importance_sampling_ratio/max": 1.9215384721755981, |
| "sampling/importance_sampling_ratio/mean": 0.8674914836883545, |
| "sampling/importance_sampling_ratio/min": 0.052470579743385315, |
| "sampling/sampling_logp_difference/max": 1.6477103233337402, |
| "sampling/sampling_logp_difference/mean": 0.06225297600030899, |
| "step": 107, |
| "step_time": 38.8391734249999 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00390625, |
| "entropy": 0.23528996109962463, |
| "epoch": 0.00216, |
| "grad_norm": 0.7644266486167908, |
| "kl": 0.6335257366299629, |
| "learning_rate": 9.999990400029814e-06, |
| "loss": -0.2237, |
| "step": 108, |
| "step_time": 5.8248516069998 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1759.0, |
| "completions/max_terminated_length": 1759.0, |
| "completions/mean_length": 1650.09375, |
| "completions/mean_terminated_length": 1650.09375, |
| "completions/min_length": 1436.0, |
| "completions/min_terminated_length": 1436.0, |
| "entropy": 0.2473286334425211, |
| "epoch": 0.00218, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9451829791069031, |
| "kl": 0.3451357875019312, |
| "learning_rate": 9.999990131512245e-06, |
| "loss": -0.1751, |
| "num_tokens": 4083095.0, |
| "reward": -3.8237268924713135, |
| "reward_std": 5.033920764923096, |
| "rewards/rollout_reward_func/mean": -3.8237268924713135, |
| "rewards/rollout_reward_func/std": 6.137859344482422, |
| "sampling/importance_sampling_ratio/max": 1.8476587533950806, |
| "sampling/importance_sampling_ratio/mean": 1.0128724575042725, |
| "sampling/importance_sampling_ratio/min": 0.1566690355539322, |
| "sampling/sampling_logp_difference/max": 1.2108018398284912, |
| "sampling/sampling_logp_difference/mean": 0.03772260248661041, |
| "step": 109, |
| "step_time": 37.583575454999846 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.00390625, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00390625, |
| "entropy": 0.24676945246756077, |
| "epoch": 0.0022, |
| "grad_norm": 0.8795206546783447, |
| "kl": 0.3198219258338213, |
| "learning_rate": 9.999989859290995e-06, |
| "loss": -0.1785, |
| "step": 110, |
| "step_time": 6.2266275209995 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00390625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1813.0, |
| "completions/max_terminated_length": 1813.0, |
| "completions/mean_length": 1680.4375, |
| "completions/mean_terminated_length": 1680.4375, |
| "completions/min_length": 1474.0, |
| "completions/min_terminated_length": 1474.0, |
| "entropy": 0.2194829098880291, |
| "epoch": 0.00222, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7685462832450867, |
| "kl": 0.49289337545633316, |
| "learning_rate": 9.99998958336606e-06, |
| "loss": -0.1256, |
| "num_tokens": 4158181.0, |
| "reward": -5.0853400230407715, |
| "reward_std": 3.0313239097595215, |
| "rewards/rollout_reward_func/mean": -5.0853400230407715, |
| "rewards/rollout_reward_func/std": 4.776071548461914, |
| "sampling/importance_sampling_ratio/max": 1.744321584701538, |
| "sampling/importance_sampling_ratio/mean": 0.8627097606658936, |
| "sampling/importance_sampling_ratio/min": 0.1638958901166916, |
| "sampling/sampling_logp_difference/max": 1.4669370651245117, |
| "sampling/sampling_logp_difference/mean": 0.04983227327466011, |
| "step": 111, |
| "step_time": 37.50297103200319 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00390625, |
| "entropy": 0.2189310658723116, |
| "epoch": 0.00224, |
| "grad_norm": 0.7485074400901794, |
| "kl": 0.480657372623682, |
| "learning_rate": 9.999989303737442e-06, |
| "loss": -0.1253, |
| "step": 112, |
| "step_time": 5.845547954002541 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.00390625, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00390625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1802.0, |
| "completions/max_terminated_length": 1802.0, |
| "completions/mean_length": 1632.78125, |
| "completions/mean_terminated_length": 1632.78125, |
| "completions/min_length": 280.0, |
| "completions/min_terminated_length": 280.0, |
| "entropy": 0.25709761306643486, |
| "epoch": 0.00226, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.0180292129516602, |
| "kl": 0.28133167419582605, |
| "learning_rate": 9.999989020405141e-06, |
| "loss": -0.0265, |
| "num_tokens": 4231873.0, |
| "reward": -4.746943950653076, |
| "reward_std": 5.858287811279297, |
| "rewards/rollout_reward_func/mean": -4.746943950653076, |
| "rewards/rollout_reward_func/std": 8.34781551361084, |
| "sampling/importance_sampling_ratio/max": 1.501573920249939, |
| "sampling/importance_sampling_ratio/mean": 0.8685078024864197, |
| "sampling/importance_sampling_ratio/min": 0.08650124073028564, |
| "sampling/sampling_logp_difference/max": 1.4456124305725098, |
| "sampling/sampling_logp_difference/mean": 0.04686921089887619, |
| "step": 113, |
| "step_time": 35.9479068110013 |
| }, |
| { |
| "clip_ratio/high_max": 0.014062500093132257, |
| "clip_ratio/high_mean": 0.007031250046566129, |
| "clip_ratio/low_mean": 0.00390625, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.010937500046566129, |
| "entropy": 0.2569599896669388, |
| "epoch": 0.00228, |
| "grad_norm": 0.9531951546669006, |
| "kl": 0.2381299063563347, |
| "learning_rate": 9.999988733369157e-06, |
| "loss": -0.0309, |
| "step": 114, |
| "step_time": 6.549294945001748 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1823.0, |
| "completions/max_terminated_length": 1823.0, |
| "completions/mean_length": 1726.15625, |
| "completions/mean_terminated_length": 1726.15625, |
| "completions/min_length": 1611.0, |
| "completions/min_terminated_length": 1611.0, |
| "entropy": 0.23173769749701023, |
| "epoch": 0.0023, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 0.8843880891799927, |
| "kl": 0.37245292216539383, |
| "learning_rate": 9.999988442629489e-06, |
| "loss": 0.0203, |
| "num_tokens": 4308553.0, |
| "reward": -4.190584182739258, |
| "reward_std": 2.972388744354248, |
| "rewards/rollout_reward_func/mean": -4.190584182739258, |
| "rewards/rollout_reward_func/std": 5.20486307144165, |
| "sampling/importance_sampling_ratio/max": 1.6944365501403809, |
| "sampling/importance_sampling_ratio/mean": 0.9236411452293396, |
| "sampling/importance_sampling_ratio/min": 0.13073918223381042, |
| "sampling/sampling_logp_difference/max": 1.3462402820587158, |
| "sampling/sampling_logp_difference/mean": 0.052857253700494766, |
| "step": 115, |
| "step_time": 38.24738468399846 |
| }, |
| { |
| "clip_ratio/high_max": 0.0078125, |
| "clip_ratio/high_mean": 0.005859375, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0078125, |
| "entropy": 0.23898235149681568, |
| "epoch": 0.00232, |
| "grad_norm": 0.7934396266937256, |
| "kl": 0.37963598500937223, |
| "learning_rate": 9.99998814818614e-06, |
| "loss": 0.0198, |
| "step": 116, |
| "step_time": 5.872056240998063 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1798.0, |
| "completions/max_terminated_length": 1798.0, |
| "completions/mean_length": 1668.34375, |
| "completions/mean_terminated_length": 1668.34375, |
| "completions/min_length": 1463.0, |
| "completions/min_terminated_length": 1463.0, |
| "entropy": 0.2294948324561119, |
| "epoch": 0.00234, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2376160621643066, |
| "kl": 0.2936890870332718, |
| "learning_rate": 9.999987850039108e-06, |
| "loss": -0.1774, |
| "num_tokens": 4383556.0, |
| "reward": -3.607532501220703, |
| "reward_std": 5.6845526695251465, |
| "rewards/rollout_reward_func/mean": -3.607532501220703, |
| "rewards/rollout_reward_func/std": 8.885178565979004, |
| "sampling/importance_sampling_ratio/max": 2.275712251663208, |
| "sampling/importance_sampling_ratio/mean": 1.1071913242340088, |
| "sampling/importance_sampling_ratio/min": 0.16783574223518372, |
| "sampling/sampling_logp_difference/max": 1.2079877853393555, |
| "sampling/sampling_logp_difference/mean": 0.04779823124408722, |
| "step": 117, |
| "step_time": 37.413396432997615 |
| }, |
| { |
| "clip_ratio/high_max": 0.01171875, |
| "clip_ratio/high_mean": 0.005859375, |
| "clip_ratio/low_mean": 0.00390625, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.009765625, |
| "entropy": 0.23724446073174477, |
| "epoch": 0.00236, |
| "grad_norm": 1.1334284543991089, |
| "kl": 0.257572659291327, |
| "learning_rate": 9.999987548188395e-06, |
| "loss": -0.1829, |
| "step": 118, |
| "step_time": 5.831991403998472 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1806.0, |
| "completions/max_terminated_length": 1806.0, |
| "completions/mean_length": 1690.75, |
| "completions/mean_terminated_length": 1690.75, |
| "completions/min_length": 1531.0, |
| "completions/min_terminated_length": 1531.0, |
| "entropy": 0.27842383086681366, |
| "epoch": 0.00238, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7958987355232239, |
| "kl": 0.3204949628561735, |
| "learning_rate": 9.999987242634e-06, |
| "loss": -0.104, |
| "num_tokens": 4458969.0, |
| "reward": -8.779083251953125, |
| "reward_std": 6.466405868530273, |
| "rewards/rollout_reward_func/mean": -8.779083251953125, |
| "rewards/rollout_reward_func/std": 12.54110336303711, |
| "sampling/importance_sampling_ratio/max": 1.7512174844741821, |
| "sampling/importance_sampling_ratio/mean": 0.9728891849517822, |
| "sampling/importance_sampling_ratio/min": 0.23163382709026337, |
| "sampling/sampling_logp_difference/max": 1.429762363433838, |
| "sampling/sampling_logp_difference/mean": 0.059581462293863297, |
| "step": 119, |
| "step_time": 37.53063563500109 |
| }, |
| { |
| "clip_ratio/high_max": 0.0078125, |
| "clip_ratio/high_mean": 0.005859375, |
| "clip_ratio/low_mean": 0.00390625, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.009765625, |
| "entropy": 0.2830961886793375, |
| "epoch": 0.0024, |
| "grad_norm": 0.7229918837547302, |
| "kl": 0.32604870945215225, |
| "learning_rate": 9.999986933375924e-06, |
| "loss": -0.107, |
| "step": 120, |
| "step_time": 6.557443951995083 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 1802.0, |
| "completions/max_terminated_length": 1802.0, |
| "completions/mean_length": 1673.0625, |
| "completions/mean_terminated_length": 1669.4515380859375, |
| "completions/min_length": 1072.0, |
| "completions/min_terminated_length": 1072.0, |
| "entropy": 0.3067500479519367, |
| "epoch": 0.00242, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1562621593475342, |
| "kl": 0.21205937396734953, |
| "learning_rate": 9.999986620414169e-06, |
| "loss": -0.1476, |
| "num_tokens": 4533539.0, |
| "reward": -5.477802276611328, |
| "reward_std": 3.7002973556518555, |
| "rewards/rollout_reward_func/mean": -5.477802276611328, |
| "rewards/rollout_reward_func/std": 4.9684367179870605, |
| "sampling/importance_sampling_ratio/max": 2.1120123863220215, |
| "sampling/importance_sampling_ratio/mean": 1.002963900566101, |
| "sampling/importance_sampling_ratio/min": 0.1644321084022522, |
| "sampling/sampling_logp_difference/max": 0.7201070785522461, |
| "sampling/sampling_logp_difference/mean": 0.044744931161403656, |
| "step": 121, |
| "step_time": 37.94513329599977 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "entropy": 0.30317614786326885, |
| "epoch": 0.00244, |
| "grad_norm": 1.1647439002990723, |
| "kl": 0.2262652236968279, |
| "learning_rate": 9.999986303748731e-06, |
| "loss": -0.1508, |
| "step": 122, |
| "step_time": 5.857959121001841 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1785.0, |
| "completions/max_terminated_length": 1785.0, |
| "completions/mean_length": 1429.46875, |
| "completions/mean_terminated_length": 1429.46875, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.22994763404130936, |
| "epoch": 0.00246, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.0889474153518677, |
| "kl": 0.7471896391361952, |
| "learning_rate": 9.999985983379614e-06, |
| "loss": -0.1179, |
| "num_tokens": 4600314.0, |
| "reward": -1.682168960571289, |
| "reward_std": 8.262513160705566, |
| "rewards/rollout_reward_func/mean": -1.682168960571289, |
| "rewards/rollout_reward_func/std": 16.95815658569336, |
| "sampling/importance_sampling_ratio/max": 2.0507054328918457, |
| "sampling/importance_sampling_ratio/mean": 0.9804076552391052, |
| "sampling/importance_sampling_ratio/min": 0.13134591281414032, |
| "sampling/sampling_logp_difference/max": 1.8472480773925781, |
| "sampling/sampling_logp_difference/mean": 0.06341268122196198, |
| "step": 123, |
| "step_time": 33.21367415700297 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.005859375, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005859375, |
| "entropy": 0.22646107524633408, |
| "epoch": 0.00248, |
| "grad_norm": 1.079579472541809, |
| "kl": 0.8753251153975725, |
| "learning_rate": 9.999985659306817e-06, |
| "loss": -0.121, |
| "step": 124, |
| "step_time": 5.792652656999053 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1804.0, |
| "completions/max_terminated_length": 1804.0, |
| "completions/mean_length": 1706.4375, |
| "completions/mean_terminated_length": 1706.4375, |
| "completions/min_length": 1559.0, |
| "completions/min_terminated_length": 1559.0, |
| "entropy": 0.282099112868309, |
| "epoch": 0.0025, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3554356098175049, |
| "kl": 0.2657380551099777, |
| "learning_rate": 9.999985331530339e-06, |
| "loss": -0.0685, |
| "num_tokens": 4676158.0, |
| "reward": 1.4572546482086182, |
| "reward_std": 7.827357292175293, |
| "rewards/rollout_reward_func/mean": 1.4572546482086182, |
| "rewards/rollout_reward_func/std": 8.701656341552734, |
| "sampling/importance_sampling_ratio/max": 2.6973798274993896, |
| "sampling/importance_sampling_ratio/mean": 0.9617570638656616, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 1.366655945777893, |
| "sampling/sampling_logp_difference/mean": 0.06495144963264465, |
| "step": 125, |
| "step_time": 38.350035333998676 |
| }, |
| { |
| "clip_ratio/high_max": 0.01171875, |
| "clip_ratio/high_mean": 0.005859375, |
| "clip_ratio/low_mean": 0.005859375, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.01171875, |
| "entropy": 0.2868986092507839, |
| "epoch": 0.00252, |
| "grad_norm": 1.1578820943832397, |
| "kl": 0.2646348997950554, |
| "learning_rate": 9.999985000050181e-06, |
| "loss": -0.0723, |
| "step": 126, |
| "step_time": 6.268096281999533 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1826.0, |
| "completions/max_terminated_length": 1826.0, |
| "completions/mean_length": 1739.8125, |
| "completions/mean_terminated_length": 1739.8125, |
| "completions/min_length": 1644.0, |
| "completions/min_terminated_length": 1644.0, |
| "entropy": 0.26093689538538456, |
| "epoch": 0.00254, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2453805208206177, |
| "kl": 0.4399577025324106, |
| "learning_rate": 9.999984664866347e-06, |
| "loss": -0.0086, |
| "num_tokens": 4753406.0, |
| "reward": -2.7315988540649414, |
| "reward_std": 4.536945343017578, |
| "rewards/rollout_reward_func/mean": -2.7315988540649414, |
| "rewards/rollout_reward_func/std": 7.6850104331970215, |
| "sampling/importance_sampling_ratio/max": 2.3371880054473877, |
| "sampling/importance_sampling_ratio/mean": 1.0783873796463013, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 1.2284293174743652, |
| "sampling/sampling_logp_difference/mean": 0.04868567734956741, |
| "step": 127, |
| "step_time": 37.24148940499981 |
| }, |
| { |
| "clip_ratio/high_max": 0.01171875, |
| "clip_ratio/high_mean": 0.0078125, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.009765625, |
| "entropy": 0.26575652323663235, |
| "epoch": 0.00256, |
| "grad_norm": 1.235435962677002, |
| "kl": 0.4320835890248418, |
| "learning_rate": 9.999984325978833e-06, |
| "loss": -0.0116, |
| "step": 128, |
| "step_time": 5.902758816999267 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1805.0, |
| "completions/max_terminated_length": 1805.0, |
| "completions/mean_length": 1686.875, |
| "completions/mean_terminated_length": 1686.875, |
| "completions/min_length": 1159.0, |
| "completions/min_terminated_length": 1159.0, |
| "entropy": 0.3018810376524925, |
| "epoch": 0.00258, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9894475936889648, |
| "kl": 0.32896456494927406, |
| "learning_rate": 9.99998398338764e-06, |
| "loss": -0.089, |
| "num_tokens": 4828478.0, |
| "reward": -1.9806309938430786, |
| "reward_std": 5.783495903015137, |
| "rewards/rollout_reward_func/mean": -1.9806309938430786, |
| "rewards/rollout_reward_func/std": 9.691821098327637, |
| "sampling/importance_sampling_ratio/max": 2.419085741043091, |
| "sampling/importance_sampling_ratio/mean": 0.9663759469985962, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 1.6888089179992676, |
| "sampling/sampling_logp_difference/mean": 0.06528542190790176, |
| "step": 129, |
| "step_time": 37.874985575997925 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.00390625, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005859375, |
| "entropy": 0.3036986291408539, |
| "epoch": 0.0026, |
| "grad_norm": 0.9928931593894958, |
| "kl": 0.33502755127847195, |
| "learning_rate": 9.99998363709277e-06, |
| "loss": -0.0895, |
| "step": 130, |
| "step_time": 5.84719717599728 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1772.0, |
| "completions/max_terminated_length": 1772.0, |
| "completions/mean_length": 1671.9375, |
| "completions/mean_terminated_length": 1671.9375, |
| "completions/min_length": 1508.0, |
| "completions/min_terminated_length": 1508.0, |
| "entropy": 0.276044437661767, |
| "epoch": 0.00262, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 0.8475321531295776, |
| "kl": 0.3156882934272289, |
| "learning_rate": 9.999983287094222e-06, |
| "loss": -0.0246, |
| "num_tokens": 4903229.0, |
| "reward": -4.237326145172119, |
| "reward_std": 7.174188613891602, |
| "rewards/rollout_reward_func/mean": -4.237326145172119, |
| "rewards/rollout_reward_func/std": 13.78705883026123, |
| "sampling/importance_sampling_ratio/max": 2.0592706203460693, |
| "sampling/importance_sampling_ratio/mean": 0.9671538472175598, |
| "sampling/importance_sampling_ratio/min": 0.17537109553813934, |
| "sampling/sampling_logp_difference/max": 1.267343282699585, |
| "sampling/sampling_logp_difference/mean": 0.05497532710433006, |
| "step": 131, |
| "step_time": 38.15398663500309 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00390625, |
| "entropy": 0.27523134648799896, |
| "epoch": 0.00264, |
| "grad_norm": 0.8275483250617981, |
| "kl": 0.3333571758121252, |
| "learning_rate": 9.999982933391998e-06, |
| "loss": -0.0265, |
| "step": 132, |
| "step_time": 6.213580482999532 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1787.0, |
| "completions/max_terminated_length": 1787.0, |
| "completions/mean_length": 1690.6875, |
| "completions/mean_terminated_length": 1690.6875, |
| "completions/min_length": 1552.0, |
| "completions/min_terminated_length": 1552.0, |
| "entropy": 0.31048087403178215, |
| "epoch": 0.00266, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9148619771003723, |
| "kl": 0.5922599658370018, |
| "learning_rate": 9.999982575986095e-06, |
| "loss": -0.1689, |
| "num_tokens": 4978651.0, |
| "reward": -7.106402397155762, |
| "reward_std": 7.335752487182617, |
| "rewards/rollout_reward_func/mean": -7.106402397155762, |
| "rewards/rollout_reward_func/std": 10.287908554077148, |
| "sampling/importance_sampling_ratio/max": 2.353391170501709, |
| "sampling/importance_sampling_ratio/mean": 0.7480576038360596, |
| "sampling/importance_sampling_ratio/min": 0.10871558636426926, |
| "sampling/sampling_logp_difference/max": 1.7691650390625, |
| "sampling/sampling_logp_difference/mean": 0.08270461857318878, |
| "step": 133, |
| "step_time": 37.87937480399705 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.005859375, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0078125, |
| "entropy": 0.3103002533316612, |
| "epoch": 0.00268, |
| "grad_norm": 0.8720380067825317, |
| "kl": 0.596416313201189, |
| "learning_rate": 9.999982214876516e-06, |
| "loss": -0.1711, |
| "step": 134, |
| "step_time": 5.838397514999087 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1810.0, |
| "completions/max_terminated_length": 1810.0, |
| "completions/mean_length": 1734.5, |
| "completions/mean_terminated_length": 1734.5, |
| "completions/min_length": 1659.0, |
| "completions/min_terminated_length": 1659.0, |
| "entropy": 0.296902384608984, |
| "epoch": 0.0027, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.5514373779296875, |
| "kl": 0.32854650542140007, |
| "learning_rate": 9.999981850063262e-06, |
| "loss": -0.2692, |
| "num_tokens": 5055484.0, |
| "reward": 2.0809240341186523, |
| "reward_std": 5.269416809082031, |
| "rewards/rollout_reward_func/mean": 2.0809240341186523, |
| "rewards/rollout_reward_func/std": 7.5051188468933105, |
| "sampling/importance_sampling_ratio/max": 2.7047743797302246, |
| "sampling/importance_sampling_ratio/mean": 1.0847183465957642, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 1.3921051025390625, |
| "sampling/sampling_logp_difference/mean": 0.07780618220567703, |
| "step": 135, |
| "step_time": 37.11384174500017 |
| }, |
| { |
| "clip_ratio/high_max": 0.01171875, |
| "clip_ratio/high_mean": 0.005859375, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0078125, |
| "entropy": 0.2912406101822853, |
| "epoch": 0.00272, |
| "grad_norm": 1.203852653503418, |
| "kl": 0.34848837181925774, |
| "learning_rate": 9.99998148154633e-06, |
| "loss": -0.2722, |
| "step": 136, |
| "step_time": 6.586184759000389 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1788.0, |
| "completions/max_terminated_length": 1788.0, |
| "completions/mean_length": 1674.96875, |
| "completions/mean_terminated_length": 1674.96875, |
| "completions/min_length": 1567.0, |
| "completions/min_terminated_length": 1567.0, |
| "entropy": 0.2682835068553686, |
| "epoch": 0.00274, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.322084903717041, |
| "kl": 0.4422433190047741, |
| "learning_rate": 9.999981109325725e-06, |
| "loss": 0.0099, |
| "num_tokens": 5130363.0, |
| "reward": -0.8909265995025635, |
| "reward_std": 6.19950008392334, |
| "rewards/rollout_reward_func/mean": -0.8909265995025635, |
| "rewards/rollout_reward_func/std": 11.643590927124023, |
| "sampling/importance_sampling_ratio/max": 2.353513717651367, |
| "sampling/importance_sampling_ratio/mean": 1.027718186378479, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 0.9235548973083496, |
| "sampling/sampling_logp_difference/mean": 0.05995417386293411, |
| "step": 137, |
| "step_time": 37.823591190001025 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.2614995054900646, |
| "epoch": 0.00276, |
| "grad_norm": 1.342252254486084, |
| "kl": 0.4471647199243307, |
| "learning_rate": 9.999980733401442e-06, |
| "loss": 0.0087, |
| "step": 138, |
| "step_time": 5.804548355996303 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1774.0, |
| "completions/max_terminated_length": 1774.0, |
| "completions/mean_length": 1635.40625, |
| "completions/mean_terminated_length": 1635.40625, |
| "completions/min_length": 637.0, |
| "completions/min_terminated_length": 637.0, |
| "entropy": 0.28644070588052273, |
| "epoch": 0.00278, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.5117998123168945, |
| "kl": 0.37019682209938765, |
| "learning_rate": 9.999980353773486e-06, |
| "loss": -0.0649, |
| "num_tokens": 5204087.0, |
| "reward": -2.6386919021606445, |
| "reward_std": 7.861667156219482, |
| "rewards/rollout_reward_func/mean": -2.6386919021606445, |
| "rewards/rollout_reward_func/std": 9.34830093383789, |
| "sampling/importance_sampling_ratio/max": 2.392774820327759, |
| "sampling/importance_sampling_ratio/mean": 0.8888267278671265, |
| "sampling/importance_sampling_ratio/min": 0.06766009330749512, |
| "sampling/sampling_logp_difference/max": 0.964139461517334, |
| "sampling/sampling_logp_difference/mean": 0.07269679009914398, |
| "step": 139, |
| "step_time": 36.273601793001944 |
| }, |
| { |
| "clip_ratio/high_max": 0.02734375, |
| "clip_ratio/high_mean": 0.013671875, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.015625, |
| "entropy": 0.2876611240208149, |
| "epoch": 0.0028, |
| "grad_norm": 1.0806161165237427, |
| "kl": 0.3748003738000989, |
| "learning_rate": 9.999979970441856e-06, |
| "loss": -0.066, |
| "step": 140, |
| "step_time": 5.793748694000897 |
| }, |
| { |
| "clip_ratio/high_max": 0.0078125, |
| "clip_ratio/high_mean": 0.00390625, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00390625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1778.0, |
| "completions/max_terminated_length": 1778.0, |
| "completions/mean_length": 1630.625, |
| "completions/mean_terminated_length": 1630.625, |
| "completions/min_length": 1075.0, |
| "completions/min_terminated_length": 1075.0, |
| "entropy": 0.2753357030451298, |
| "epoch": 0.00282, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2243664264678955, |
| "kl": 0.3961847685277462, |
| "learning_rate": 9.999979583406551e-06, |
| "loss": -0.1401, |
| "num_tokens": 5277342.0, |
| "reward": -0.9446412324905396, |
| "reward_std": 6.179128646850586, |
| "rewards/rollout_reward_func/mean": -0.9446412324905396, |
| "rewards/rollout_reward_func/std": 7.662261009216309, |
| "sampling/importance_sampling_ratio/max": 2.07578706741333, |
| "sampling/importance_sampling_ratio/mean": 0.9215522408485413, |
| "sampling/importance_sampling_ratio/min": 0.17828358709812164, |
| "sampling/sampling_logp_difference/max": 1.095733880996704, |
| "sampling/sampling_logp_difference/mean": 0.06696178764104843, |
| "step": 141, |
| "step_time": 37.386809142004495 |
| }, |
| { |
| "clip_ratio/high_max": 0.01171875, |
| "clip_ratio/high_mean": 0.005859375, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005859375, |
| "entropy": 0.2797367610037327, |
| "epoch": 0.00284, |
| "grad_norm": 1.1719934940338135, |
| "kl": 0.36871890537440777, |
| "learning_rate": 9.999979192667574e-06, |
| "loss": -0.1444, |
| "step": 142, |
| "step_time": 6.457391539001037 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1805.0, |
| "completions/max_terminated_length": 1805.0, |
| "completions/mean_length": 1678.625, |
| "completions/mean_terminated_length": 1678.625, |
| "completions/min_length": 1457.0, |
| "completions/min_terminated_length": 1457.0, |
| "entropy": 0.2741607278585434, |
| "epoch": 0.00286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8713260293006897, |
| "kl": 0.43569475039839745, |
| "learning_rate": 9.999978798224922e-06, |
| "loss": -0.138, |
| "num_tokens": 5352366.0, |
| "reward": 0.8101233839988708, |
| "reward_std": 3.6327834129333496, |
| "rewards/rollout_reward_func/mean": 0.8101233839988708, |
| "rewards/rollout_reward_func/std": 7.983520030975342, |
| "sampling/importance_sampling_ratio/max": 2.63127064704895, |
| "sampling/importance_sampling_ratio/mean": 0.9456866979598999, |
| "sampling/importance_sampling_ratio/min": 0.17168530821800232, |
| "sampling/sampling_logp_difference/max": 1.6172382831573486, |
| "sampling/sampling_logp_difference/mean": 0.05936865881085396, |
| "step": 143, |
| "step_time": 38.01681525400272 |
| }, |
| { |
| "clip_ratio/high_max": 0.00390625, |
| "clip_ratio/high_mean": 0.001953125, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "entropy": 0.28014545887708664, |
| "epoch": 0.00288, |
| "grad_norm": 0.8923928737640381, |
| "kl": 0.40905678272247314, |
| "learning_rate": 9.999978400078598e-06, |
| "loss": -0.1408, |
| "step": 144, |
| "step_time": 5.900416173997655 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1773.0, |
| "completions/max_terminated_length": 1773.0, |
| "completions/mean_length": 1679.46875, |
| "completions/mean_terminated_length": 1679.46875, |
| "completions/min_length": 1426.0, |
| "completions/min_terminated_length": 1426.0, |
| "entropy": 0.28977033123373985, |
| "epoch": 0.0029, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2021753787994385, |
| "kl": 0.9492019787430763, |
| "learning_rate": 9.9999779982286e-06, |
| "loss": -0.1518, |
| "num_tokens": 5427370.0, |
| "reward": -1.4256936311721802, |
| "reward_std": 5.131044387817383, |
| "rewards/rollout_reward_func/mean": -1.4256936311721802, |
| "rewards/rollout_reward_func/std": 6.864547252655029, |
| "sampling/importance_sampling_ratio/max": 2.061087131500244, |
| "sampling/importance_sampling_ratio/mean": 0.7186421155929565, |
| "sampling/importance_sampling_ratio/min": 0.03768601268529892, |
| "sampling/sampling_logp_difference/max": 1.7941226959228516, |
| "sampling/sampling_logp_difference/mean": 0.0829053670167923, |
| "step": 145, |
| "step_time": 35.9784139159965 |
| }, |
| { |
| "clip_ratio/high_max": 0.0078125, |
| "clip_ratio/high_mean": 0.00390625, |
| "clip_ratio/low_mean": 0.001953125, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005859375, |
| "entropy": 0.2919359765946865, |
| "epoch": 0.00292, |
| "grad_norm": 0.8570596575737, |
| "kl": 0.9193199034780264, |
| "learning_rate": 9.999977592674933e-06, |
| "loss": -0.1533, |
| "step": 146, |
| "step_time": 5.8281254610010365 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1779.0, |
| "completions/max_terminated_length": 1779.0, |
| "completions/mean_length": 1666.84375, |
| "completions/mean_terminated_length": 1666.84375, |
| "completions/min_length": 1414.0, |
| "completions/min_terminated_length": 1414.0, |
| "entropy": 0.28393640369176865, |
| "epoch": 0.00294, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9220053553581238, |
| "kl": 0.4149657338857651, |
| "learning_rate": 9.999977183417593e-06, |
| "loss": -0.1243, |
| "num_tokens": 5502317.0, |
| "reward": -4.950237274169922, |
| "reward_std": 7.430306434631348, |
| "rewards/rollout_reward_func/mean": -4.950237274169922, |
| "rewards/rollout_reward_func/std": 8.644596099853516, |
| "sampling/importance_sampling_ratio/max": 1.6269994974136353, |
| "sampling/importance_sampling_ratio/mean": 0.8315407633781433, |
| "sampling/importance_sampling_ratio/min": 0.07847892493009567, |
| "sampling/sampling_logp_difference/max": 1.8345155715942383, |
| "sampling/sampling_logp_difference/mean": 0.07923141121864319, |
| "step": 147, |
| "step_time": 37.406879453998044 |
| }, |
| { |
| "clip_ratio/high_max": 0.0078125, |
| "clip_ratio/high_mean": 0.00390625, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00390625, |
| "entropy": 0.28766966238617897, |
| "epoch": 0.00296, |
| "grad_norm": 0.9152698516845703, |
| "kl": 0.3896927610039711, |
| "learning_rate": 9.999976770456581e-06, |
| "loss": -0.126, |
| "step": 148, |
| "step_time": 6.265031434000775 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.00390625, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00390625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1810.0, |
| "completions/max_terminated_length": 1810.0, |
| "completions/mean_length": 1691.28125, |
| "completions/mean_terminated_length": 1691.28125, |
| "completions/min_length": 1595.0, |
| "completions/min_terminated_length": 1595.0, |
| "entropy": 0.251856479793787, |
| "epoch": 0.00298, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9210507869720459, |
| "kl": 0.4119174964725971, |
| "learning_rate": 9.999976353791898e-06, |
| "loss": -0.1814, |
| "num_tokens": 5577778.0, |
| "reward": -1.1243976354599, |
| "reward_std": 5.285574436187744, |
| "rewards/rollout_reward_func/mean": -1.1243976354599, |
| "rewards/rollout_reward_func/std": 7.979835510253906, |
| "sampling/importance_sampling_ratio/max": 2.8175506591796875, |
| "sampling/importance_sampling_ratio/mean": 1.1608731746673584, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 1.2149560451507568, |
| "sampling/sampling_logp_difference/mean": 0.06952120363712311, |
| "step": 149, |
| "step_time": 37.22757341200122 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.00390625, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00390625, |
| "entropy": 0.25096623599529266, |
| "epoch": 0.003, |
| "grad_norm": 0.9395397305488586, |
| "kl": 0.4342615343630314, |
| "learning_rate": 9.999975933423546e-06, |
| "loss": -0.184, |
| "step": 150, |
| "step_time": 5.87532146200283 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 100000, |
| "num_input_tokens_seen": 5577778, |
| "num_train_epochs": 2, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|