instruct_1b46e25 / trainer_state.json
bimabk's picture
Upload task output 1
3c016c2 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.003,
"eval_steps": 500,
"global_step": 150,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1775.0,
"completions/max_terminated_length": 1775.0,
"completions/mean_length": 1624.96875,
"completions/mean_terminated_length": 1624.96875,
"completions/min_length": 1388.0,
"completions/min_terminated_length": 1388.0,
"entropy": 0.5600852482020855,
"epoch": 2e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3755558729171753,
"kl": 0.0,
"learning_rate": 0.0,
"loss": -0.0592,
"num_tokens": 73247.0,
"reward": -12.172411918640137,
"reward_std": 7.601527214050293,
"rewards/rollout_reward_func/mean": -12.172411918640137,
"rewards/rollout_reward_func/std": 10.38169002532959,
"sampling/importance_sampling_ratio/max": 1.408553123474121,
"sampling/importance_sampling_ratio/mean": 0.9712058901786804,
"sampling/importance_sampling_ratio/min": 0.6454448103904724,
"sampling/sampling_logp_difference/max": 0.22739958763122559,
"sampling/sampling_logp_difference/mean": 0.016150973737239838,
"step": 1,
"step_time": 36.755565460999605
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5600852482020855,
"epoch": 4e-05,
"grad_norm": 1.3615893125534058,
"kl": 0.0,
"learning_rate": 2.8571428571428575e-07,
"loss": -0.0592,
"step": 2,
"step_time": 5.746241367000948
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1753.0,
"completions/max_terminated_length": 1753.0,
"completions/mean_length": 1628.96875,
"completions/mean_terminated_length": 1628.96875,
"completions/min_length": 1271.0,
"completions/min_terminated_length": 1271.0,
"entropy": 0.5380602143704891,
"epoch": 6e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4941445589065552,
"kl": 0.0005305010126903653,
"learning_rate": 5.714285714285715e-07,
"loss": 0.006,
"num_tokens": 146725.0,
"reward": -8.265422821044922,
"reward_std": 8.979022026062012,
"rewards/rollout_reward_func/mean": -8.265422821044922,
"rewards/rollout_reward_func/std": 13.061026573181152,
"sampling/importance_sampling_ratio/max": 1.2190126180648804,
"sampling/importance_sampling_ratio/mean": 0.9876266121864319,
"sampling/importance_sampling_ratio/min": 0.5881595015525818,
"sampling/sampling_logp_difference/max": 0.45802879333496094,
"sampling/sampling_logp_difference/mean": 0.014619816094636917,
"step": 3,
"step_time": 36.527911828999095
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5377197824418545,
"epoch": 8e-05,
"grad_norm": 1.4071228504180908,
"kl": 0.0005172143501113169,
"learning_rate": 8.571428571428572e-07,
"loss": 0.0058,
"step": 4,
"step_time": 5.69982043300206
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1754.0,
"completions/max_terminated_length": 1754.0,
"completions/mean_length": 1595.15625,
"completions/mean_terminated_length": 1595.15625,
"completions/min_length": 1299.0,
"completions/min_terminated_length": 1299.0,
"entropy": 0.539891816675663,
"epoch": 0.0001,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4629162549972534,
"kl": 0.0005519717960851267,
"learning_rate": 1.142857142857143e-06,
"loss": 0.0243,
"num_tokens": 219002.0,
"reward": -14.256836891174316,
"reward_std": 9.0944185256958,
"rewards/rollout_reward_func/mean": -14.256836891174316,
"rewards/rollout_reward_func/std": 12.482532501220703,
"sampling/importance_sampling_ratio/max": 1.6900306940078735,
"sampling/importance_sampling_ratio/mean": 1.0195035934448242,
"sampling/importance_sampling_ratio/min": 0.8020860552787781,
"sampling/sampling_logp_difference/max": 0.25893688201904297,
"sampling/sampling_logp_difference/mean": 0.016118371859192848,
"step": 5,
"step_time": 38.65034878200095
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5395300313830376,
"epoch": 0.00012,
"grad_norm": 1.5035072565078735,
"kl": 0.0006097570294514298,
"learning_rate": 1.4285714285714286e-06,
"loss": 0.0242,
"step": 6,
"step_time": 5.672908046001794
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0031250000465661287,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0031250000465661287,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1782.0,
"completions/max_terminated_length": 1782.0,
"completions/mean_length": 1521.90625,
"completions/mean_terminated_length": 1521.90625,
"completions/min_length": 723.0,
"completions/min_terminated_length": 723.0,
"entropy": 0.5277910158038139,
"epoch": 0.00014,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2564315795898438,
"kl": 0.0006845891803095583,
"learning_rate": 1.7142857142857145e-06,
"loss": 0.0223,
"num_tokens": 288633.0,
"reward": -16.092445373535156,
"reward_std": 8.75448989868164,
"rewards/rollout_reward_func/mean": -16.092445373535156,
"rewards/rollout_reward_func/std": 15.618288040161133,
"sampling/importance_sampling_ratio/max": 1.5459660291671753,
"sampling/importance_sampling_ratio/mean": 1.024022102355957,
"sampling/importance_sampling_ratio/min": 0.7249171733856201,
"sampling/sampling_logp_difference/max": 0.29637718200683594,
"sampling/sampling_logp_difference/mean": 0.018320683389902115,
"step": 7,
"step_time": 32.748348556002384
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"entropy": 0.5283261835575104,
"epoch": 0.00016,
"grad_norm": 1.2439873218536377,
"kl": 0.0006405085659935139,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0223,
"step": 8,
"step_time": 5.7717726080009015
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1818.0,
"completions/max_terminated_length": 1818.0,
"completions/mean_length": 1608.375,
"completions/mean_terminated_length": 1608.375,
"completions/min_length": 305.0,
"completions/min_terminated_length": 305.0,
"entropy": 0.5326173529028893,
"epoch": 0.00018,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9701208472251892,
"kl": 0.0007356673304457217,
"learning_rate": 2.285714285714286e-06,
"loss": 0.012,
"num_tokens": 361374.0,
"reward": -6.8430585861206055,
"reward_std": 12.837440490722656,
"rewards/rollout_reward_func/mean": -6.8430585861206055,
"rewards/rollout_reward_func/std": 17.0405216217041,
"sampling/importance_sampling_ratio/max": 1.2777214050292969,
"sampling/importance_sampling_ratio/mean": 0.9900251626968384,
"sampling/importance_sampling_ratio/min": 0.6748403310775757,
"sampling/sampling_logp_difference/max": 0.3269679546356201,
"sampling/sampling_logp_difference/mean": 0.0145448949187994,
"step": 9,
"step_time": 34.28418227700058
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5334620848298073,
"epoch": 0.0002,
"grad_norm": 1.0108616352081299,
"kl": 0.0005742738721892238,
"learning_rate": 2.571428571428571e-06,
"loss": 0.012,
"step": 10,
"step_time": 6.918311860000358
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1797.0,
"completions/max_terminated_length": 1797.0,
"completions/mean_length": 1687.25,
"completions/mean_terminated_length": 1687.25,
"completions/min_length": 1423.0,
"completions/min_terminated_length": 1423.0,
"entropy": 0.5696082189679146,
"epoch": 0.00022,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1935234069824219,
"kl": 0.0007269367633853108,
"learning_rate": 2.8571428571428573e-06,
"loss": -0.0013,
"num_tokens": 436067.0,
"reward": -9.953402519226074,
"reward_std": 9.885331153869629,
"rewards/rollout_reward_func/mean": -9.953402519226074,
"rewards/rollout_reward_func/std": 11.941234588623047,
"sampling/importance_sampling_ratio/max": 1.3005088567733765,
"sampling/importance_sampling_ratio/mean": 0.9863357543945312,
"sampling/importance_sampling_ratio/min": 0.7671698927879333,
"sampling/sampling_logp_difference/max": 0.1938610076904297,
"sampling/sampling_logp_difference/mean": 0.016408588737249374,
"step": 11,
"step_time": 36.470574425999075
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"entropy": 0.5690113827586174,
"epoch": 0.00024,
"grad_norm": 1.1512264013290405,
"kl": 0.0009323725680587813,
"learning_rate": 3.142857142857143e-06,
"loss": -0.0002,
"step": 12,
"step_time": 5.875566556000194
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1832.0,
"completions/max_terminated_length": 1832.0,
"completions/mean_length": 1571.625,
"completions/mean_terminated_length": 1571.625,
"completions/min_length": 262.0,
"completions/min_terminated_length": 262.0,
"entropy": 0.5204437598586082,
"epoch": 0.00026,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1478338241577148,
"kl": 0.0008145252213580534,
"learning_rate": 3.428571428571429e-06,
"loss": 0.0344,
"num_tokens": 507640.0,
"reward": -5.592676162719727,
"reward_std": 11.350366592407227,
"rewards/rollout_reward_func/mean": -5.592676162719727,
"rewards/rollout_reward_func/std": 16.42201805114746,
"sampling/importance_sampling_ratio/max": 1.401992917060852,
"sampling/importance_sampling_ratio/mean": 1.043225884437561,
"sampling/importance_sampling_ratio/min": 0.7300771474838257,
"sampling/sampling_logp_difference/max": 0.24753212928771973,
"sampling/sampling_logp_difference/mean": 0.016798537224531174,
"step": 13,
"step_time": 34.95688835600049
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.5211725942790508,
"epoch": 0.00028,
"grad_norm": 1.1339523792266846,
"kl": 0.001084248440747615,
"learning_rate": 3.7142857142857146e-06,
"loss": 0.0358,
"step": 14,
"step_time": 5.896424438000395
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1777.0,
"completions/max_terminated_length": 1777.0,
"completions/mean_length": 1681.0625,
"completions/mean_terminated_length": 1681.0625,
"completions/min_length": 1441.0,
"completions/min_terminated_length": 1441.0,
"entropy": 0.5041001103818417,
"epoch": 0.0003,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4473488330841064,
"kl": 0.0014922630070941523,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0171,
"num_tokens": 582735.0,
"reward": -6.595297336578369,
"reward_std": 7.1833062171936035,
"rewards/rollout_reward_func/mean": -6.595297336578369,
"rewards/rollout_reward_func/std": 9.555194854736328,
"sampling/importance_sampling_ratio/max": 1.368825078010559,
"sampling/importance_sampling_ratio/mean": 0.9549809098243713,
"sampling/importance_sampling_ratio/min": 0.7357600331306458,
"sampling/sampling_logp_difference/max": 0.22634148597717285,
"sampling/sampling_logp_difference/mean": 0.015938639640808105,
"step": 15,
"step_time": 36.23906176099899
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"entropy": 0.5047293417155743,
"epoch": 0.00032,
"grad_norm": 1.4626593589782715,
"kl": 0.001966523894225247,
"learning_rate": 4.2857142857142855e-06,
"loss": -0.0206,
"step": 16,
"step_time": 7.021587903999716
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1747.0,
"completions/max_terminated_length": 1747.0,
"completions/mean_length": 1580.9375,
"completions/mean_terminated_length": 1580.9375,
"completions/min_length": 903.0,
"completions/min_terminated_length": 903.0,
"entropy": 0.5249488092958927,
"epoch": 0.00034,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4577637910842896,
"kl": 0.003107522992650047,
"learning_rate": 4.571428571428572e-06,
"loss": -0.062,
"num_tokens": 655209.0,
"reward": -11.51949691772461,
"reward_std": 5.783572673797607,
"rewards/rollout_reward_func/mean": -11.51949691772461,
"rewards/rollout_reward_func/std": 6.822617530822754,
"sampling/importance_sampling_ratio/max": 1.3054317235946655,
"sampling/importance_sampling_ratio/mean": 0.9661756753921509,
"sampling/importance_sampling_ratio/min": 0.7799487709999084,
"sampling/sampling_logp_difference/max": 0.15545654296875,
"sampling/sampling_logp_difference/mean": 0.014764709398150444,
"step": 17,
"step_time": 35.02218110599915
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5238873660564423,
"epoch": 0.00036,
"grad_norm": 1.4641653299331665,
"kl": 0.0047977561771404,
"learning_rate": 4.857142857142858e-06,
"loss": -0.0645,
"step": 18,
"step_time": 5.746608606000336
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1798.0,
"completions/max_terminated_length": 1798.0,
"completions/mean_length": 1634.25,
"completions/mean_terminated_length": 1634.25,
"completions/min_length": 1522.0,
"completions/min_terminated_length": 1522.0,
"entropy": 0.5075966455042362,
"epoch": 0.00038,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6138638257980347,
"kl": 0.007999939436558634,
"learning_rate": 5.142857142857142e-06,
"loss": -0.0053,
"num_tokens": 728702.0,
"reward": -7.502280235290527,
"reward_std": 9.169681549072266,
"rewards/rollout_reward_func/mean": -7.502280235290527,
"rewards/rollout_reward_func/std": 9.848286628723145,
"sampling/importance_sampling_ratio/max": 1.388090968132019,
"sampling/importance_sampling_ratio/mean": 1.0349256992340088,
"sampling/importance_sampling_ratio/min": 0.6338706612586975,
"sampling/sampling_logp_difference/max": 0.2868894338607788,
"sampling/sampling_logp_difference/mean": 0.021757658571004868,
"step": 19,
"step_time": 37.415181820002545
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.005800189450383186,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011659564450383186,
"entropy": 0.5063299536705017,
"epoch": 0.0004,
"grad_norm": 1.244437336921692,
"kl": 0.01308579370379448,
"learning_rate": 5.428571428571429e-06,
"loss": -0.0075,
"step": 20,
"step_time": 5.793050424000285
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1807.0,
"completions/max_terminated_length": 1807.0,
"completions/mean_length": 1655.15625,
"completions/mean_terminated_length": 1655.15625,
"completions/min_length": 1502.0,
"completions/min_terminated_length": 1502.0,
"entropy": 0.5319979190826416,
"epoch": 0.00042,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1382571458816528,
"kl": 0.014530768617987633,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.0206,
"num_tokens": 803086.0,
"reward": 0.2018265724182129,
"reward_std": 8.02773666381836,
"rewards/rollout_reward_func/mean": 0.2018265724182129,
"rewards/rollout_reward_func/std": 10.535411834716797,
"sampling/importance_sampling_ratio/max": 1.9407941102981567,
"sampling/importance_sampling_ratio/mean": 1.0456597805023193,
"sampling/importance_sampling_ratio/min": 0.5120582580566406,
"sampling/sampling_logp_difference/max": 0.3853168487548828,
"sampling/sampling_logp_difference/mean": 0.033494722098112106,
"step": 21,
"step_time": 38.7354500320007
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.007753314450383186,
"clip_ratio/low_mean": 0.0014880952658131719,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009241409716196358,
"entropy": 0.5304564274847507,
"epoch": 0.00044,
"grad_norm": 1.0724774599075317,
"kl": 0.02271496201865375,
"learning_rate": 6e-06,
"loss": 0.0207,
"step": 22,
"step_time": 5.822538338000413
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1794.0,
"completions/max_terminated_length": 1794.0,
"completions/mean_length": 1641.09375,
"completions/mean_terminated_length": 1641.09375,
"completions/min_length": 1125.0,
"completions/min_terminated_length": 1125.0,
"entropy": 0.5019906461238861,
"epoch": 0.00046,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1990811824798584,
"kl": 0.03286108747124672,
"learning_rate": 6.285714285714286e-06,
"loss": -0.042,
"num_tokens": 877020.0,
"reward": -8.106513977050781,
"reward_std": 8.252906799316406,
"rewards/rollout_reward_func/mean": -8.106513977050781,
"rewards/rollout_reward_func/std": 9.194578170776367,
"sampling/importance_sampling_ratio/max": 1.5264556407928467,
"sampling/importance_sampling_ratio/mean": 0.9783341884613037,
"sampling/importance_sampling_ratio/min": 0.4424620270729065,
"sampling/sampling_logp_difference/max": 0.4774820804595947,
"sampling/sampling_logp_difference/mean": 0.046494003385305405,
"step": 23,
"step_time": 36.808606273000805
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.496415089815855,
"epoch": 0.00048,
"grad_norm": 1.1097930669784546,
"kl": 0.04643937526270747,
"learning_rate": 6.571428571428572e-06,
"loss": -0.0443,
"step": 24,
"step_time": 5.7939676890000555
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1778.0,
"completions/max_terminated_length": 1778.0,
"completions/mean_length": 1637.375,
"completions/mean_terminated_length": 1637.375,
"completions/min_length": 1411.0,
"completions/min_terminated_length": 1411.0,
"entropy": 0.49401185661554337,
"epoch": 0.0005,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5811240673065186,
"kl": 0.08443848416209221,
"learning_rate": 6.857142857142858e-06,
"loss": -0.2108,
"num_tokens": 950862.0,
"reward": -4.675426006317139,
"reward_std": 7.909944534301758,
"rewards/rollout_reward_func/mean": -4.675426006317139,
"rewards/rollout_reward_func/std": 9.238933563232422,
"sampling/importance_sampling_ratio/max": 2.070371150970459,
"sampling/importance_sampling_ratio/mean": 0.9819083213806152,
"sampling/importance_sampling_ratio/min": 0.25233596563339233,
"sampling/sampling_logp_difference/max": 0.7732794284820557,
"sampling/sampling_logp_difference/mean": 0.06470471620559692,
"step": 25,
"step_time": 38.14812202200119
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.009548611124046147,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009548611124046147,
"entropy": 0.4837190806865692,
"epoch": 0.00052,
"grad_norm": 1.3750770092010498,
"kl": 0.11863584071397781,
"learning_rate": 7.1428571428571436e-06,
"loss": -0.2169,
"step": 26,
"step_time": 5.773092350001207
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1693.0,
"completions/max_terminated_length": 1693.0,
"completions/mean_length": 1613.34375,
"completions/mean_terminated_length": 1613.34375,
"completions/min_length": 1434.0,
"completions/min_terminated_length": 1434.0,
"entropy": 0.45354875922203064,
"epoch": 0.00054,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5507698059082031,
"kl": 0.1627941089682281,
"learning_rate": 7.428571428571429e-06,
"loss": -0.1993,
"num_tokens": 1023781.0,
"reward": -7.355703353881836,
"reward_std": 10.72867202758789,
"rewards/rollout_reward_func/mean": -7.355703353881836,
"rewards/rollout_reward_func/std": 12.46450138092041,
"sampling/importance_sampling_ratio/max": 2.511967420578003,
"sampling/importance_sampling_ratio/mean": 1.0184850692749023,
"sampling/importance_sampling_ratio/min": 0.14674033224582672,
"sampling/sampling_logp_difference/max": 1.2143032550811768,
"sampling/sampling_logp_difference/mean": 0.07817384600639343,
"step": 27,
"step_time": 38.66307055499874
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.025390625,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.029296875,
"entropy": 0.437204722315073,
"epoch": 0.00056,
"grad_norm": 1.2980964183807373,
"kl": 0.25656731706112623,
"learning_rate": 7.714285714285716e-06,
"loss": -0.2037,
"step": 28,
"step_time": 5.5929295870000715
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1793.0,
"completions/max_terminated_length": 1793.0,
"completions/mean_length": 1653.5,
"completions/mean_terminated_length": 1653.5,
"completions/min_length": 1433.0,
"completions/min_terminated_length": 1433.0,
"entropy": 0.40794313699007034,
"epoch": 0.00058,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0819990634918213,
"kl": 0.5643582288175821,
"learning_rate": 8.000000000000001e-06,
"loss": -0.2202,
"num_tokens": 1098013.0,
"reward": -6.507538318634033,
"reward_std": 11.262900352478027,
"rewards/rollout_reward_func/mean": -6.507538318634033,
"rewards/rollout_reward_func/std": 15.167143821716309,
"sampling/importance_sampling_ratio/max": 2.3346853256225586,
"sampling/importance_sampling_ratio/mean": 0.7045407295227051,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.8893389701843262,
"sampling/sampling_logp_difference/mean": 0.11059033870697021,
"step": 29,
"step_time": 36.8627199120001
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.042909564450383186,
"clip_ratio/low_min": 0.019412878900766373,
"clip_ratio/region_mean": 0.042909564450383186,
"entropy": 0.3947901092469692,
"epoch": 0.0006,
"grad_norm": 1.311099648475647,
"kl": 0.829475361853838,
"learning_rate": 8.285714285714287e-06,
"loss": -0.2219,
"step": 30,
"step_time": 5.809630234999531
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1791.0,
"completions/max_terminated_length": 1791.0,
"completions/mean_length": 1661.65625,
"completions/mean_terminated_length": 1661.65625,
"completions/min_length": 1507.0,
"completions/min_terminated_length": 1507.0,
"entropy": 0.3842233642935753,
"epoch": 0.00062,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9492745995521545,
"kl": 0.836035018786788,
"learning_rate": 8.571428571428571e-06,
"loss": -0.1847,
"num_tokens": 1172333.0,
"reward": -7.43505859375,
"reward_std": 10.108196258544922,
"rewards/rollout_reward_func/mean": -7.43505859375,
"rewards/rollout_reward_func/std": 12.447552680969238,
"sampling/importance_sampling_ratio/max": 1.9039174318313599,
"sampling/importance_sampling_ratio/mean": 0.7630480527877808,
"sampling/importance_sampling_ratio/min": 0.03481662645936012,
"sampling/sampling_logp_difference/max": 2.102973699569702,
"sampling/sampling_logp_difference/mean": 0.10207939893007278,
"step": 31,
"step_time": 37.01448380799866
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0078125,
"clip_ratio/region_mean": 0.017578125,
"entropy": 0.3787720203399658,
"epoch": 0.00064,
"grad_norm": 0.9088082313537598,
"kl": 0.954752204939723,
"learning_rate": 8.857142857142858e-06,
"loss": -0.1848,
"step": 32,
"step_time": 6.941179774000375
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1777.0,
"completions/max_terminated_length": 1777.0,
"completions/mean_length": 1584.28125,
"completions/mean_terminated_length": 1584.28125,
"completions/min_length": 274.0,
"completions/min_terminated_length": 274.0,
"entropy": 0.34466781467199326,
"epoch": 0.00066,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3309731483459473,
"kl": 0.8169562551192939,
"learning_rate": 9.142857142857144e-06,
"loss": -0.0371,
"num_tokens": 1244354.0,
"reward": -5.68795108795166,
"reward_std": 6.089047431945801,
"rewards/rollout_reward_func/mean": -5.68795108795166,
"rewards/rollout_reward_func/std": 7.458269119262695,
"sampling/importance_sampling_ratio/max": 1.7896546125411987,
"sampling/importance_sampling_ratio/mean": 0.8577574491500854,
"sampling/importance_sampling_ratio/min": 0.03787967935204506,
"sampling/sampling_logp_difference/max": 2.316878080368042,
"sampling/sampling_logp_difference/mean": 0.08233191072940826,
"step": 33,
"step_time": 35.641360890001124
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.009765625,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.009765625,
"entropy": 0.3404731787741184,
"epoch": 0.00068,
"grad_norm": 1.2554371356964111,
"kl": 0.8635595235973597,
"learning_rate": 9.42857142857143e-06,
"loss": -0.0392,
"step": 34,
"step_time": 5.800049977002345
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1793.0,
"completions/max_terminated_length": 1793.0,
"completions/mean_length": 1645.40625,
"completions/mean_terminated_length": 1645.40625,
"completions/min_length": 1436.0,
"completions/min_terminated_length": 1436.0,
"entropy": 0.34361691400408745,
"epoch": 0.0007,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8878010511398315,
"kl": 0.6783301420509815,
"learning_rate": 9.714285714285715e-06,
"loss": -0.0546,
"num_tokens": 1318828.0,
"reward": -6.509824752807617,
"reward_std": 5.78992223739624,
"rewards/rollout_reward_func/mean": -6.509824752807617,
"rewards/rollout_reward_func/std": 7.799504280090332,
"sampling/importance_sampling_ratio/max": 2.6033241748809814,
"sampling/importance_sampling_ratio/mean": 0.7217234373092651,
"sampling/importance_sampling_ratio/min": 0.03342561423778534,
"sampling/sampling_logp_difference/max": 1.9652609825134277,
"sampling/sampling_logp_difference/mean": 0.10379400849342346,
"step": 35,
"step_time": 38.887631579999834
},
{
"clip_ratio/high_max": 0.01953125,
"clip_ratio/high_mean": 0.009765625,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013671875,
"entropy": 0.34295540675520897,
"epoch": 0.00072,
"grad_norm": 0.8096361756324768,
"kl": 0.608342956751585,
"learning_rate": 1e-05,
"loss": -0.0557,
"step": 36,
"step_time": 5.798652657001185
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1830.0,
"completions/max_terminated_length": 1830.0,
"completions/mean_length": 1669.1875,
"completions/mean_terminated_length": 1669.1875,
"completions/min_length": 1051.0,
"completions/min_terminated_length": 1051.0,
"entropy": 0.40553563460707664,
"epoch": 0.00074,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1997655630111694,
"kl": 0.6256659794598818,
"learning_rate": 9.999999998148153e-06,
"loss": -0.1523,
"num_tokens": 1393543.0,
"reward": -6.479705333709717,
"reward_std": 5.127873420715332,
"rewards/rollout_reward_func/mean": -6.479705333709717,
"rewards/rollout_reward_func/std": 5.757119655609131,
"sampling/importance_sampling_ratio/max": 2.8293187618255615,
"sampling/importance_sampling_ratio/mean": 0.8761348724365234,
"sampling/importance_sampling_ratio/min": 0.04236992821097374,
"sampling/sampling_logp_difference/max": 1.9487248659133911,
"sampling/sampling_logp_difference/mean": 0.10332974791526794,
"step": 37,
"step_time": 37.80114303599839
},
{
"clip_ratio/high_max": 0.022248641354963183,
"clip_ratio/high_mean": 0.011124320677481592,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011124320677481592,
"entropy": 0.4083840139210224,
"epoch": 0.00076,
"grad_norm": 1.0707274675369263,
"kl": 0.46542409248650074,
"learning_rate": 9.999999992592613e-06,
"loss": -0.1538,
"step": 38,
"step_time": 6.32874613399963
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1798.0,
"completions/max_terminated_length": 1798.0,
"completions/mean_length": 1652.875,
"completions/mean_terminated_length": 1652.875,
"completions/min_length": 1438.0,
"completions/min_terminated_length": 1438.0,
"entropy": 0.3374646417796612,
"epoch": 0.00078,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.204039216041565,
"kl": 0.31642488297075033,
"learning_rate": 9.999999983333379e-06,
"loss": -0.2975,
"num_tokens": 1467743.0,
"reward": -4.7108917236328125,
"reward_std": 5.350179672241211,
"rewards/rollout_reward_func/mean": -4.7108917236328125,
"rewards/rollout_reward_func/std": 5.910353660583496,
"sampling/importance_sampling_ratio/max": 2.4041433334350586,
"sampling/importance_sampling_ratio/mean": 0.9955360889434814,
"sampling/importance_sampling_ratio/min": 0.0774984359741211,
"sampling/sampling_logp_difference/max": 1.3263565301895142,
"sampling/sampling_logp_difference/mean": 0.08658263087272644,
"step": 39,
"step_time": 38.75331890299822
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.3382052704691887,
"epoch": 0.0008,
"grad_norm": 1.1924562454223633,
"kl": 0.293441329151392,
"learning_rate": 9.999999970370451e-06,
"loss": -0.2999,
"step": 40,
"step_time": 5.813114761998804
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1829.0,
"completions/max_terminated_length": 1829.0,
"completions/mean_length": 1647.59375,
"completions/mean_terminated_length": 1647.59375,
"completions/min_length": 871.0,
"completions/min_terminated_length": 871.0,
"entropy": 0.3876206576824188,
"epoch": 0.00082,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0214810371398926,
"kl": 0.42729487270116806,
"learning_rate": 9.99999995370383e-06,
"loss": 0.0543,
"num_tokens": 1542142.0,
"reward": -3.3205041885375977,
"reward_std": 4.228387832641602,
"rewards/rollout_reward_func/mean": -3.3205041885375977,
"rewards/rollout_reward_func/std": 8.351001739501953,
"sampling/importance_sampling_ratio/max": 2.9104766845703125,
"sampling/importance_sampling_ratio/mean": 0.9624049067497253,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.469689130783081,
"sampling/sampling_logp_difference/mean": 0.09306588023900986,
"step": 41,
"step_time": 36.42707723900003
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"entropy": 0.38661571592092514,
"epoch": 0.00084,
"grad_norm": 1.0359998941421509,
"kl": 0.41935206204652786,
"learning_rate": 9.999999933333514e-06,
"loss": 0.0525,
"step": 42,
"step_time": 5.882109656998182
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1772.0,
"completions/max_terminated_length": 1772.0,
"completions/mean_length": 1624.15625,
"completions/mean_terminated_length": 1624.15625,
"completions/min_length": 879.0,
"completions/min_terminated_length": 879.0,
"entropy": 0.3577045015990734,
"epoch": 0.00086,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2550678253173828,
"kl": 0.32372760958969593,
"learning_rate": 9.999999909259504e-06,
"loss": -0.0496,
"num_tokens": 1615506.0,
"reward": -5.2150774002075195,
"reward_std": 6.197805404663086,
"rewards/rollout_reward_func/mean": -5.2150774002075195,
"rewards/rollout_reward_func/std": 8.147597312927246,
"sampling/importance_sampling_ratio/max": 2.349579095840454,
"sampling/importance_sampling_ratio/mean": 0.946481466293335,
"sampling/importance_sampling_ratio/min": 0.06307531893253326,
"sampling/sampling_logp_difference/max": 1.156632423400879,
"sampling/sampling_logp_difference/mean": 0.07861532270908356,
"step": 43,
"step_time": 37.13630858600118
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.3559652045369148,
"epoch": 0.00088,
"grad_norm": 1.0217231512069702,
"kl": 0.3223106600344181,
"learning_rate": 9.9999998814818e-06,
"loss": -0.0515,
"step": 44,
"step_time": 6.2407338969997
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1814.0,
"completions/max_terminated_length": 1814.0,
"completions/mean_length": 1681.125,
"completions/mean_terminated_length": 1681.125,
"completions/min_length": 1242.0,
"completions/min_terminated_length": 1242.0,
"entropy": 0.3759612925350666,
"epoch": 0.0009,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.528295636177063,
"kl": 0.4129584180191159,
"learning_rate": 9.999999850000403e-06,
"loss": -0.0326,
"num_tokens": 1690335.0,
"reward": -2.4644203186035156,
"reward_std": 10.457454681396484,
"rewards/rollout_reward_func/mean": -2.4644203186035156,
"rewards/rollout_reward_func/std": 15.407123565673828,
"sampling/importance_sampling_ratio/max": 2.56207537651062,
"sampling/importance_sampling_ratio/mean": 0.8990581035614014,
"sampling/importance_sampling_ratio/min": 0.13844478130340576,
"sampling/sampling_logp_difference/max": 1.2251713275909424,
"sampling/sampling_logp_difference/mean": 0.07088702917098999,
"step": 45,
"step_time": 36.063344202000735
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.37699316069483757,
"epoch": 0.00092,
"grad_norm": 1.5766605138778687,
"kl": 0.37809659354388714,
"learning_rate": 9.999999814815314e-06,
"loss": -0.0346,
"step": 46,
"step_time": 5.878628188998846
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1796.0,
"completions/max_terminated_length": 1796.0,
"completions/mean_length": 1684.96875,
"completions/mean_terminated_length": 1684.96875,
"completions/min_length": 1413.0,
"completions/min_terminated_length": 1413.0,
"entropy": 0.35037482157349586,
"epoch": 0.00094,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7219093441963196,
"kl": 0.3052864633500576,
"learning_rate": 9.99999977592653e-06,
"loss": -0.1454,
"num_tokens": 1765997.0,
"reward": -5.9493207931518555,
"reward_std": 6.758513450622559,
"rewards/rollout_reward_func/mean": -5.9493207931518555,
"rewards/rollout_reward_func/std": 7.776234149932861,
"sampling/importance_sampling_ratio/max": 2.8358778953552246,
"sampling/importance_sampling_ratio/mean": 0.7998743653297424,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.1099776029586792,
"sampling/sampling_logp_difference/mean": 0.0953160896897316,
"step": 47,
"step_time": 37.005451356000776
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.34440867975354195,
"epoch": 0.00096,
"grad_norm": 0.6548582911491394,
"kl": 0.3214886896312237,
"learning_rate": 9.999999733334051e-06,
"loss": -0.1452,
"step": 48,
"step_time": 5.856181479999577
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1795.0,
"completions/max_terminated_length": 1795.0,
"completions/mean_length": 1652.40625,
"completions/mean_terminated_length": 1652.40625,
"completions/min_length": 1492.0,
"completions/min_terminated_length": 1492.0,
"entropy": 0.3001542203128338,
"epoch": 0.00098,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1067109107971191,
"kl": 0.20680313045158982,
"learning_rate": 9.99999968703788e-06,
"loss": -0.1033,
"num_tokens": 1839890.0,
"reward": -3.3683390617370605,
"reward_std": 3.6531944274902344,
"rewards/rollout_reward_func/mean": -3.3683390617370605,
"rewards/rollout_reward_func/std": 7.470107078552246,
"sampling/importance_sampling_ratio/max": 2.1421477794647217,
"sampling/importance_sampling_ratio/mean": 1.1610007286071777,
"sampling/importance_sampling_ratio/min": 0.263545960187912,
"sampling/sampling_logp_difference/max": 0.9924228191375732,
"sampling/sampling_logp_difference/mean": 0.05461367964744568,
"step": 49,
"step_time": 38.469305269999495
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.29572881013154984,
"epoch": 0.001,
"grad_norm": 0.9047728180885315,
"kl": 0.22563101211562753,
"learning_rate": 9.999999637038016e-06,
"loss": -0.1081,
"step": 50,
"step_time": 5.818200681000235
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1796.0,
"completions/max_terminated_length": 1796.0,
"completions/mean_length": 1690.875,
"completions/mean_terminated_length": 1690.875,
"completions/min_length": 1463.0,
"completions/min_terminated_length": 1463.0,
"entropy": 0.31936580687761307,
"epoch": 0.00102,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7223824262619019,
"kl": 0.3224958088248968,
"learning_rate": 9.999999583334458e-06,
"loss": -0.1377,
"num_tokens": 1915271.0,
"reward": -3.352957248687744,
"reward_std": 4.21926212310791,
"rewards/rollout_reward_func/mean": -3.352957248687744,
"rewards/rollout_reward_func/std": 6.320347309112549,
"sampling/importance_sampling_ratio/max": 2.287932872772217,
"sampling/importance_sampling_ratio/mean": 0.8120144605636597,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.2639083862304688,
"sampling/sampling_logp_difference/mean": 0.07481236755847931,
"step": 51,
"step_time": 37.81962620299964
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.3146309144794941,
"epoch": 0.00104,
"grad_norm": 0.7475705742835999,
"kl": 0.33450872637331486,
"learning_rate": 9.999999525927207e-06,
"loss": -0.1398,
"step": 52,
"step_time": 5.822449180000149
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1839.0,
"completions/max_terminated_length": 1839.0,
"completions/mean_length": 1702.75,
"completions/mean_terminated_length": 1702.75,
"completions/min_length": 1476.0,
"completions/min_terminated_length": 1476.0,
"entropy": 0.322977501899004,
"epoch": 0.00106,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.198279619216919,
"kl": 0.2917388379573822,
"learning_rate": 9.999999464816262e-06,
"loss": -0.1064,
"num_tokens": 1991060.0,
"reward": -1.9325592517852783,
"reward_std": 4.967609882354736,
"rewards/rollout_reward_func/mean": -1.9325592517852783,
"rewards/rollout_reward_func/std": 5.8073954582214355,
"sampling/importance_sampling_ratio/max": 2.789353847503662,
"sampling/importance_sampling_ratio/mean": 1.1503762006759644,
"sampling/importance_sampling_ratio/min": 0.1404324173927307,
"sampling/sampling_logp_difference/max": 1.199690580368042,
"sampling/sampling_logp_difference/mean": 0.07725630700588226,
"step": 53,
"step_time": 36.42649295499905
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.31714847683906555,
"epoch": 0.00108,
"grad_norm": 1.1143569946289062,
"kl": 0.3153774570673704,
"learning_rate": 9.999999400001624e-06,
"loss": -0.1103,
"step": 54,
"step_time": 6.59381660300005
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1815.0,
"completions/max_terminated_length": 1815.0,
"completions/mean_length": 1677.09375,
"completions/mean_terminated_length": 1677.09375,
"completions/min_length": 1520.0,
"completions/min_terminated_length": 1520.0,
"entropy": 0.2675260417163372,
"epoch": 0.0011,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.6082053780555725,
"kl": 0.35145391430705786,
"learning_rate": 9.999999331483293e-06,
"loss": -0.0145,
"num_tokens": 2065830.0,
"reward": -7.571907997131348,
"reward_std": 7.06196928024292,
"rewards/rollout_reward_func/mean": -7.571907997131348,
"rewards/rollout_reward_func/std": 10.322997093200684,
"sampling/importance_sampling_ratio/max": 2.288501024246216,
"sampling/importance_sampling_ratio/mean": 0.8737363815307617,
"sampling/importance_sampling_ratio/min": 0.11803531646728516,
"sampling/sampling_logp_difference/max": 1.27490234375,
"sampling/sampling_logp_difference/mean": 0.07949512451887131,
"step": 55,
"step_time": 37.09443227400061
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.26163551956415176,
"epoch": 0.00112,
"grad_norm": 0.5967329144477844,
"kl": 0.3754094559699297,
"learning_rate": 9.999999259261269e-06,
"loss": -0.0158,
"step": 56,
"step_time": 5.860317817000578
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1822.0,
"completions/max_terminated_length": 1822.0,
"completions/mean_length": 1653.90625,
"completions/mean_terminated_length": 1653.90625,
"completions/min_length": 1317.0,
"completions/min_terminated_length": 1317.0,
"entropy": 0.32117627188563347,
"epoch": 0.00114,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9490989446640015,
"kl": 0.3084982577711344,
"learning_rate": 9.999999183335551e-06,
"loss": -0.1246,
"num_tokens": 2139937.0,
"reward": -3.708850860595703,
"reward_std": 6.1540679931640625,
"rewards/rollout_reward_func/mean": -3.708850860595703,
"rewards/rollout_reward_func/std": 7.61086893081665,
"sampling/importance_sampling_ratio/max": 2.2644283771514893,
"sampling/importance_sampling_ratio/mean": 0.9409332275390625,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.2469470500946045,
"sampling/sampling_logp_difference/mean": 0.08474647253751755,
"step": 57,
"step_time": 36.73273920400061
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.31612110882997513,
"epoch": 0.00116,
"grad_norm": 0.9802373647689819,
"kl": 0.3413949944078922,
"learning_rate": 9.999999103706142e-06,
"loss": -0.1259,
"step": 58,
"step_time": 5.886546145001375
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1837.0,
"completions/max_terminated_length": 1837.0,
"completions/mean_length": 1626.5625,
"completions/mean_terminated_length": 1626.5625,
"completions/min_length": 267.0,
"completions/min_terminated_length": 267.0,
"entropy": 0.30012011528015137,
"epoch": 0.00118,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0481098890304565,
"kl": 0.5502468682825565,
"learning_rate": 9.999999020373038e-06,
"loss": -0.1811,
"num_tokens": 2213209.0,
"reward": -5.389894485473633,
"reward_std": 4.981201648712158,
"rewards/rollout_reward_func/mean": -5.389894485473633,
"rewards/rollout_reward_func/std": 6.769619941711426,
"sampling/importance_sampling_ratio/max": 2.7308578491210938,
"sampling/importance_sampling_ratio/mean": 0.9160431027412415,
"sampling/importance_sampling_ratio/min": 0.08488596975803375,
"sampling/sampling_logp_difference/max": 1.468017578125,
"sampling/sampling_logp_difference/mean": 0.09639683365821838,
"step": 59,
"step_time": 37.19838971900026
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.2991691865026951,
"epoch": 0.0012,
"grad_norm": 0.8857253193855286,
"kl": 0.5988470073789358,
"learning_rate": 9.999998933336242e-06,
"loss": -0.1844,
"step": 60,
"step_time": 6.367170782000358
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1822.0,
"completions/max_terminated_length": 1822.0,
"completions/mean_length": 1680.0625,
"completions/mean_terminated_length": 1680.0625,
"completions/min_length": 1396.0,
"completions/min_terminated_length": 1396.0,
"entropy": 0.27592010982334614,
"epoch": 0.00122,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.353814721107483,
"kl": 0.37851969711482525,
"learning_rate": 9.999998842595754e-06,
"loss": -0.0009,
"num_tokens": 2288126.0,
"reward": -4.979825496673584,
"reward_std": 6.0550150871276855,
"rewards/rollout_reward_func/mean": -4.979825496673584,
"rewards/rollout_reward_func/std": 9.026530265808105,
"sampling/importance_sampling_ratio/max": 1.9612035751342773,
"sampling/importance_sampling_ratio/mean": 0.9742088317871094,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.270935535430908,
"sampling/sampling_logp_difference/mean": 0.07654492557048798,
"step": 61,
"step_time": 37.15296104299978
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.272587139159441,
"epoch": 0.00124,
"grad_norm": 1.322236180305481,
"kl": 0.403486505150795,
"learning_rate": 9.999998748151573e-06,
"loss": -0.0003,
"step": 62,
"step_time": 5.884493231998022
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1779.0,
"completions/max_terminated_length": 1779.0,
"completions/mean_length": 1606.4375,
"completions/mean_terminated_length": 1606.4375,
"completions/min_length": 1067.0,
"completions/min_terminated_length": 1067.0,
"entropy": 0.24129757285118103,
"epoch": 0.00126,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.8646777868270874,
"kl": 0.3206884413957596,
"learning_rate": 9.999998650003697e-06,
"loss": -0.0263,
"num_tokens": 2361057.0,
"reward": -4.465510845184326,
"reward_std": 6.844207286834717,
"rewards/rollout_reward_func/mean": -4.465510845184326,
"rewards/rollout_reward_func/std": 8.709650039672852,
"sampling/importance_sampling_ratio/max": 2.8052289485931396,
"sampling/importance_sampling_ratio/mean": 0.9944058656692505,
"sampling/importance_sampling_ratio/min": 0.04083564504981041,
"sampling/sampling_logp_difference/max": 1.8856086730957031,
"sampling/sampling_logp_difference/mean": 0.07281368225812912,
"step": 63,
"step_time": 36.78395269500015
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"entropy": 0.23481187783181667,
"epoch": 0.00128,
"grad_norm": 0.843463659286499,
"kl": 0.33893433026969433,
"learning_rate": 9.999998548152132e-06,
"loss": -0.027,
"step": 64,
"step_time": 5.761317184999825
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1810.0,
"completions/max_terminated_length": 1810.0,
"completions/mean_length": 1699.90625,
"completions/mean_terminated_length": 1699.90625,
"completions/min_length": 1413.0,
"completions/min_terminated_length": 1413.0,
"entropy": 0.2767509985715151,
"epoch": 0.0013,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0217297077178955,
"kl": 0.45753872115164995,
"learning_rate": 9.999998442596872e-06,
"loss": -0.0874,
"num_tokens": 2437012.0,
"reward": -3.256406307220459,
"reward_std": 6.123032093048096,
"rewards/rollout_reward_func/mean": -3.256406307220459,
"rewards/rollout_reward_func/std": 6.840267181396484,
"sampling/importance_sampling_ratio/max": 2.3409619331359863,
"sampling/importance_sampling_ratio/mean": 0.8038904070854187,
"sampling/importance_sampling_ratio/min": 0.022516217082738876,
"sampling/sampling_logp_difference/max": 1.6859521865844727,
"sampling/sampling_logp_difference/mean": 0.08254212141036987,
"step": 65,
"step_time": 36.94500934499865
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"entropy": 0.2692566681653261,
"epoch": 0.00132,
"grad_norm": 0.8563345670700073,
"kl": 0.4730011150240898,
"learning_rate": 9.999998333337923e-06,
"loss": -0.0897,
"step": 66,
"step_time": 6.347890593002376
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0026041667442768812,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0026041667442768812,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1795.0,
"completions/max_terminated_length": 1795.0,
"completions/mean_length": 1669.125,
"completions/mean_terminated_length": 1669.125,
"completions/min_length": 641.0,
"completions/min_terminated_length": 641.0,
"entropy": 0.239725174382329,
"epoch": 0.00134,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8270325660705566,
"kl": 0.3960014134645462,
"learning_rate": 9.99999822037528e-06,
"loss": -0.107,
"num_tokens": 2511757.0,
"reward": -6.540927886962891,
"reward_std": 7.1129469871521,
"rewards/rollout_reward_func/mean": -6.540927886962891,
"rewards/rollout_reward_func/std": 10.2684907913208,
"sampling/importance_sampling_ratio/max": 2.9500951766967773,
"sampling/importance_sampling_ratio/mean": 1.145231008529663,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.810743808746338,
"sampling/sampling_logp_difference/mean": 0.07273144274950027,
"step": 67,
"step_time": 36.61473885000032
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.004557291744276881,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006510416744276881,
"entropy": 0.23407302796840668,
"epoch": 0.00136,
"grad_norm": 0.7608462572097778,
"kl": 0.389744964428246,
"learning_rate": 9.999998103708944e-06,
"loss": -0.1089,
"step": 68,
"step_time": 5.844826974000171
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1819.0,
"completions/max_terminated_length": 1819.0,
"completions/mean_length": 1668.25,
"completions/mean_terminated_length": 1668.25,
"completions/min_length": 1502.0,
"completions/min_terminated_length": 1502.0,
"entropy": 0.2598415594547987,
"epoch": 0.00138,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7419317960739136,
"kl": 0.28893633373081684,
"learning_rate": 9.999997983338918e-06,
"loss": -0.0011,
"num_tokens": 2586505.0,
"reward": -7.137851238250732,
"reward_std": 7.074878692626953,
"rewards/rollout_reward_func/mean": -7.137851238250732,
"rewards/rollout_reward_func/std": 10.856270790100098,
"sampling/importance_sampling_ratio/max": 2.6760199069976807,
"sampling/importance_sampling_ratio/mean": 0.8516594171524048,
"sampling/importance_sampling_ratio/min": 0.13397420942783356,
"sampling/sampling_logp_difference/max": 1.6180033683776855,
"sampling/sampling_logp_difference/mean": 0.07290571928024292,
"step": 69,
"step_time": 37.49325247999877
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"entropy": 0.25334353744983673,
"epoch": 0.0014,
"grad_norm": 0.7260986566543579,
"kl": 0.3038715925067663,
"learning_rate": 9.999997859265198e-06,
"loss": -0.0033,
"step": 70,
"step_time": 6.613173748000918
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1825.0,
"completions/max_terminated_length": 1825.0,
"completions/mean_length": 1665.625,
"completions/mean_terminated_length": 1665.625,
"completions/min_length": 1103.0,
"completions/min_terminated_length": 1103.0,
"entropy": 0.1995892282575369,
"epoch": 0.00142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8726052641868591,
"kl": 0.3035321347415447,
"learning_rate": 9.999997731487788e-06,
"loss": -0.1951,
"num_tokens": 2661197.0,
"reward": 0.01697838306427002,
"reward_std": 5.1158766746521,
"rewards/rollout_reward_func/mean": 0.01697838306427002,
"rewards/rollout_reward_func/std": 10.660343170166016,
"sampling/importance_sampling_ratio/max": 2.622579574584961,
"sampling/importance_sampling_ratio/mean": 0.7724930047988892,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.3327304124832153,
"sampling/sampling_logp_difference/mean": 0.068142369389534,
"step": 71,
"step_time": 35.71897001199886
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.19654854200780392,
"epoch": 0.00144,
"grad_norm": 0.9153209328651428,
"kl": 0.30595986917614937,
"learning_rate": 9.999997600006685e-06,
"loss": -0.1967,
"step": 72,
"step_time": 5.866601767999782
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1808.0,
"completions/max_terminated_length": 1808.0,
"completions/mean_length": 1696.9375,
"completions/mean_terminated_length": 1696.9375,
"completions/min_length": 1599.0,
"completions/min_terminated_length": 1599.0,
"entropy": 0.21030581928789616,
"epoch": 0.00146,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7310628890991211,
"kl": 1.0501810098066926,
"learning_rate": 9.999997464821892e-06,
"loss": -0.2747,
"num_tokens": 2736835.0,
"reward": -7.002237319946289,
"reward_std": 6.102571487426758,
"rewards/rollout_reward_func/mean": -7.002237319946289,
"rewards/rollout_reward_func/std": 10.413652420043945,
"sampling/importance_sampling_ratio/max": 2.1960866451263428,
"sampling/importance_sampling_ratio/mean": 0.8038663268089294,
"sampling/importance_sampling_ratio/min": 0.046504825353622437,
"sampling/sampling_logp_difference/max": 2.5245094299316406,
"sampling/sampling_logp_difference/mean": 0.09849925339221954,
"step": 73,
"step_time": 38.96205426400138
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.20804075710475445,
"epoch": 0.00148,
"grad_norm": 0.6374855041503906,
"kl": 0.9734249282628298,
"learning_rate": 9.999997325933409e-06,
"loss": -0.2766,
"step": 74,
"step_time": 5.84851037899989
},
{
"clip_ratio/high_max": 0.0032051282469183207,
"clip_ratio/high_mean": 0.0016025641234591603,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0035556891234591603,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1783.0,
"completions/max_terminated_length": 1783.0,
"completions/mean_length": 1692.96875,
"completions/mean_terminated_length": 1692.96875,
"completions/min_length": 1394.0,
"completions/min_terminated_length": 1394.0,
"entropy": 0.19729920756071806,
"epoch": 0.0015,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.8172791600227356,
"kl": 0.4751043822616339,
"learning_rate": 9.999997183341233e-06,
"loss": -0.142,
"num_tokens": 2812279.0,
"reward": -2.219168186187744,
"reward_std": 9.400641441345215,
"rewards/rollout_reward_func/mean": -2.219168186187744,
"rewards/rollout_reward_func/std": 17.64300537109375,
"sampling/importance_sampling_ratio/max": 2.173631429672241,
"sampling/importance_sampling_ratio/mean": 0.8823361396789551,
"sampling/importance_sampling_ratio/min": 0.03189156949520111,
"sampling/sampling_logp_difference/max": 1.8222627639770508,
"sampling/sampling_logp_difference/mean": 0.0679212361574173,
"step": 75,
"step_time": 35.097398169999906
},
{
"clip_ratio/high_max": 0.007111378246918321,
"clip_ratio/high_mean": 0.0035556891234591603,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00550881412345916,
"entropy": 0.19708813540637493,
"epoch": 0.00152,
"grad_norm": 0.7900307774543762,
"kl": 0.43427742179483175,
"learning_rate": 9.999997037045365e-06,
"loss": -0.1431,
"step": 76,
"step_time": 6.543489481999131
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1808.0,
"completions/max_terminated_length": 1808.0,
"completions/mean_length": 1717.5625,
"completions/mean_terminated_length": 1717.5625,
"completions/min_length": 1391.0,
"completions/min_terminated_length": 1391.0,
"entropy": 0.15479024220257998,
"epoch": 0.00154,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.6995255947113037,
"kl": 0.30347975343465805,
"learning_rate": 9.999996887045808e-06,
"loss": 0.0408,
"num_tokens": 2888657.0,
"reward": -4.00355339050293,
"reward_std": 4.775286674499512,
"rewards/rollout_reward_func/mean": -4.00355339050293,
"rewards/rollout_reward_func/std": 6.246252536773682,
"sampling/importance_sampling_ratio/max": 1.650878667831421,
"sampling/importance_sampling_ratio/mean": 1.0060396194458008,
"sampling/importance_sampling_ratio/min": 0.047248467803001404,
"sampling/sampling_logp_difference/max": 1.4376678466796875,
"sampling/sampling_logp_difference/mean": 0.05947484076023102,
"step": 77,
"step_time": 38.8434166800007
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.15612738858908415,
"epoch": 0.00156,
"grad_norm": 0.6284993886947632,
"kl": 0.2821835596114397,
"learning_rate": 9.99999673334256e-06,
"loss": 0.0397,
"step": 78,
"step_time": 5.918868127000678
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1762.0,
"completions/max_terminated_length": 1762.0,
"completions/mean_length": 1664.59375,
"completions/mean_terminated_length": 1664.59375,
"completions/min_length": 1434.0,
"completions/min_terminated_length": 1434.0,
"entropy": 0.17359685897827148,
"epoch": 0.00158,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.977432370185852,
"kl": 0.6586176492273808,
"learning_rate": 9.99999657593562e-06,
"loss": -0.2064,
"num_tokens": 2963303.0,
"reward": -4.805361270904541,
"reward_std": 6.155376434326172,
"rewards/rollout_reward_func/mean": -4.805361270904541,
"rewards/rollout_reward_func/std": 7.987611293792725,
"sampling/importance_sampling_ratio/max": 2.683112144470215,
"sampling/importance_sampling_ratio/mean": 0.8393880128860474,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.071775197982788,
"sampling/sampling_logp_difference/mean": 0.07382857799530029,
"step": 79,
"step_time": 37.2499062830002
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.1753261275589466,
"epoch": 0.0016,
"grad_norm": 0.7493237257003784,
"kl": 0.522200190462172,
"learning_rate": 9.99999641482499e-06,
"loss": -0.2097,
"step": 80,
"step_time": 5.753503210998133
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1832.0,
"completions/max_terminated_length": 1832.0,
"completions/mean_length": 1708.34375,
"completions/mean_terminated_length": 1708.34375,
"completions/min_length": 1412.0,
"completions/min_terminated_length": 1412.0,
"entropy": 0.2533543687313795,
"epoch": 0.00162,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9206402897834778,
"kl": 0.49991538375616074,
"learning_rate": 9.999996250010671e-06,
"loss": -0.1446,
"num_tokens": 3039686.0,
"reward": -3.956460952758789,
"reward_std": 7.495929718017578,
"rewards/rollout_reward_func/mean": -3.956460952758789,
"rewards/rollout_reward_func/std": 8.32241153717041,
"sampling/importance_sampling_ratio/max": 2.4827659130096436,
"sampling/importance_sampling_ratio/mean": 0.9107170701026917,
"sampling/importance_sampling_ratio/min": 0.11611815541982651,
"sampling/sampling_logp_difference/max": 1.4147658348083496,
"sampling/sampling_logp_difference/mean": 0.07592972368001938,
"step": 81,
"step_time": 37.18665667199912
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.2543573584407568,
"epoch": 0.00164,
"grad_norm": 0.854416012763977,
"kl": 0.47987215034663677,
"learning_rate": 9.999996081492662e-06,
"loss": -0.1459,
"step": 82,
"step_time": 5.884696772999632
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1824.0,
"completions/max_terminated_length": 1824.0,
"completions/mean_length": 1733.9375,
"completions/mean_terminated_length": 1733.9375,
"completions/min_length": 1618.0,
"completions/min_terminated_length": 1618.0,
"entropy": 0.15933354571461678,
"epoch": 0.00166,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8887842893600464,
"kl": 0.3857283741235733,
"learning_rate": 9.999995909270962e-06,
"loss": -0.166,
"num_tokens": 3116205.0,
"reward": -4.7430243492126465,
"reward_std": 5.144591808319092,
"rewards/rollout_reward_func/mean": -4.7430243492126465,
"rewards/rollout_reward_func/std": 8.597419738769531,
"sampling/importance_sampling_ratio/max": 2.8380703926086426,
"sampling/importance_sampling_ratio/mean": 1.0134353637695312,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.3160133361816406,
"sampling/sampling_logp_difference/mean": 0.05554642528295517,
"step": 83,
"step_time": 37.96955776300001
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 0.15759241580963135,
"epoch": 0.00168,
"grad_norm": 0.837668240070343,
"kl": 0.40188954304903746,
"learning_rate": 9.999995733345573e-06,
"loss": -0.1676,
"step": 84,
"step_time": 5.867369008998139
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1805.0,
"completions/max_terminated_length": 1805.0,
"completions/mean_length": 1607.8125,
"completions/mean_terminated_length": 1607.8125,
"completions/min_length": 745.0,
"completions/min_terminated_length": 745.0,
"entropy": 0.2709789536893368,
"epoch": 0.0017,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9693952798843384,
"kl": 0.7671259762719274,
"learning_rate": 9.999995553716494e-06,
"loss": -0.0133,
"num_tokens": 3189437.0,
"reward": -6.506250381469727,
"reward_std": 6.401736736297607,
"rewards/rollout_reward_func/mean": -6.506250381469727,
"rewards/rollout_reward_func/std": 11.671521186828613,
"sampling/importance_sampling_ratio/max": 2.657285690307617,
"sampling/importance_sampling_ratio/mean": 0.9502277374267578,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.1241543292999268,
"sampling/sampling_logp_difference/mean": 0.06842806935310364,
"step": 85,
"step_time": 35.07483531900107
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0047940341755747795,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0047940341755747795,
"entropy": 0.2695111036300659,
"epoch": 0.00172,
"grad_norm": 0.9897550344467163,
"kl": 0.8880385467782617,
"learning_rate": 9.999995370383725e-06,
"loss": -0.0147,
"step": 86,
"step_time": 5.823899960000745
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1834.0,
"completions/max_terminated_length": 1834.0,
"completions/mean_length": 1718.625,
"completions/mean_terminated_length": 1718.625,
"completions/min_length": 1620.0,
"completions/min_terminated_length": 1620.0,
"entropy": 0.19925166107714176,
"epoch": 0.00174,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8225500583648682,
"kl": 0.40306959114968777,
"learning_rate": 9.999995183347268e-06,
"loss": -0.1216,
"num_tokens": 3265817.0,
"reward": -4.694127082824707,
"reward_std": 7.164846897125244,
"rewards/rollout_reward_func/mean": -4.694127082824707,
"rewards/rollout_reward_func/std": 10.58739948272705,
"sampling/importance_sampling_ratio/max": 2.249318838119507,
"sampling/importance_sampling_ratio/mean": 0.8394811153411865,
"sampling/importance_sampling_ratio/min": 0.12018804997205734,
"sampling/sampling_logp_difference/max": 1.5835975408554077,
"sampling/sampling_logp_difference/mean": 0.07420962303876877,
"step": 87,
"step_time": 38.998291893000896
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 0.1985994167625904,
"epoch": 0.00176,
"grad_norm": 0.7866024374961853,
"kl": 0.4124698657542467,
"learning_rate": 9.999994992607122e-06,
"loss": -0.1228,
"step": 88,
"step_time": 6.345813049000753
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.005430640187114477,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005430640187114477,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1791.0,
"completions/max_terminated_length": 1791.0,
"completions/mean_length": 1672.28125,
"completions/mean_terminated_length": 1672.28125,
"completions/min_length": 1437.0,
"completions/min_terminated_length": 1437.0,
"entropy": 0.15057391114532948,
"epoch": 0.00178,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8857223987579346,
"kl": 0.7324486412107944,
"learning_rate": 9.999994798163286e-06,
"loss": -0.1942,
"num_tokens": 3340291.0,
"reward": -4.911107540130615,
"reward_std": 5.658788204193115,
"rewards/rollout_reward_func/mean": -4.911107540130615,
"rewards/rollout_reward_func/std": 9.124991416931152,
"sampling/importance_sampling_ratio/max": 1.8904942274093628,
"sampling/importance_sampling_ratio/mean": 0.8380607962608337,
"sampling/importance_sampling_ratio/min": 0.02950156107544899,
"sampling/sampling_logp_difference/max": 1.5451288223266602,
"sampling/sampling_logp_difference/mean": 0.06332387030124664,
"step": 89,
"step_time": 37.11040565100211
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"entropy": 0.15209791343659163,
"epoch": 0.0018,
"grad_norm": 0.7109376788139343,
"kl": 0.6368166394531727,
"learning_rate": 9.999994600015764e-06,
"loss": -0.1955,
"step": 90,
"step_time": 5.825622919000125
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1821.0,
"completions/max_terminated_length": 1821.0,
"completions/mean_length": 1721.84375,
"completions/mean_terminated_length": 1721.84375,
"completions/min_length": 1530.0,
"completions/min_terminated_length": 1530.0,
"entropy": 0.16620426252484322,
"epoch": 0.00182,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.4581727981567383,
"kl": 1.7564852200448513,
"learning_rate": 9.99999439816455e-06,
"loss": -0.0169,
"num_tokens": 3416936.0,
"reward": -4.905522346496582,
"reward_std": 7.8199005126953125,
"rewards/rollout_reward_func/mean": -4.905522346496582,
"rewards/rollout_reward_func/std": 8.327784538269043,
"sampling/importance_sampling_ratio/max": 1.7173593044281006,
"sampling/importance_sampling_ratio/mean": 0.9146490693092346,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.112088203430176,
"sampling/sampling_logp_difference/mean": 0.06157752498984337,
"step": 91,
"step_time": 37.20597630500015
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.17287507839500904,
"epoch": 0.00184,
"grad_norm": 1.492632269859314,
"kl": 1.1148988083004951,
"learning_rate": 9.999994192609649e-06,
"loss": -0.023,
"step": 92,
"step_time": 5.926312706999852
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1774.0,
"completions/max_terminated_length": 1774.0,
"completions/mean_length": 1653.09375,
"completions/mean_terminated_length": 1653.09375,
"completions/min_length": 1566.0,
"completions/min_terminated_length": 1566.0,
"entropy": 0.21760124899446964,
"epoch": 0.00186,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9200534820556641,
"kl": 0.26406371779739857,
"learning_rate": 9.99999398335106e-06,
"loss": -0.0031,
"num_tokens": 3491478.0,
"reward": -5.420025825500488,
"reward_std": 5.013497829437256,
"rewards/rollout_reward_func/mean": -5.420025825500488,
"rewards/rollout_reward_func/std": 6.584102630615234,
"sampling/importance_sampling_ratio/max": 2.963585376739502,
"sampling/importance_sampling_ratio/mean": 0.8498660326004028,
"sampling/importance_sampling_ratio/min": 0.2295318990945816,
"sampling/sampling_logp_difference/max": 0.9062635898590088,
"sampling/sampling_logp_difference/mean": 0.06188333407044411,
"step": 93,
"step_time": 38.3763018009995
},
{
"clip_ratio/high_max": 0.01953125,
"clip_ratio/high_mean": 0.01171875,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013671875,
"entropy": 0.22959251329302788,
"epoch": 0.00188,
"grad_norm": 0.6035089492797852,
"kl": 0.23167249467223883,
"learning_rate": 9.999993770388785e-06,
"loss": -0.0067,
"step": 94,
"step_time": 5.798366992999036
},
{
"clip_ratio/high_max": 0.007694128900766373,
"clip_ratio/high_mean": 0.0038470644503831863,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0038470644503831863,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1753.0,
"completions/max_terminated_length": 1753.0,
"completions/mean_length": 1658.03125,
"completions/mean_terminated_length": 1658.03125,
"completions/min_length": 1455.0,
"completions/min_terminated_length": 1455.0,
"entropy": 0.20340878516435623,
"epoch": 0.0019,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7779289484024048,
"kl": 0.3094237130135298,
"learning_rate": 9.99999355372282e-06,
"loss": -0.1794,
"num_tokens": 3565673.0,
"reward": -5.6944780349731445,
"reward_std": 3.949958562850952,
"rewards/rollout_reward_func/mean": -5.6944780349731445,
"rewards/rollout_reward_func/std": 5.5989580154418945,
"sampling/importance_sampling_ratio/max": 1.688936471939087,
"sampling/importance_sampling_ratio/mean": 0.9206300973892212,
"sampling/importance_sampling_ratio/min": 0.1527530699968338,
"sampling/sampling_logp_difference/max": 0.9441490173339844,
"sampling/sampling_logp_difference/mean": 0.050117556005716324,
"step": 95,
"step_time": 37.41757664999841
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.2086612544953823,
"epoch": 0.00192,
"grad_norm": 0.7833954691886902,
"kl": 0.29302166029810905,
"learning_rate": 9.999993333353169e-06,
"loss": -0.1799,
"step": 96,
"step_time": 5.725412834000053
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0028409091755747795,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0028409091755747795,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1804.0,
"completions/max_terminated_length": 1804.0,
"completions/mean_length": 1496.625,
"completions/mean_terminated_length": 1496.625,
"completions/min_length": 185.0,
"completions/min_terminated_length": 185.0,
"entropy": 0.2479400299489498,
"epoch": 0.00194,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1320165395736694,
"kl": 0.3989646164700389,
"learning_rate": 9.999993109279829e-06,
"loss": -0.0546,
"num_tokens": 3635185.0,
"reward": -5.023990154266357,
"reward_std": 3.9877772331237793,
"rewards/rollout_reward_func/mean": -5.023990154266357,
"rewards/rollout_reward_func/std": 8.748732566833496,
"sampling/importance_sampling_ratio/max": 2.214284896850586,
"sampling/importance_sampling_ratio/mean": 0.9846020936965942,
"sampling/importance_sampling_ratio/min": 0.30134767293930054,
"sampling/sampling_logp_difference/max": 0.9754681587219238,
"sampling/sampling_logp_difference/mean": 0.061219509690999985,
"step": 97,
"step_time": 36.602559436000774
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.008984375046566129,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010937500046566129,
"entropy": 0.25142903439700603,
"epoch": 0.00196,
"grad_norm": 1.64491868019104,
"kl": 0.39512724056839943,
"learning_rate": 9.999992881502803e-06,
"loss": -0.0574,
"step": 98,
"step_time": 6.578721888999098
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1808.0,
"completions/max_terminated_length": 1808.0,
"completions/mean_length": 1710.1875,
"completions/mean_terminated_length": 1710.1875,
"completions/min_length": 1435.0,
"completions/min_terminated_length": 1435.0,
"entropy": 0.23437649384140968,
"epoch": 0.00198,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.9414642453193665,
"kl": 0.3236832795664668,
"learning_rate": 9.999992650022092e-06,
"loss": -0.1719,
"num_tokens": 3711125.0,
"reward": -2.0210158824920654,
"reward_std": 4.779875755310059,
"rewards/rollout_reward_func/mean": -2.0210158824920654,
"rewards/rollout_reward_func/std": 6.4040093421936035,
"sampling/importance_sampling_ratio/max": 2.3287320137023926,
"sampling/importance_sampling_ratio/mean": 1.0207520723342896,
"sampling/importance_sampling_ratio/min": 0.35892435908317566,
"sampling/sampling_logp_difference/max": 1.0962285995483398,
"sampling/sampling_logp_difference/mean": 0.044818222522735596,
"step": 99,
"step_time": 36.753581342999496
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.2348782755434513,
"epoch": 0.002,
"grad_norm": 0.9503746628761292,
"kl": 0.3419312732294202,
"learning_rate": 9.999992414837692e-06,
"loss": -0.1746,
"step": 100,
"step_time": 5.861453168999105
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1808.0,
"completions/max_terminated_length": 1808.0,
"completions/mean_length": 1658.5,
"completions/mean_terminated_length": 1658.5,
"completions/min_length": 665.0,
"completions/min_terminated_length": 665.0,
"entropy": 0.2949713133275509,
"epoch": 0.00202,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8095335960388184,
"kl": 0.4149684626609087,
"learning_rate": 9.999992175949606e-06,
"loss": -0.1327,
"num_tokens": 3785245.0,
"reward": -1.9772329330444336,
"reward_std": 6.376922607421875,
"rewards/rollout_reward_func/mean": -1.9772329330444336,
"rewards/rollout_reward_func/std": 9.654932022094727,
"sampling/importance_sampling_ratio/max": 1.6300160884857178,
"sampling/importance_sampling_ratio/mean": 0.9628180265426636,
"sampling/importance_sampling_ratio/min": 0.14600704610347748,
"sampling/sampling_logp_difference/max": 1.1694939136505127,
"sampling/sampling_logp_difference/mean": 0.055236026644706726,
"step": 101,
"step_time": 36.26178865900147
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.2924080714583397,
"epoch": 0.00204,
"grad_norm": 0.8164064884185791,
"kl": 0.48169367760419846,
"learning_rate": 9.999991933357835e-06,
"loss": -0.1325,
"step": 102,
"step_time": 5.8681000480000876
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1814.0,
"completions/max_terminated_length": 1814.0,
"completions/mean_length": 1653.375,
"completions/mean_terminated_length": 1653.375,
"completions/min_length": 663.0,
"completions/min_terminated_length": 663.0,
"entropy": 0.2441821303218603,
"epoch": 0.00206,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1374014616012573,
"kl": 0.19944205041974783,
"learning_rate": 9.999991687062379e-06,
"loss": -0.1332,
"num_tokens": 3859549.0,
"reward": -3.1205766201019287,
"reward_std": 6.89734411239624,
"rewards/rollout_reward_func/mean": -3.1205766201019287,
"rewards/rollout_reward_func/std": 8.786608695983887,
"sampling/importance_sampling_ratio/max": 2.907442092895508,
"sampling/importance_sampling_ratio/mean": 0.9543389678001404,
"sampling/importance_sampling_ratio/min": 0.26082849502563477,
"sampling/sampling_logp_difference/max": 1.416438102722168,
"sampling/sampling_logp_difference/mean": 0.04272625967860222,
"step": 103,
"step_time": 35.06253007600026
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.006510416744276881,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008463541744276881,
"entropy": 0.24291709996759892,
"epoch": 0.00208,
"grad_norm": 1.0697972774505615,
"kl": 0.21361435670405626,
"learning_rate": 9.999991437063234e-06,
"loss": -0.1369,
"step": 104,
"step_time": 6.35797552799977
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1813.0,
"completions/max_terminated_length": 1813.0,
"completions/mean_length": 1677.625,
"completions/mean_terminated_length": 1677.625,
"completions/min_length": 1535.0,
"completions/min_terminated_length": 1535.0,
"entropy": 0.2205460276454687,
"epoch": 0.0021,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7834408283233643,
"kl": 0.41426409501582384,
"learning_rate": 9.999991183360406e-06,
"loss": -0.1262,
"num_tokens": 3934186.0,
"reward": -1.3775752782821655,
"reward_std": 6.934841156005859,
"rewards/rollout_reward_func/mean": -1.3775752782821655,
"rewards/rollout_reward_func/std": 9.078507423400879,
"sampling/importance_sampling_ratio/max": 1.6336519718170166,
"sampling/importance_sampling_ratio/mean": 0.9894572496414185,
"sampling/importance_sampling_ratio/min": 0.09595068544149399,
"sampling/sampling_logp_difference/max": 1.2380528450012207,
"sampling/sampling_logp_difference/mean": 0.04348953068256378,
"step": 105,
"step_time": 37.434679850000975
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"entropy": 0.21956982091069221,
"epoch": 0.00212,
"grad_norm": 0.7701007723808289,
"kl": 0.435552092269063,
"learning_rate": 9.999990925953894e-06,
"loss": -0.1276,
"step": 106,
"step_time": 5.842552839000746
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1784.0,
"completions/max_terminated_length": 1784.0,
"completions/mean_length": 1683.90625,
"completions/mean_terminated_length": 1683.90625,
"completions/min_length": 1482.0,
"completions/min_terminated_length": 1482.0,
"entropy": 0.23681390658020973,
"epoch": 0.00214,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7701250314712524,
"kl": 0.675053995102644,
"learning_rate": 9.999990664843696e-06,
"loss": -0.2223,
"num_tokens": 4009166.0,
"reward": -3.5597405433654785,
"reward_std": 6.297412395477295,
"rewards/rollout_reward_func/mean": -3.5597405433654785,
"rewards/rollout_reward_func/std": 8.763272285461426,
"sampling/importance_sampling_ratio/max": 1.9215384721755981,
"sampling/importance_sampling_ratio/mean": 0.8674914836883545,
"sampling/importance_sampling_ratio/min": 0.052470579743385315,
"sampling/sampling_logp_difference/max": 1.6477103233337402,
"sampling/sampling_logp_difference/mean": 0.06225297600030899,
"step": 107,
"step_time": 38.8391734249999
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.23528996109962463,
"epoch": 0.00216,
"grad_norm": 0.7644266486167908,
"kl": 0.6335257366299629,
"learning_rate": 9.999990400029814e-06,
"loss": -0.2237,
"step": 108,
"step_time": 5.8248516069998
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1759.0,
"completions/max_terminated_length": 1759.0,
"completions/mean_length": 1650.09375,
"completions/mean_terminated_length": 1650.09375,
"completions/min_length": 1436.0,
"completions/min_terminated_length": 1436.0,
"entropy": 0.2473286334425211,
"epoch": 0.00218,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9451829791069031,
"kl": 0.3451357875019312,
"learning_rate": 9.999990131512245e-06,
"loss": -0.1751,
"num_tokens": 4083095.0,
"reward": -3.8237268924713135,
"reward_std": 5.033920764923096,
"rewards/rollout_reward_func/mean": -3.8237268924713135,
"rewards/rollout_reward_func/std": 6.137859344482422,
"sampling/importance_sampling_ratio/max": 1.8476587533950806,
"sampling/importance_sampling_ratio/mean": 1.0128724575042725,
"sampling/importance_sampling_ratio/min": 0.1566690355539322,
"sampling/sampling_logp_difference/max": 1.2108018398284912,
"sampling/sampling_logp_difference/mean": 0.03772260248661041,
"step": 109,
"step_time": 37.583575454999846
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.24676945246756077,
"epoch": 0.0022,
"grad_norm": 0.8795206546783447,
"kl": 0.3198219258338213,
"learning_rate": 9.999989859290995e-06,
"loss": -0.1785,
"step": 110,
"step_time": 6.2266275209995
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1813.0,
"completions/max_terminated_length": 1813.0,
"completions/mean_length": 1680.4375,
"completions/mean_terminated_length": 1680.4375,
"completions/min_length": 1474.0,
"completions/min_terminated_length": 1474.0,
"entropy": 0.2194829098880291,
"epoch": 0.00222,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7685462832450867,
"kl": 0.49289337545633316,
"learning_rate": 9.99998958336606e-06,
"loss": -0.1256,
"num_tokens": 4158181.0,
"reward": -5.0853400230407715,
"reward_std": 3.0313239097595215,
"rewards/rollout_reward_func/mean": -5.0853400230407715,
"rewards/rollout_reward_func/std": 4.776071548461914,
"sampling/importance_sampling_ratio/max": 1.744321584701538,
"sampling/importance_sampling_ratio/mean": 0.8627097606658936,
"sampling/importance_sampling_ratio/min": 0.1638958901166916,
"sampling/sampling_logp_difference/max": 1.4669370651245117,
"sampling/sampling_logp_difference/mean": 0.04983227327466011,
"step": 111,
"step_time": 37.50297103200319
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.2189310658723116,
"epoch": 0.00224,
"grad_norm": 0.7485074400901794,
"kl": 0.480657372623682,
"learning_rate": 9.999989303737442e-06,
"loss": -0.1253,
"step": 112,
"step_time": 5.845547954002541
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1802.0,
"completions/max_terminated_length": 1802.0,
"completions/mean_length": 1632.78125,
"completions/mean_terminated_length": 1632.78125,
"completions/min_length": 280.0,
"completions/min_terminated_length": 280.0,
"entropy": 0.25709761306643486,
"epoch": 0.00226,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0180292129516602,
"kl": 0.28133167419582605,
"learning_rate": 9.999989020405141e-06,
"loss": -0.0265,
"num_tokens": 4231873.0,
"reward": -4.746943950653076,
"reward_std": 5.858287811279297,
"rewards/rollout_reward_func/mean": -4.746943950653076,
"rewards/rollout_reward_func/std": 8.34781551361084,
"sampling/importance_sampling_ratio/max": 1.501573920249939,
"sampling/importance_sampling_ratio/mean": 0.8685078024864197,
"sampling/importance_sampling_ratio/min": 0.08650124073028564,
"sampling/sampling_logp_difference/max": 1.4456124305725098,
"sampling/sampling_logp_difference/mean": 0.04686921089887619,
"step": 113,
"step_time": 35.9479068110013
},
{
"clip_ratio/high_max": 0.014062500093132257,
"clip_ratio/high_mean": 0.007031250046566129,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010937500046566129,
"entropy": 0.2569599896669388,
"epoch": 0.00228,
"grad_norm": 0.9531951546669006,
"kl": 0.2381299063563347,
"learning_rate": 9.999988733369157e-06,
"loss": -0.0309,
"step": 114,
"step_time": 6.549294945001748
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1823.0,
"completions/max_terminated_length": 1823.0,
"completions/mean_length": 1726.15625,
"completions/mean_terminated_length": 1726.15625,
"completions/min_length": 1611.0,
"completions/min_terminated_length": 1611.0,
"entropy": 0.23173769749701023,
"epoch": 0.0023,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.8843880891799927,
"kl": 0.37245292216539383,
"learning_rate": 9.999988442629489e-06,
"loss": 0.0203,
"num_tokens": 4308553.0,
"reward": -4.190584182739258,
"reward_std": 2.972388744354248,
"rewards/rollout_reward_func/mean": -4.190584182739258,
"rewards/rollout_reward_func/std": 5.20486307144165,
"sampling/importance_sampling_ratio/max": 1.6944365501403809,
"sampling/importance_sampling_ratio/mean": 0.9236411452293396,
"sampling/importance_sampling_ratio/min": 0.13073918223381042,
"sampling/sampling_logp_difference/max": 1.3462402820587158,
"sampling/sampling_logp_difference/mean": 0.052857253700494766,
"step": 115,
"step_time": 38.24738468399846
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 0.23898235149681568,
"epoch": 0.00232,
"grad_norm": 0.7934396266937256,
"kl": 0.37963598500937223,
"learning_rate": 9.99998814818614e-06,
"loss": 0.0198,
"step": 116,
"step_time": 5.872056240998063
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1798.0,
"completions/max_terminated_length": 1798.0,
"completions/mean_length": 1668.34375,
"completions/mean_terminated_length": 1668.34375,
"completions/min_length": 1463.0,
"completions/min_terminated_length": 1463.0,
"entropy": 0.2294948324561119,
"epoch": 0.00234,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2376160621643066,
"kl": 0.2936890870332718,
"learning_rate": 9.999987850039108e-06,
"loss": -0.1774,
"num_tokens": 4383556.0,
"reward": -3.607532501220703,
"reward_std": 5.6845526695251465,
"rewards/rollout_reward_func/mean": -3.607532501220703,
"rewards/rollout_reward_func/std": 8.885178565979004,
"sampling/importance_sampling_ratio/max": 2.275712251663208,
"sampling/importance_sampling_ratio/mean": 1.1071913242340088,
"sampling/importance_sampling_ratio/min": 0.16783574223518372,
"sampling/sampling_logp_difference/max": 1.2079877853393555,
"sampling/sampling_logp_difference/mean": 0.04779823124408722,
"step": 117,
"step_time": 37.413396432997615
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"entropy": 0.23724446073174477,
"epoch": 0.00236,
"grad_norm": 1.1334284543991089,
"kl": 0.257572659291327,
"learning_rate": 9.999987548188395e-06,
"loss": -0.1829,
"step": 118,
"step_time": 5.831991403998472
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1806.0,
"completions/max_terminated_length": 1806.0,
"completions/mean_length": 1690.75,
"completions/mean_terminated_length": 1690.75,
"completions/min_length": 1531.0,
"completions/min_terminated_length": 1531.0,
"entropy": 0.27842383086681366,
"epoch": 0.00238,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7958987355232239,
"kl": 0.3204949628561735,
"learning_rate": 9.999987242634e-06,
"loss": -0.104,
"num_tokens": 4458969.0,
"reward": -8.779083251953125,
"reward_std": 6.466405868530273,
"rewards/rollout_reward_func/mean": -8.779083251953125,
"rewards/rollout_reward_func/std": 12.54110336303711,
"sampling/importance_sampling_ratio/max": 1.7512174844741821,
"sampling/importance_sampling_ratio/mean": 0.9728891849517822,
"sampling/importance_sampling_ratio/min": 0.23163382709026337,
"sampling/sampling_logp_difference/max": 1.429762363433838,
"sampling/sampling_logp_difference/mean": 0.059581462293863297,
"step": 119,
"step_time": 37.53063563500109
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"entropy": 0.2830961886793375,
"epoch": 0.0024,
"grad_norm": 0.7229918837547302,
"kl": 0.32604870945215225,
"learning_rate": 9.999986933375924e-06,
"loss": -0.107,
"step": 120,
"step_time": 6.557443951995083
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1802.0,
"completions/max_terminated_length": 1802.0,
"completions/mean_length": 1673.0625,
"completions/mean_terminated_length": 1669.4515380859375,
"completions/min_length": 1072.0,
"completions/min_terminated_length": 1072.0,
"entropy": 0.3067500479519367,
"epoch": 0.00242,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1562621593475342,
"kl": 0.21205937396734953,
"learning_rate": 9.999986620414169e-06,
"loss": -0.1476,
"num_tokens": 4533539.0,
"reward": -5.477802276611328,
"reward_std": 3.7002973556518555,
"rewards/rollout_reward_func/mean": -5.477802276611328,
"rewards/rollout_reward_func/std": 4.9684367179870605,
"sampling/importance_sampling_ratio/max": 2.1120123863220215,
"sampling/importance_sampling_ratio/mean": 1.002963900566101,
"sampling/importance_sampling_ratio/min": 0.1644321084022522,
"sampling/sampling_logp_difference/max": 0.7201070785522461,
"sampling/sampling_logp_difference/mean": 0.044744931161403656,
"step": 121,
"step_time": 37.94513329599977
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"entropy": 0.30317614786326885,
"epoch": 0.00244,
"grad_norm": 1.1647439002990723,
"kl": 0.2262652236968279,
"learning_rate": 9.999986303748731e-06,
"loss": -0.1508,
"step": 122,
"step_time": 5.857959121001841
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1785.0,
"completions/max_terminated_length": 1785.0,
"completions/mean_length": 1429.46875,
"completions/mean_terminated_length": 1429.46875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.22994763404130936,
"epoch": 0.00246,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0889474153518677,
"kl": 0.7471896391361952,
"learning_rate": 9.999985983379614e-06,
"loss": -0.1179,
"num_tokens": 4600314.0,
"reward": -1.682168960571289,
"reward_std": 8.262513160705566,
"rewards/rollout_reward_func/mean": -1.682168960571289,
"rewards/rollout_reward_func/std": 16.95815658569336,
"sampling/importance_sampling_ratio/max": 2.0507054328918457,
"sampling/importance_sampling_ratio/mean": 0.9804076552391052,
"sampling/importance_sampling_ratio/min": 0.13134591281414032,
"sampling/sampling_logp_difference/max": 1.8472480773925781,
"sampling/sampling_logp_difference/mean": 0.06341268122196198,
"step": 123,
"step_time": 33.21367415700297
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.22646107524633408,
"epoch": 0.00248,
"grad_norm": 1.079579472541809,
"kl": 0.8753251153975725,
"learning_rate": 9.999985659306817e-06,
"loss": -0.121,
"step": 124,
"step_time": 5.792652656999053
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1804.0,
"completions/max_terminated_length": 1804.0,
"completions/mean_length": 1706.4375,
"completions/mean_terminated_length": 1706.4375,
"completions/min_length": 1559.0,
"completions/min_terminated_length": 1559.0,
"entropy": 0.282099112868309,
"epoch": 0.0025,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3554356098175049,
"kl": 0.2657380551099777,
"learning_rate": 9.999985331530339e-06,
"loss": -0.0685,
"num_tokens": 4676158.0,
"reward": 1.4572546482086182,
"reward_std": 7.827357292175293,
"rewards/rollout_reward_func/mean": 1.4572546482086182,
"rewards/rollout_reward_func/std": 8.701656341552734,
"sampling/importance_sampling_ratio/max": 2.6973798274993896,
"sampling/importance_sampling_ratio/mean": 0.9617570638656616,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.366655945777893,
"sampling/sampling_logp_difference/mean": 0.06495144963264465,
"step": 125,
"step_time": 38.350035333998676
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.2868986092507839,
"epoch": 0.00252,
"grad_norm": 1.1578820943832397,
"kl": 0.2646348997950554,
"learning_rate": 9.999985000050181e-06,
"loss": -0.0723,
"step": 126,
"step_time": 6.268096281999533
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1826.0,
"completions/max_terminated_length": 1826.0,
"completions/mean_length": 1739.8125,
"completions/mean_terminated_length": 1739.8125,
"completions/min_length": 1644.0,
"completions/min_terminated_length": 1644.0,
"entropy": 0.26093689538538456,
"epoch": 0.00254,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2453805208206177,
"kl": 0.4399577025324106,
"learning_rate": 9.999984664866347e-06,
"loss": -0.0086,
"num_tokens": 4753406.0,
"reward": -2.7315988540649414,
"reward_std": 4.536945343017578,
"rewards/rollout_reward_func/mean": -2.7315988540649414,
"rewards/rollout_reward_func/std": 7.6850104331970215,
"sampling/importance_sampling_ratio/max": 2.3371880054473877,
"sampling/importance_sampling_ratio/mean": 1.0783873796463013,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.2284293174743652,
"sampling/sampling_logp_difference/mean": 0.04868567734956741,
"step": 127,
"step_time": 37.24148940499981
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"entropy": 0.26575652323663235,
"epoch": 0.00256,
"grad_norm": 1.235435962677002,
"kl": 0.4320835890248418,
"learning_rate": 9.999984325978833e-06,
"loss": -0.0116,
"step": 128,
"step_time": 5.902758816999267
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1805.0,
"completions/max_terminated_length": 1805.0,
"completions/mean_length": 1686.875,
"completions/mean_terminated_length": 1686.875,
"completions/min_length": 1159.0,
"completions/min_terminated_length": 1159.0,
"entropy": 0.3018810376524925,
"epoch": 0.00258,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9894475936889648,
"kl": 0.32896456494927406,
"learning_rate": 9.99998398338764e-06,
"loss": -0.089,
"num_tokens": 4828478.0,
"reward": -1.9806309938430786,
"reward_std": 5.783495903015137,
"rewards/rollout_reward_func/mean": -1.9806309938430786,
"rewards/rollout_reward_func/std": 9.691821098327637,
"sampling/importance_sampling_ratio/max": 2.419085741043091,
"sampling/importance_sampling_ratio/mean": 0.9663759469985962,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.6888089179992676,
"sampling/sampling_logp_difference/mean": 0.06528542190790176,
"step": 129,
"step_time": 37.874985575997925
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.3036986291408539,
"epoch": 0.0026,
"grad_norm": 0.9928931593894958,
"kl": 0.33502755127847195,
"learning_rate": 9.99998363709277e-06,
"loss": -0.0895,
"step": 130,
"step_time": 5.84719717599728
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1772.0,
"completions/max_terminated_length": 1772.0,
"completions/mean_length": 1671.9375,
"completions/mean_terminated_length": 1671.9375,
"completions/min_length": 1508.0,
"completions/min_terminated_length": 1508.0,
"entropy": 0.276044437661767,
"epoch": 0.00262,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.8475321531295776,
"kl": 0.3156882934272289,
"learning_rate": 9.999983287094222e-06,
"loss": -0.0246,
"num_tokens": 4903229.0,
"reward": -4.237326145172119,
"reward_std": 7.174188613891602,
"rewards/rollout_reward_func/mean": -4.237326145172119,
"rewards/rollout_reward_func/std": 13.78705883026123,
"sampling/importance_sampling_ratio/max": 2.0592706203460693,
"sampling/importance_sampling_ratio/mean": 0.9671538472175598,
"sampling/importance_sampling_ratio/min": 0.17537109553813934,
"sampling/sampling_logp_difference/max": 1.267343282699585,
"sampling/sampling_logp_difference/mean": 0.05497532710433006,
"step": 131,
"step_time": 38.15398663500309
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.27523134648799896,
"epoch": 0.00264,
"grad_norm": 0.8275483250617981,
"kl": 0.3333571758121252,
"learning_rate": 9.999982933391998e-06,
"loss": -0.0265,
"step": 132,
"step_time": 6.213580482999532
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1787.0,
"completions/max_terminated_length": 1787.0,
"completions/mean_length": 1690.6875,
"completions/mean_terminated_length": 1690.6875,
"completions/min_length": 1552.0,
"completions/min_terminated_length": 1552.0,
"entropy": 0.31048087403178215,
"epoch": 0.00266,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9148619771003723,
"kl": 0.5922599658370018,
"learning_rate": 9.999982575986095e-06,
"loss": -0.1689,
"num_tokens": 4978651.0,
"reward": -7.106402397155762,
"reward_std": 7.335752487182617,
"rewards/rollout_reward_func/mean": -7.106402397155762,
"rewards/rollout_reward_func/std": 10.287908554077148,
"sampling/importance_sampling_ratio/max": 2.353391170501709,
"sampling/importance_sampling_ratio/mean": 0.7480576038360596,
"sampling/importance_sampling_ratio/min": 0.10871558636426926,
"sampling/sampling_logp_difference/max": 1.7691650390625,
"sampling/sampling_logp_difference/mean": 0.08270461857318878,
"step": 133,
"step_time": 37.87937480399705
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 0.3103002533316612,
"epoch": 0.00268,
"grad_norm": 0.8720380067825317,
"kl": 0.596416313201189,
"learning_rate": 9.999982214876516e-06,
"loss": -0.1711,
"step": 134,
"step_time": 5.838397514999087
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1810.0,
"completions/max_terminated_length": 1810.0,
"completions/mean_length": 1734.5,
"completions/mean_terminated_length": 1734.5,
"completions/min_length": 1659.0,
"completions/min_terminated_length": 1659.0,
"entropy": 0.296902384608984,
"epoch": 0.0027,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5514373779296875,
"kl": 0.32854650542140007,
"learning_rate": 9.999981850063262e-06,
"loss": -0.2692,
"num_tokens": 5055484.0,
"reward": 2.0809240341186523,
"reward_std": 5.269416809082031,
"rewards/rollout_reward_func/mean": 2.0809240341186523,
"rewards/rollout_reward_func/std": 7.5051188468933105,
"sampling/importance_sampling_ratio/max": 2.7047743797302246,
"sampling/importance_sampling_ratio/mean": 1.0847183465957642,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.3921051025390625,
"sampling/sampling_logp_difference/mean": 0.07780618220567703,
"step": 135,
"step_time": 37.11384174500017
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 0.2912406101822853,
"epoch": 0.00272,
"grad_norm": 1.203852653503418,
"kl": 0.34848837181925774,
"learning_rate": 9.99998148154633e-06,
"loss": -0.2722,
"step": 136,
"step_time": 6.586184759000389
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1788.0,
"completions/max_terminated_length": 1788.0,
"completions/mean_length": 1674.96875,
"completions/mean_terminated_length": 1674.96875,
"completions/min_length": 1567.0,
"completions/min_terminated_length": 1567.0,
"entropy": 0.2682835068553686,
"epoch": 0.00274,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.322084903717041,
"kl": 0.4422433190047741,
"learning_rate": 9.999981109325725e-06,
"loss": 0.0099,
"num_tokens": 5130363.0,
"reward": -0.8909265995025635,
"reward_std": 6.19950008392334,
"rewards/rollout_reward_func/mean": -0.8909265995025635,
"rewards/rollout_reward_func/std": 11.643590927124023,
"sampling/importance_sampling_ratio/max": 2.353513717651367,
"sampling/importance_sampling_ratio/mean": 1.027718186378479,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.9235548973083496,
"sampling/sampling_logp_difference/mean": 0.05995417386293411,
"step": 137,
"step_time": 37.823591190001025
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.2614995054900646,
"epoch": 0.00276,
"grad_norm": 1.342252254486084,
"kl": 0.4471647199243307,
"learning_rate": 9.999980733401442e-06,
"loss": 0.0087,
"step": 138,
"step_time": 5.804548355996303
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1774.0,
"completions/max_terminated_length": 1774.0,
"completions/mean_length": 1635.40625,
"completions/mean_terminated_length": 1635.40625,
"completions/min_length": 637.0,
"completions/min_terminated_length": 637.0,
"entropy": 0.28644070588052273,
"epoch": 0.00278,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5117998123168945,
"kl": 0.37019682209938765,
"learning_rate": 9.999980353773486e-06,
"loss": -0.0649,
"num_tokens": 5204087.0,
"reward": -2.6386919021606445,
"reward_std": 7.861667156219482,
"rewards/rollout_reward_func/mean": -2.6386919021606445,
"rewards/rollout_reward_func/std": 9.34830093383789,
"sampling/importance_sampling_ratio/max": 2.392774820327759,
"sampling/importance_sampling_ratio/mean": 0.8888267278671265,
"sampling/importance_sampling_ratio/min": 0.06766009330749512,
"sampling/sampling_logp_difference/max": 0.964139461517334,
"sampling/sampling_logp_difference/mean": 0.07269679009914398,
"step": 139,
"step_time": 36.273601793001944
},
{
"clip_ratio/high_max": 0.02734375,
"clip_ratio/high_mean": 0.013671875,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"entropy": 0.2876611240208149,
"epoch": 0.0028,
"grad_norm": 1.0806161165237427,
"kl": 0.3748003738000989,
"learning_rate": 9.999979970441856e-06,
"loss": -0.066,
"step": 140,
"step_time": 5.793748694000897
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1778.0,
"completions/max_terminated_length": 1778.0,
"completions/mean_length": 1630.625,
"completions/mean_terminated_length": 1630.625,
"completions/min_length": 1075.0,
"completions/min_terminated_length": 1075.0,
"entropy": 0.2753357030451298,
"epoch": 0.00282,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2243664264678955,
"kl": 0.3961847685277462,
"learning_rate": 9.999979583406551e-06,
"loss": -0.1401,
"num_tokens": 5277342.0,
"reward": -0.9446412324905396,
"reward_std": 6.179128646850586,
"rewards/rollout_reward_func/mean": -0.9446412324905396,
"rewards/rollout_reward_func/std": 7.662261009216309,
"sampling/importance_sampling_ratio/max": 2.07578706741333,
"sampling/importance_sampling_ratio/mean": 0.9215522408485413,
"sampling/importance_sampling_ratio/min": 0.17828358709812164,
"sampling/sampling_logp_difference/max": 1.095733880996704,
"sampling/sampling_logp_difference/mean": 0.06696178764104843,
"step": 141,
"step_time": 37.386809142004495
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.2797367610037327,
"epoch": 0.00284,
"grad_norm": 1.1719934940338135,
"kl": 0.36871890537440777,
"learning_rate": 9.999979192667574e-06,
"loss": -0.1444,
"step": 142,
"step_time": 6.457391539001037
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1805.0,
"completions/max_terminated_length": 1805.0,
"completions/mean_length": 1678.625,
"completions/mean_terminated_length": 1678.625,
"completions/min_length": 1457.0,
"completions/min_terminated_length": 1457.0,
"entropy": 0.2741607278585434,
"epoch": 0.00286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8713260293006897,
"kl": 0.43569475039839745,
"learning_rate": 9.999978798224922e-06,
"loss": -0.138,
"num_tokens": 5352366.0,
"reward": 0.8101233839988708,
"reward_std": 3.6327834129333496,
"rewards/rollout_reward_func/mean": 0.8101233839988708,
"rewards/rollout_reward_func/std": 7.983520030975342,
"sampling/importance_sampling_ratio/max": 2.63127064704895,
"sampling/importance_sampling_ratio/mean": 0.9456866979598999,
"sampling/importance_sampling_ratio/min": 0.17168530821800232,
"sampling/sampling_logp_difference/max": 1.6172382831573486,
"sampling/sampling_logp_difference/mean": 0.05936865881085396,
"step": 143,
"step_time": 38.01681525400272
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"entropy": 0.28014545887708664,
"epoch": 0.00288,
"grad_norm": 0.8923928737640381,
"kl": 0.40905678272247314,
"learning_rate": 9.999978400078598e-06,
"loss": -0.1408,
"step": 144,
"step_time": 5.900416173997655
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1773.0,
"completions/max_terminated_length": 1773.0,
"completions/mean_length": 1679.46875,
"completions/mean_terminated_length": 1679.46875,
"completions/min_length": 1426.0,
"completions/min_terminated_length": 1426.0,
"entropy": 0.28977033123373985,
"epoch": 0.0029,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2021753787994385,
"kl": 0.9492019787430763,
"learning_rate": 9.9999779982286e-06,
"loss": -0.1518,
"num_tokens": 5427370.0,
"reward": -1.4256936311721802,
"reward_std": 5.131044387817383,
"rewards/rollout_reward_func/mean": -1.4256936311721802,
"rewards/rollout_reward_func/std": 6.864547252655029,
"sampling/importance_sampling_ratio/max": 2.061087131500244,
"sampling/importance_sampling_ratio/mean": 0.7186421155929565,
"sampling/importance_sampling_ratio/min": 0.03768601268529892,
"sampling/sampling_logp_difference/max": 1.7941226959228516,
"sampling/sampling_logp_difference/mean": 0.0829053670167923,
"step": 145,
"step_time": 35.9784139159965
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.2919359765946865,
"epoch": 0.00292,
"grad_norm": 0.8570596575737,
"kl": 0.9193199034780264,
"learning_rate": 9.999977592674933e-06,
"loss": -0.1533,
"step": 146,
"step_time": 5.8281254610010365
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1779.0,
"completions/max_terminated_length": 1779.0,
"completions/mean_length": 1666.84375,
"completions/mean_terminated_length": 1666.84375,
"completions/min_length": 1414.0,
"completions/min_terminated_length": 1414.0,
"entropy": 0.28393640369176865,
"epoch": 0.00294,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9220053553581238,
"kl": 0.4149657338857651,
"learning_rate": 9.999977183417593e-06,
"loss": -0.1243,
"num_tokens": 5502317.0,
"reward": -4.950237274169922,
"reward_std": 7.430306434631348,
"rewards/rollout_reward_func/mean": -4.950237274169922,
"rewards/rollout_reward_func/std": 8.644596099853516,
"sampling/importance_sampling_ratio/max": 1.6269994974136353,
"sampling/importance_sampling_ratio/mean": 0.8315407633781433,
"sampling/importance_sampling_ratio/min": 0.07847892493009567,
"sampling/sampling_logp_difference/max": 1.8345155715942383,
"sampling/sampling_logp_difference/mean": 0.07923141121864319,
"step": 147,
"step_time": 37.406879453998044
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.28766966238617897,
"epoch": 0.00296,
"grad_norm": 0.9152698516845703,
"kl": 0.3896927610039711,
"learning_rate": 9.999976770456581e-06,
"loss": -0.126,
"step": 148,
"step_time": 6.265031434000775
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1810.0,
"completions/max_terminated_length": 1810.0,
"completions/mean_length": 1691.28125,
"completions/mean_terminated_length": 1691.28125,
"completions/min_length": 1595.0,
"completions/min_terminated_length": 1595.0,
"entropy": 0.251856479793787,
"epoch": 0.00298,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9210507869720459,
"kl": 0.4119174964725971,
"learning_rate": 9.999976353791898e-06,
"loss": -0.1814,
"num_tokens": 5577778.0,
"reward": -1.1243976354599,
"reward_std": 5.285574436187744,
"rewards/rollout_reward_func/mean": -1.1243976354599,
"rewards/rollout_reward_func/std": 7.979835510253906,
"sampling/importance_sampling_ratio/max": 2.8175506591796875,
"sampling/importance_sampling_ratio/mean": 1.1608731746673584,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.2149560451507568,
"sampling/sampling_logp_difference/mean": 0.06952120363712311,
"step": 149,
"step_time": 37.22757341200122
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.25096623599529266,
"epoch": 0.003,
"grad_norm": 0.9395397305488586,
"kl": 0.4342615343630314,
"learning_rate": 9.999975933423546e-06,
"loss": -0.184,
"step": 150,
"step_time": 5.87532146200283
}
],
"logging_steps": 1.0,
"max_steps": 100000,
"num_input_tokens_seen": 5577778,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}