envgnr-Qwen3b-hyperGG-commit-1 / trainer_state.json
Gege24's picture
Upload folder using huggingface_hub
3770953 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.0093,
"eval_steps": 500,
"global_step": 465,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 822.0,
"completions/max_terminated_length": 822.0,
"completions/mean_length": 741.390625,
"completions/mean_terminated_length": 741.390625,
"completions/min_length": 296.0,
"completions/min_terminated_length": 296.0,
"entropy": 0.19560225727036595,
"epoch": 2e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6278125643730164,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.0026,
"num_tokens": 102275.0,
"reward": -0.2660611569881439,
"reward_std": 9.006877899169922,
"rewards/rollout_reward_func/mean": -0.26606130599975586,
"rewards/rollout_reward_func/std": 10.133543014526367,
"sampling/importance_sampling_ratio/max": 1.4521965980529785,
"sampling/importance_sampling_ratio/mean": 1.0252978801727295,
"sampling/importance_sampling_ratio/min": 0.6192880272865295,
"sampling/sampling_logp_difference/max": 0.35935235023498535,
"sampling/sampling_logp_difference/mean": 0.013161457143723965,
"step": 1,
"step_time": 18.950907858999926
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.19560225727036595,
"epoch": 4e-05,
"grad_norm": 0.6270994544029236,
"kl": 0.0,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.0026,
"step": 2,
"step_time": 6.845600487000297
},
{
"clip_ratio/high_max": 0.010416666977107525,
"clip_ratio/high_mean": 0.0026041667442768812,
"clip_ratio/low_mean": 0.0026041667442768812,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0052083334885537624,
"completions/clipped_ratio": 0.0,
"completions/max_length": 817.0,
"completions/max_terminated_length": 817.0,
"completions/mean_length": 745.078125,
"completions/mean_terminated_length": 745.078125,
"completions/min_length": 290.0,
"completions/min_terminated_length": 290.0,
"entropy": 0.1830942602828145,
"epoch": 6e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6748153567314148,
"kl": 0.0004804102204616356,
"learning_rate": 5.7142857142857145e-06,
"loss": -0.0139,
"num_tokens": 204643.0,
"reward": 0.07987305521965027,
"reward_std": 6.112407207489014,
"rewards/rollout_reward_func/mean": 0.07987302541732788,
"rewards/rollout_reward_func/std": 6.9746317863464355,
"sampling/importance_sampling_ratio/max": 1.6137751340866089,
"sampling/importance_sampling_ratio/mean": 1.0131056308746338,
"sampling/importance_sampling_ratio/min": 0.5117371678352356,
"sampling/sampling_logp_difference/max": 0.6347737312316895,
"sampling/sampling_logp_difference/mean": 0.013132400810718536,
"step": 3,
"step_time": 20.457778603000065
},
{
"clip_ratio/high_max": 0.0062500000931322575,
"clip_ratio/high_mean": 0.0015625000232830644,
"clip_ratio/low_mean": 0.0013020833721384406,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002864583395421505,
"entropy": 0.18449228629469872,
"epoch": 8e-05,
"grad_norm": 0.7855743169784546,
"kl": 0.0004326992366259219,
"learning_rate": 8.571428571428573e-06,
"loss": -0.0127,
"step": 4,
"step_time": 7.153126219000001
},
{
"clip_ratio/high_max": 0.015625000465661287,
"clip_ratio/high_mean": 0.003906250116415322,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003906250116415322,
"completions/clipped_ratio": 0.0,
"completions/max_length": 830.0,
"completions/max_terminated_length": 830.0,
"completions/mean_length": 773.3125,
"completions/mean_terminated_length": 773.3125,
"completions/min_length": 691.0,
"completions/min_terminated_length": 691.0,
"entropy": 0.19608404766768217,
"epoch": 0.0001,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6154729723930359,
"kl": 0.0007404440693790093,
"learning_rate": 1.1428571428571429e-05,
"loss": -0.0267,
"num_tokens": 308926.0,
"reward": -2.357975721359253,
"reward_std": 5.998347282409668,
"rewards/rollout_reward_func/mean": -2.357975721359253,
"rewards/rollout_reward_func/std": 6.508192539215088,
"sampling/importance_sampling_ratio/max": 1.5696072578430176,
"sampling/importance_sampling_ratio/mean": 1.0018606185913086,
"sampling/importance_sampling_ratio/min": 0.6378414630889893,
"sampling/sampling_logp_difference/max": 0.4687232971191406,
"sampling/sampling_logp_difference/mean": 0.014497373253107071,
"step": 5,
"step_time": 21.077881563999767
},
{
"clip_ratio/high_max": 0.010416666977107525,
"clip_ratio/high_mean": 0.0026041667442768812,
"clip_ratio/low_mean": 0.0027225379599258304,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005326704704202712,
"entropy": 0.20008834172040224,
"epoch": 0.00012,
"grad_norm": 0.613211989402771,
"kl": 0.0017206422435265267,
"learning_rate": 1.4285714285714285e-05,
"loss": -0.0283,
"step": 6,
"step_time": 8.075609097999632
},
{
"clip_ratio/high_max": 0.0052083334885537624,
"clip_ratio/high_mean": 0.0013020833721384406,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0013020833721384406,
"completions/clipped_ratio": 0.0,
"completions/max_length": 843.0,
"completions/max_terminated_length": 843.0,
"completions/mean_length": 754.3125,
"completions/mean_terminated_length": 754.3125,
"completions/min_length": 302.0,
"completions/min_terminated_length": 302.0,
"entropy": 0.21214309986680746,
"epoch": 0.00014,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5866758823394775,
"kl": 0.0038609652619925328,
"learning_rate": 1.7142857142857145e-05,
"loss": 0.0015,
"num_tokens": 413194.0,
"reward": -0.5192327499389648,
"reward_std": 8.747434616088867,
"rewards/rollout_reward_func/mean": -0.5192328095436096,
"rewards/rollout_reward_func/std": 9.696125030517578,
"sampling/importance_sampling_ratio/max": 1.3741450309753418,
"sampling/importance_sampling_ratio/mean": 0.988805890083313,
"sampling/importance_sampling_ratio/min": 0.6078794002532959,
"sampling/sampling_logp_difference/max": 0.25654804706573486,
"sampling/sampling_logp_difference/mean": 0.012450095266103745,
"step": 7,
"step_time": 21.18102895699974
},
{
"clip_ratio/high_max": 0.042140152771025896,
"clip_ratio/high_mean": 0.010535038192756474,
"clip_ratio/low_mean": 0.011718750349245965,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02225378854200244,
"entropy": 0.2212895406410098,
"epoch": 0.00016,
"grad_norm": 0.5727657675743103,
"kl": 0.01148045047011692,
"learning_rate": 2e-05,
"loss": 0.0002,
"step": 8,
"step_time": 8.206865795999875
},
{
"clip_ratio/high_max": 0.010416666977107525,
"clip_ratio/high_mean": 0.0026041667442768812,
"clip_ratio/low_mean": 0.004142992664128542,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006747159408405423,
"completions/clipped_ratio": 0.0,
"completions/max_length": 822.0,
"completions/max_terminated_length": 822.0,
"completions/mean_length": 742.765625,
"completions/mean_terminated_length": 742.765625,
"completions/min_length": 286.0,
"completions/min_terminated_length": 286.0,
"entropy": 0.25475312024354935,
"epoch": 0.00018,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7087565064430237,
"kl": 0.03194100991822779,
"learning_rate": 2.2857142857142858e-05,
"loss": 0.0241,
"num_tokens": 516181.0,
"reward": -2.378840684890747,
"reward_std": 6.36100959777832,
"rewards/rollout_reward_func/mean": -2.378840446472168,
"rewards/rollout_reward_func/std": 7.315836429595947,
"sampling/importance_sampling_ratio/max": 1.6080894470214844,
"sampling/importance_sampling_ratio/mean": 1.0152499675750732,
"sampling/importance_sampling_ratio/min": 0.4359276592731476,
"sampling/sampling_logp_difference/max": 0.4399428367614746,
"sampling/sampling_logp_difference/mean": 0.028559193015098572,
"step": 9,
"step_time": 22.1539967839999
},
{
"clip_ratio/high_max": 0.04734848625957966,
"clip_ratio/high_mean": 0.013139204937033355,
"clip_ratio/low_mean": 0.007930871448479593,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.021070076385512948,
"entropy": 0.26416848599910736,
"epoch": 0.0002,
"grad_norm": 0.6573855876922607,
"kl": 0.03966027498245239,
"learning_rate": 2.5714285714285714e-05,
"loss": 0.021,
"step": 10,
"step_time": 7.0971175310000945
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 822.0,
"completions/max_terminated_length": 822.0,
"completions/mean_length": 750.75,
"completions/mean_terminated_length": 750.75,
"completions/min_length": 608.0,
"completions/min_terminated_length": 608.0,
"entropy": 0.2343001812696457,
"epoch": 0.00022,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.864137589931488,
"kl": 0.03614223480690271,
"learning_rate": 2.857142857142857e-05,
"loss": -0.018,
"num_tokens": 619774.0,
"reward": -1.144383430480957,
"reward_std": 9.403154373168945,
"rewards/rollout_reward_func/mean": -1.144383192062378,
"rewards/rollout_reward_func/std": 10.208455085754395,
"sampling/importance_sampling_ratio/max": 1.6737509965896606,
"sampling/importance_sampling_ratio/mean": 1.0005735158920288,
"sampling/importance_sampling_ratio/min": 0.5264889001846313,
"sampling/sampling_logp_difference/max": 0.7381381988525391,
"sampling/sampling_logp_difference/mean": 0.03099803999066353,
"step": 11,
"step_time": 24.1759815060002
},
{
"clip_ratio/high_max": 0.04261363763362169,
"clip_ratio/high_mean": 0.011955492780543864,
"clip_ratio/low_mean": 0.018229166977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.030184659990482032,
"entropy": 0.23695118725299835,
"epoch": 0.00024,
"grad_norm": 0.5675711631774902,
"kl": 0.05231437139445916,
"learning_rate": 3.142857142857143e-05,
"loss": -0.0247,
"step": 12,
"step_time": 7.241505567000331
},
{
"clip_ratio/high_max": 0.0052083334885537624,
"clip_ratio/high_mean": 0.0013020833721384406,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0013020833721384406,
"completions/clipped_ratio": 0.0,
"completions/max_length": 827.0,
"completions/max_terminated_length": 827.0,
"completions/mean_length": 766.03125,
"completions/mean_terminated_length": 766.03125,
"completions/min_length": 638.0,
"completions/min_terminated_length": 638.0,
"entropy": 0.23159058205783367,
"epoch": 0.00026,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.560762345790863,
"kl": 0.10253941919654608,
"learning_rate": 3.428571428571429e-05,
"loss": -0.0132,
"num_tokens": 725286.0,
"reward": 0.9126645922660828,
"reward_std": 8.317488670349121,
"rewards/rollout_reward_func/mean": 0.9126646518707275,
"rewards/rollout_reward_func/std": 9.508187294006348,
"sampling/importance_sampling_ratio/max": 1.4912891387939453,
"sampling/importance_sampling_ratio/mean": 0.9157562255859375,
"sampling/importance_sampling_ratio/min": 0.15846048295497894,
"sampling/sampling_logp_difference/max": 0.9116353988647461,
"sampling/sampling_logp_difference/mean": 0.03342486917972565,
"step": 13,
"step_time": 24.797564555999315
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.0052083334885537624,
"clip_ratio/low_mean": 0.04107481171377003,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.046283145202323794,
"entropy": 0.21706843469291925,
"epoch": 0.00028,
"grad_norm": 0.737306535243988,
"kl": 0.20574123412370682,
"learning_rate": 3.7142857142857143e-05,
"loss": -0.0141,
"step": 14,
"step_time": 8.782559869000124
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.002864583395421505,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002864583395421505,
"completions/clipped_ratio": 0.0,
"completions/max_length": 838.0,
"completions/max_terminated_length": 838.0,
"completions/mean_length": 730.265625,
"completions/mean_terminated_length": 730.265625,
"completions/min_length": 293.0,
"completions/min_terminated_length": 293.0,
"entropy": 0.20442467741668224,
"epoch": 0.0003,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7728816866874695,
"kl": 0.10564538510516286,
"learning_rate": 4e-05,
"loss": 0.0314,
"num_tokens": 827749.0,
"reward": -1.6128692626953125,
"reward_std": 6.231240272521973,
"rewards/rollout_reward_func/mean": -1.6128690242767334,
"rewards/rollout_reward_func/std": 6.545647621154785,
"sampling/importance_sampling_ratio/max": 1.7540509700775146,
"sampling/importance_sampling_ratio/mean": 1.0142356157302856,
"sampling/importance_sampling_ratio/min": 0.45990973711013794,
"sampling/sampling_logp_difference/max": 0.7248215675354004,
"sampling/sampling_logp_difference/mean": 0.030622530728578568,
"step": 15,
"step_time": 24.38517581200017
},
{
"clip_ratio/high_max": 0.047821971122175455,
"clip_ratio/high_mean": 0.013257576036266983,
"clip_ratio/low_mean": 0.025236743036657572,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0384943193057552,
"entropy": 0.19760818500071764,
"epoch": 0.00032,
"grad_norm": 0.6611685752868652,
"kl": 0.11387888877652586,
"learning_rate": 4.2857142857142856e-05,
"loss": 0.0262,
"step": 16,
"step_time": 7.110903799999505
},
{
"clip_ratio/high_max": 0.0052083334885537624,
"clip_ratio/high_mean": 0.0013020833721384406,
"clip_ratio/low_mean": 0.0013020833721384406,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0026041667442768812,
"completions/clipped_ratio": 0.0,
"completions/max_length": 826.0,
"completions/max_terminated_length": 826.0,
"completions/mean_length": 760.359375,
"completions/mean_terminated_length": 760.359375,
"completions/min_length": 659.0,
"completions/min_terminated_length": 659.0,
"entropy": 0.19120646081864834,
"epoch": 0.00034,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0170923471450806,
"kl": 0.08101693401113153,
"learning_rate": 4.5714285714285716e-05,
"loss": -0.015,
"num_tokens": 931841.0,
"reward": -1.6879972219467163,
"reward_std": 9.023077011108398,
"rewards/rollout_reward_func/mean": -1.6879971027374268,
"rewards/rollout_reward_func/std": 10.298378944396973,
"sampling/importance_sampling_ratio/max": 2.430154800415039,
"sampling/importance_sampling_ratio/mean": 1.065093755722046,
"sampling/importance_sampling_ratio/min": 0.6535128951072693,
"sampling/sampling_logp_difference/max": 0.7661471366882324,
"sampling/sampling_logp_difference/mean": 0.024486079812049866,
"step": 17,
"step_time": 27.987481355
},
{
"clip_ratio/high_max": 0.043560607358813286,
"clip_ratio/high_mean": 0.016335227992385626,
"clip_ratio/low_mean": 0.01846590987406671,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03480113763362169,
"entropy": 0.19553834106773138,
"epoch": 0.00036,
"grad_norm": 0.5111234784126282,
"kl": 0.088710677344352,
"learning_rate": 4.8571428571428576e-05,
"loss": -0.0206,
"step": 18,
"step_time": 7.182192339999801
},
{
"clip_ratio/high_max": 0.010416666977107525,
"clip_ratio/high_mean": 0.0026041667442768812,
"clip_ratio/low_mean": 0.0013020833721384406,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003906250116415322,
"completions/clipped_ratio": 0.0,
"completions/max_length": 816.0,
"completions/max_terminated_length": 816.0,
"completions/mean_length": 733.640625,
"completions/mean_terminated_length": 733.640625,
"completions/min_length": 296.0,
"completions/min_terminated_length": 296.0,
"entropy": 0.1935133864171803,
"epoch": 0.00038,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7022229433059692,
"kl": 0.1404350029770285,
"learning_rate": 5.142857142857143e-05,
"loss": -0.0003,
"num_tokens": 1033723.0,
"reward": -1.2022110223770142,
"reward_std": 10.956363677978516,
"rewards/rollout_reward_func/mean": -1.2022109031677246,
"rewards/rollout_reward_func/std": 12.292625427246094,
"sampling/importance_sampling_ratio/max": 1.6157236099243164,
"sampling/importance_sampling_ratio/mean": 0.9594892263412476,
"sampling/importance_sampling_ratio/min": 0.3754613697528839,
"sampling/sampling_logp_difference/max": 0.9176025390625,
"sampling/sampling_logp_difference/mean": 0.028035998344421387,
"step": 19,
"step_time": 27.688288005999993
},
{
"clip_ratio/high_max": 0.04876894084736705,
"clip_ratio/high_mean": 0.012192235211841762,
"clip_ratio/low_mean": 0.018584280740469694,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0307765161851421,
"entropy": 0.20130611211061478,
"epoch": 0.0004,
"grad_norm": 0.4695027768611908,
"kl": 0.18750765593722463,
"learning_rate": 5.428571428571428e-05,
"loss": -0.0054,
"step": 20,
"step_time": 7.739605327000618
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0014204545877873898,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0014204545877873898,
"completions/clipped_ratio": 0.0,
"completions/max_length": 823.0,
"completions/max_terminated_length": 823.0,
"completions/mean_length": 737.9375,
"completions/mean_terminated_length": 737.9375,
"completions/min_length": 618.0,
"completions/min_terminated_length": 618.0,
"entropy": 0.18132759165018797,
"epoch": 0.00042,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1652212142944336,
"kl": 0.13582510640844703,
"learning_rate": 5.714285714285714e-05,
"loss": 0.0262,
"num_tokens": 1135968.0,
"reward": -0.28913062810897827,
"reward_std": 7.3008809089660645,
"rewards/rollout_reward_func/mean": -0.28913065791130066,
"rewards/rollout_reward_func/std": 7.988962650299072,
"sampling/importance_sampling_ratio/max": 2.336996555328369,
"sampling/importance_sampling_ratio/mean": 1.0362560749053955,
"sampling/importance_sampling_ratio/min": 0.6398296356201172,
"sampling/sampling_logp_difference/max": 0.6417920589447021,
"sampling/sampling_logp_difference/mean": 0.022837379947304726,
"step": 21,
"step_time": 28.57662482000046
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.006510416860692203,
"clip_ratio/low_mean": 0.025386679684743285,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03189709666185081,
"entropy": 0.1763849752023816,
"epoch": 0.00044,
"grad_norm": 0.3849461078643799,
"kl": 0.16632835287600756,
"learning_rate": 6e-05,
"loss": 0.0212,
"step": 22,
"step_time": 8.287740409000207
},
{
"clip_ratio/high_max": 0.010416666977107525,
"clip_ratio/high_mean": 0.0026041667442768812,
"clip_ratio/low_mean": 0.0014204545877873898,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004024621332064271,
"completions/clipped_ratio": 0.0,
"completions/max_length": 826.0,
"completions/max_terminated_length": 826.0,
"completions/mean_length": 723.875,
"completions/mean_terminated_length": 723.875,
"completions/min_length": 305.0,
"completions/min_terminated_length": 305.0,
"entropy": 0.1840990763157606,
"epoch": 0.00046,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.860249936580658,
"kl": 0.25097968662157655,
"learning_rate": 6.285714285714286e-05,
"loss": 0.0286,
"num_tokens": 1237057.0,
"reward": 0.4839830696582794,
"reward_std": 10.420938491821289,
"rewards/rollout_reward_func/mean": 0.4839830994606018,
"rewards/rollout_reward_func/std": 11.429144859313965,
"sampling/importance_sampling_ratio/max": 2.106267213821411,
"sampling/importance_sampling_ratio/mean": 1.0313048362731934,
"sampling/importance_sampling_ratio/min": 0.574251651763916,
"sampling/sampling_logp_difference/max": 0.8508915901184082,
"sampling/sampling_logp_difference/mean": 0.02066868171095848,
"step": 23,
"step_time": 28.494462327999827
},
{
"clip_ratio/high_max": 0.06818181974813342,
"clip_ratio/high_mean": 0.021070076152682304,
"clip_ratio/low_mean": 0.018347538076341152,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03941761387977749,
"entropy": 0.19043638091534376,
"epoch": 0.00048,
"grad_norm": 0.6448091864585876,
"kl": 0.35418248968198895,
"learning_rate": 6.571428571428571e-05,
"loss": 0.0215,
"step": 24,
"step_time": 7.416647947999536
},
{
"clip_ratio/high_max": 0.0052083334885537624,
"clip_ratio/high_mean": 0.0013020833721384406,
"clip_ratio/low_mean": 0.004024621332064271,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005326704704202712,
"completions/clipped_ratio": 0.0,
"completions/max_length": 829.0,
"completions/max_terminated_length": 829.0,
"completions/mean_length": 731.0625,
"completions/mean_terminated_length": 731.0625,
"completions/min_length": 615.0,
"completions/min_terminated_length": 615.0,
"entropy": 0.1908296812325716,
"epoch": 0.0005,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.66495680809021,
"kl": 0.21043909061700106,
"learning_rate": 6.857142857142858e-05,
"loss": -0.0275,
"num_tokens": 1337760.0,
"reward": 0.9224299788475037,
"reward_std": 10.655890464782715,
"rewards/rollout_reward_func/mean": 0.9224300384521484,
"rewards/rollout_reward_func/std": 12.821269989013672,
"sampling/importance_sampling_ratio/max": 1.5019664764404297,
"sampling/importance_sampling_ratio/mean": 1.0262192487716675,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.9519531726837158,
"sampling/sampling_logp_difference/mean": 0.018259627744555473,
"step": 25,
"step_time": 29.797745564000707
},
{
"clip_ratio/high_max": 0.05823863809928298,
"clip_ratio/high_mean": 0.017163826269097626,
"clip_ratio/low_mean": 0.024147727992385626,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.041311554377898574,
"entropy": 0.1913931304588914,
"epoch": 0.00052,
"grad_norm": 0.5575593709945679,
"kl": 0.26408666698262095,
"learning_rate": 7.142857142857143e-05,
"loss": -0.0322,
"step": 26,
"step_time": 7.109563219000847
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 827.0,
"completions/max_terminated_length": 827.0,
"completions/mean_length": 731.640625,
"completions/mean_terminated_length": 731.640625,
"completions/min_length": 185.0,
"completions/min_terminated_length": 185.0,
"entropy": 0.19422233663499355,
"epoch": 0.00054,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6210283637046814,
"kl": 0.21635392913594842,
"learning_rate": 7.428571428571429e-05,
"loss": -0.0185,
"num_tokens": 1439214.0,
"reward": 0.326141357421875,
"reward_std": 13.388666152954102,
"rewards/rollout_reward_func/mean": 0.32614123821258545,
"rewards/rollout_reward_func/std": 14.97364616394043,
"sampling/importance_sampling_ratio/max": 1.5914506912231445,
"sampling/importance_sampling_ratio/mean": 1.0221253633499146,
"sampling/importance_sampling_ratio/min": 0.7667937874794006,
"sampling/sampling_logp_difference/max": 0.37548696994781494,
"sampling/sampling_logp_difference/mean": 0.012905368581414223,
"step": 27,
"step_time": 28.513997486000562
},
{
"clip_ratio/high_max": 0.05255681974813342,
"clip_ratio/high_mean": 0.01574337179772556,
"clip_ratio/low_mean": 0.01661931863054633,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.032362690777517855,
"entropy": 0.1939925504848361,
"epoch": 0.00056,
"grad_norm": 0.2964678406715393,
"kl": 0.22840850101783872,
"learning_rate": 7.714285714285715e-05,
"loss": -0.0252,
"step": 28,
"step_time": 8.46359607699992
},
{
"clip_ratio/high_max": 0.010890151839703321,
"clip_ratio/high_mean": 0.0027225379599258304,
"clip_ratio/low_mean": 0.0013020833721384406,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004024621448479593,
"completions/clipped_ratio": 0.0,
"completions/max_length": 834.0,
"completions/max_terminated_length": 834.0,
"completions/mean_length": 714.359375,
"completions/mean_terminated_length": 714.359375,
"completions/min_length": 503.0,
"completions/min_terminated_length": 503.0,
"entropy": 0.1717732958495617,
"epoch": 0.00058,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3590966761112213,
"kl": 0.24717363435775042,
"learning_rate": 8e-05,
"loss": 0.0036,
"num_tokens": 1539055.0,
"reward": 1.930895447731018,
"reward_std": 8.148633003234863,
"rewards/rollout_reward_func/mean": 1.930895447731018,
"rewards/rollout_reward_func/std": 9.020356178283691,
"sampling/importance_sampling_ratio/max": 1.6024476289749146,
"sampling/importance_sampling_ratio/mean": 1.0161041021347046,
"sampling/importance_sampling_ratio/min": 0.7807760238647461,
"sampling/sampling_logp_difference/max": 0.35602256655693054,
"sampling/sampling_logp_difference/mean": 0.011149970814585686,
"step": 29,
"step_time": 28.064759372000253
},
{
"clip_ratio/high_max": 0.027083334047347307,
"clip_ratio/high_mean": 0.006770833511836827,
"clip_ratio/low_mean": 0.029711175127886236,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03648200852330774,
"entropy": 0.16414203867316246,
"epoch": 0.0006,
"grad_norm": 0.38951048254966736,
"kl": 0.28005583630874753,
"learning_rate": 8.285714285714287e-05,
"loss": 0.0013,
"step": 30,
"step_time": 7.401456857000312
},
{
"clip_ratio/high_max": 0.0052083334885537624,
"clip_ratio/high_mean": 0.0013020833721384406,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0013020833721384406,
"completions/clipped_ratio": 0.0,
"completions/max_length": 809.0,
"completions/max_terminated_length": 809.0,
"completions/mean_length": 708.046875,
"completions/mean_terminated_length": 708.046875,
"completions/min_length": 475.0,
"completions/min_terminated_length": 475.0,
"entropy": 0.16439654119312763,
"epoch": 0.00062,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5445168614387512,
"kl": 0.2800124539062381,
"learning_rate": 8.571428571428571e-05,
"loss": 0.0097,
"num_tokens": 1638113.0,
"reward": 0.29781579971313477,
"reward_std": 10.009416580200195,
"rewards/rollout_reward_func/mean": 0.29781582951545715,
"rewards/rollout_reward_func/std": 11.176705360412598,
"sampling/importance_sampling_ratio/max": 1.755067229270935,
"sampling/importance_sampling_ratio/mean": 1.0180511474609375,
"sampling/importance_sampling_ratio/min": 0.580125629901886,
"sampling/sampling_logp_difference/max": 0.5197739601135254,
"sampling/sampling_logp_difference/mean": 0.013791397213935852,
"step": 31,
"step_time": 30.773730244999797
},
{
"clip_ratio/high_max": 0.03645833395421505,
"clip_ratio/high_mean": 0.013139204704202712,
"clip_ratio/low_mean": 0.03042140242177993,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04356060677673668,
"entropy": 0.15187342395074666,
"epoch": 0.00064,
"grad_norm": 0.30164626240730286,
"kl": 0.32055927254259586,
"learning_rate": 8.857142857142857e-05,
"loss": 0.0037,
"step": 32,
"step_time": 7.328695028999618
},
{
"clip_ratio/high_max": 0.0052083334885537624,
"clip_ratio/high_mean": 0.0013020833721384406,
"clip_ratio/low_mean": 0.0027225379599258304,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004024621332064271,
"completions/clipped_ratio": 0.0,
"completions/max_length": 835.0,
"completions/max_terminated_length": 835.0,
"completions/mean_length": 701.5,
"completions/mean_terminated_length": 701.5,
"completions/min_length": 502.0,
"completions/min_terminated_length": 502.0,
"entropy": 0.1332990936934948,
"epoch": 0.00066,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.43104609847068787,
"kl": 0.32164820563048124,
"learning_rate": 9.142857142857143e-05,
"loss": 0.0025,
"num_tokens": 1738075.0,
"reward": 3.1038765907287598,
"reward_std": 11.951395988464355,
"rewards/rollout_reward_func/mean": 3.1038765907287598,
"rewards/rollout_reward_func/std": 12.847871780395508,
"sampling/importance_sampling_ratio/max": 1.3508435487747192,
"sampling/importance_sampling_ratio/mean": 0.9952214360237122,
"sampling/importance_sampling_ratio/min": 0.6407750844955444,
"sampling/sampling_logp_difference/max": 0.47523796558380127,
"sampling/sampling_logp_difference/mean": 0.013571259565651417,
"step": 33,
"step_time": 27.829260915000077
},
{
"clip_ratio/high_max": 0.03219697065651417,
"clip_ratio/high_mean": 0.010653409408405423,
"clip_ratio/low_mean": 0.029000947601161897,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03965435700956732,
"entropy": 0.12380115175619721,
"epoch": 0.00068,
"grad_norm": 0.27367016673088074,
"kl": 0.423783166334033,
"learning_rate": 9.428571428571429e-05,
"loss": -0.0,
"step": 34,
"step_time": 7.799126809999507
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 819.0,
"completions/max_terminated_length": 819.0,
"completions/mean_length": 687.4375,
"completions/mean_terminated_length": 687.4375,
"completions/min_length": 618.0,
"completions/min_terminated_length": 618.0,
"entropy": 0.10798696288838983,
"epoch": 0.0007,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4482150673866272,
"kl": 0.3214763030409813,
"learning_rate": 9.714285714285715e-05,
"loss": 0.028,
"num_tokens": 1836043.0,
"reward": 3.037400960922241,
"reward_std": 12.985002517700195,
"rewards/rollout_reward_func/mean": 3.037400960922241,
"rewards/rollout_reward_func/std": 13.425616264343262,
"sampling/importance_sampling_ratio/max": 1.4862518310546875,
"sampling/importance_sampling_ratio/mean": 1.0146703720092773,
"sampling/importance_sampling_ratio/min": 0.5140225291252136,
"sampling/sampling_logp_difference/max": 0.8002816438674927,
"sampling/sampling_logp_difference/mean": 0.01363956555724144,
"step": 35,
"step_time": 28.662696071000028
},
{
"clip_ratio/high_max": 0.042140152771025896,
"clip_ratio/high_mean": 0.010535038192756474,
"clip_ratio/low_mean": 0.014441288309171796,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.024976326269097626,
"entropy": 0.1118780323304236,
"epoch": 0.00072,
"grad_norm": 0.1983855962753296,
"kl": 0.373223016038537,
"learning_rate": 0.0001,
"loss": 0.0232,
"step": 36,
"step_time": 8.269840026000338
},
{
"clip_ratio/high_max": 0.010416666977107525,
"clip_ratio/high_mean": 0.0026041667442768812,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0026041667442768812,
"completions/clipped_ratio": 0.0,
"completions/max_length": 811.0,
"completions/max_terminated_length": 811.0,
"completions/mean_length": 679.3125,
"completions/mean_terminated_length": 679.3125,
"completions/min_length": 449.0,
"completions/min_terminated_length": 449.0,
"entropy": 0.12342227855697274,
"epoch": 0.00074,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6195780634880066,
"kl": 0.45714515913277864,
"learning_rate": 9.999999998148153e-05,
"loss": -0.0249,
"num_tokens": 1932947.0,
"reward": 3.72019362449646,
"reward_std": 11.354637145996094,
"rewards/rollout_reward_func/mean": 3.720193862915039,
"rewards/rollout_reward_func/std": 11.66490650177002,
"sampling/importance_sampling_ratio/max": 2.1260557174682617,
"sampling/importance_sampling_ratio/mean": 1.049971580505371,
"sampling/importance_sampling_ratio/min": 0.6164436340332031,
"sampling/sampling_logp_difference/max": 0.5450749397277832,
"sampling/sampling_logp_difference/mean": 0.01501537300646305,
"step": 37,
"step_time": 27.480367904999866
},
{
"clip_ratio/high_max": 0.05303030414506793,
"clip_ratio/high_mean": 0.014678030624054372,
"clip_ratio/low_mean": 0.014322917209938169,
"clip_ratio/low_min": 0.0052083334885537624,
"clip_ratio/region_mean": 0.029000947950407863,
"entropy": 0.13006606698036194,
"epoch": 0.00076,
"grad_norm": 0.2681926488876343,
"kl": 0.4847450293600559,
"learning_rate": 9.999999992592612e-05,
"loss": -0.0318,
"step": 38,
"step_time": 7.225284665000345
},
{
"clip_ratio/high_max": 0.010416666977107525,
"clip_ratio/high_mean": 0.0026041667442768812,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0026041667442768812,
"completions/clipped_ratio": 0.0,
"completions/max_length": 830.0,
"completions/max_terminated_length": 830.0,
"completions/mean_length": 700.09375,
"completions/mean_terminated_length": 700.09375,
"completions/min_length": 300.0,
"completions/min_terminated_length": 300.0,
"entropy": 0.15452369069680572,
"epoch": 0.00078,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4834868311882019,
"kl": 0.4672291334718466,
"learning_rate": 9.999999983333379e-05,
"loss": -0.0162,
"num_tokens": 2032280.0,
"reward": 5.62964391708374,
"reward_std": 9.88559341430664,
"rewards/rollout_reward_func/mean": 5.629644393920898,
"rewards/rollout_reward_func/std": 12.693258285522461,
"sampling/importance_sampling_ratio/max": 1.5066994428634644,
"sampling/importance_sampling_ratio/mean": 1.0094711780548096,
"sampling/importance_sampling_ratio/min": 0.6512829065322876,
"sampling/sampling_logp_difference/max": 0.4918508529663086,
"sampling/sampling_logp_difference/mean": 0.01460680365562439,
"step": 39,
"step_time": 30.803230847000123
},
{
"clip_ratio/high_max": 0.05823863809928298,
"clip_ratio/high_mean": 0.01976799312978983,
"clip_ratio/low_mean": 0.02734375069849193,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04711174394469708,
"entropy": 0.14933442790061235,
"epoch": 0.0008,
"grad_norm": 0.34873443841934204,
"kl": 0.5781354140490294,
"learning_rate": 9.99999997037045e-05,
"loss": -0.0203,
"step": 40,
"step_time": 7.3111222899999575
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0028409091755747795,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0028409091755747795,
"completions/clipped_ratio": 0.0,
"completions/max_length": 829.0,
"completions/max_terminated_length": 829.0,
"completions/mean_length": 686.59375,
"completions/mean_terminated_length": 686.59375,
"completions/min_length": 615.0,
"completions/min_terminated_length": 615.0,
"entropy": 0.15176831698045135,
"epoch": 0.00082,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4504550099372864,
"kl": 0.6600655419752002,
"learning_rate": 9.999999953703829e-05,
"loss": -0.0185,
"num_tokens": 2130497.0,
"reward": 2.0073609352111816,
"reward_std": 8.8825044631958,
"rewards/rollout_reward_func/mean": 2.0073609352111816,
"rewards/rollout_reward_func/std": 9.321340560913086,
"sampling/importance_sampling_ratio/max": 1.5246989727020264,
"sampling/importance_sampling_ratio/mean": 1.0359078645706177,
"sampling/importance_sampling_ratio/min": 0.3844473361968994,
"sampling/sampling_logp_difference/max": 0.955810546875,
"sampling/sampling_logp_difference/mean": 0.012838078662753105,
"step": 41,
"step_time": 28.587795755999878
},
{
"clip_ratio/high_max": 0.03172348579391837,
"clip_ratio/high_mean": 0.009232954820618033,
"clip_ratio/low_mean": 0.022608901956118643,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03184185642749071,
"entropy": 0.14899979438632727,
"epoch": 0.00084,
"grad_norm": 2.304894208908081,
"kl": 1.5326191950589418,
"learning_rate": 9.999999933333512e-05,
"loss": -0.0201,
"step": 42,
"step_time": 8.04831712999976
},
{
"clip_ratio/high_max": 0.005681818351149559,
"clip_ratio/high_mean": 0.0014204545877873898,
"clip_ratio/low_mean": 0.0026041667442768812,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004024621332064271,
"completions/clipped_ratio": 0.0,
"completions/max_length": 825.0,
"completions/max_terminated_length": 825.0,
"completions/mean_length": 685.40625,
"completions/mean_terminated_length": 685.40625,
"completions/min_length": 389.0,
"completions/min_terminated_length": 389.0,
"entropy": 0.1393027831800282,
"epoch": 0.00086,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5649179816246033,
"kl": 0.7229090742766857,
"learning_rate": 9.999999909259503e-05,
"loss": -0.017,
"num_tokens": 2228288.0,
"reward": 1.6912901401519775,
"reward_std": 10.596427917480469,
"rewards/rollout_reward_func/mean": 1.691290020942688,
"rewards/rollout_reward_func/std": 12.0145263671875,
"sampling/importance_sampling_ratio/max": 1.3425889015197754,
"sampling/importance_sampling_ratio/mean": 0.9553788304328918,
"sampling/importance_sampling_ratio/min": 0.5974801778793335,
"sampling/sampling_logp_difference/max": 0.34511590003967285,
"sampling/sampling_logp_difference/mean": 0.01251951139420271,
"step": 43,
"step_time": 27.475775691999615
},
{
"clip_ratio/high_max": 0.04829545598477125,
"clip_ratio/high_mean": 0.015980114112608135,
"clip_ratio/low_mean": 0.024053030996583402,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.040033145574852824,
"entropy": 0.14617095375433564,
"epoch": 0.00088,
"grad_norm": 0.3445337116718292,
"kl": 0.5654929745942354,
"learning_rate": 9.9999998814818e-05,
"loss": -0.023,
"step": 44,
"step_time": 7.598477493999553
},
{
"clip_ratio/high_max": 0.0052083334885537624,
"clip_ratio/high_mean": 0.0013020833721384406,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0013020833721384406,
"completions/clipped_ratio": 0.0,
"completions/max_length": 831.0,
"completions/max_terminated_length": 831.0,
"completions/mean_length": 713.671875,
"completions/mean_terminated_length": 713.671875,
"completions/min_length": 656.0,
"completions/min_terminated_length": 656.0,
"entropy": 0.14390681218355894,
"epoch": 0.0009,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.45263612270355225,
"kl": 0.6904484182596207,
"learning_rate": 9.999999850000404e-05,
"loss": -0.005,
"num_tokens": 2328132.0,
"reward": 2.4324169158935547,
"reward_std": 13.961143493652344,
"rewards/rollout_reward_func/mean": 2.4324169158935547,
"rewards/rollout_reward_func/std": 14.438629150390625,
"sampling/importance_sampling_ratio/max": 1.3720179796218872,
"sampling/importance_sampling_ratio/mean": 1.00229012966156,
"sampling/importance_sampling_ratio/min": 0.6608520746231079,
"sampling/sampling_logp_difference/max": 0.301973819732666,
"sampling/sampling_logp_difference/mean": 0.010271631181240082,
"step": 45,
"step_time": 28.995988180999802
},
{
"clip_ratio/high_max": 0.026041667442768812,
"clip_ratio/high_mean": 0.006510416860692203,
"clip_ratio/low_mean": 0.02043087175115943,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.026941288728266954,
"entropy": 0.1375666274689138,
"epoch": 0.00092,
"grad_norm": 0.3008887469768524,
"kl": 0.6632084101438522,
"learning_rate": 9.999999814815312e-05,
"loss": -0.0106,
"step": 46,
"step_time": 7.42895066499932
},
{
"clip_ratio/high_max": 0.010416666977107525,
"clip_ratio/high_mean": 0.0026041667442768812,
"clip_ratio/low_mean": 0.0027225379599258304,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005326704704202712,
"completions/clipped_ratio": 0.0,
"completions/max_length": 823.0,
"completions/max_terminated_length": 823.0,
"completions/mean_length": 698.640625,
"completions/mean_terminated_length": 698.640625,
"completions/min_length": 393.0,
"completions/min_terminated_length": 393.0,
"entropy": 0.14624580927193165,
"epoch": 0.00094,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.36133161187171936,
"kl": 0.5184649843722582,
"learning_rate": 9.99999977592653e-05,
"loss": -0.0129,
"num_tokens": 2426521.0,
"reward": 1.375571846961975,
"reward_std": 11.66879940032959,
"rewards/rollout_reward_func/mean": 1.3755717277526855,
"rewards/rollout_reward_func/std": 11.796045303344727,
"sampling/importance_sampling_ratio/max": 1.8656487464904785,
"sampling/importance_sampling_ratio/mean": 1.0228910446166992,
"sampling/importance_sampling_ratio/min": 0.505867063999176,
"sampling/sampling_logp_difference/max": 0.6223084926605225,
"sampling/sampling_logp_difference/mean": 0.011709067039191723,
"step": 47,
"step_time": 29.763493531999984
},
{
"clip_ratio/high_max": 0.03172348579391837,
"clip_ratio/high_mean": 0.007930871448479593,
"clip_ratio/low_mean": 0.02568655402865261,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03361742536071688,
"entropy": 0.14968854701146483,
"epoch": 0.00096,
"grad_norm": 0.17635680735111237,
"kl": 0.5038973540067673,
"learning_rate": 9.999999733334051e-05,
"loss": -0.0167,
"step": 48,
"step_time": 7.652514348999603
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0027225379599258304,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0027225379599258304,
"completions/clipped_ratio": 0.0,
"completions/max_length": 818.0,
"completions/max_terminated_length": 818.0,
"completions/mean_length": 704.453125,
"completions/mean_terminated_length": 704.453125,
"completions/min_length": 635.0,
"completions/min_terminated_length": 635.0,
"entropy": 0.14840606460347772,
"epoch": 0.00098,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5855311751365662,
"kl": 0.5907826572656631,
"learning_rate": 9.99999968703788e-05,
"loss": 0.0381,
"num_tokens": 2526069.0,
"reward": 4.523091793060303,
"reward_std": 11.536006927490234,
"rewards/rollout_reward_func/mean": 4.523091793060303,
"rewards/rollout_reward_func/std": 12.290811538696289,
"sampling/importance_sampling_ratio/max": 2.122157573699951,
"sampling/importance_sampling_ratio/mean": 1.0083321332931519,
"sampling/importance_sampling_ratio/min": 0.6556381583213806,
"sampling/sampling_logp_difference/max": 0.5623667240142822,
"sampling/sampling_logp_difference/mean": 0.012646196410059929,
"step": 49,
"step_time": 27.48595007899985
},
{
"clip_ratio/high_max": 0.05445075919851661,
"clip_ratio/high_mean": 0.017518940148875117,
"clip_ratio/low_mean": 0.03401988744735718,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.051538827014155686,
"entropy": 0.14183657616376877,
"epoch": 0.001,
"grad_norm": 0.3584051728248596,
"kl": 0.5096510350704193,
"learning_rate": 9.999999637038015e-05,
"loss": 0.0365,
"step": 50,
"step_time": 9.165422230000104
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0027225379599258304,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0027225379599258304,
"completions/clipped_ratio": 0.0,
"completions/max_length": 818.0,
"completions/max_terminated_length": 818.0,
"completions/mean_length": 692.375,
"completions/mean_terminated_length": 692.375,
"completions/min_length": 290.0,
"completions/min_terminated_length": 290.0,
"entropy": 0.14247119799256325,
"epoch": 0.00102,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.49646589159965515,
"kl": 0.4570716666057706,
"learning_rate": 9.999999583334457e-05,
"loss": -0.0101,
"num_tokens": 2623145.0,
"reward": 4.133634567260742,
"reward_std": 10.326797485351562,
"rewards/rollout_reward_func/mean": 4.133634567260742,
"rewards/rollout_reward_func/std": 10.82159423828125,
"sampling/importance_sampling_ratio/max": 1.6070019006729126,
"sampling/importance_sampling_ratio/mean": 0.996033787727356,
"sampling/importance_sampling_ratio/min": 0.5886021852493286,
"sampling/sampling_logp_difference/max": 0.543494701385498,
"sampling/sampling_logp_difference/mean": 0.010751021094620228,
"step": 51,
"step_time": 28.26161867099927
},
{
"clip_ratio/high_max": 0.04829545598477125,
"clip_ratio/high_mean": 0.013375947251915932,
"clip_ratio/low_mean": 0.02781723579391837,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0411931830458343,
"entropy": 0.13112169969826937,
"epoch": 0.00104,
"grad_norm": 0.34045207500457764,
"kl": 0.5393304694443941,
"learning_rate": 9.999999525927207e-05,
"loss": -0.016,
"step": 52,
"step_time": 6.901260032999289
},
{
"clip_ratio/high_max": 0.0052083334885537624,
"clip_ratio/high_mean": 0.0013020833721384406,
"clip_ratio/low_mean": 0.0013020833721384406,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0026041667442768812,
"completions/clipped_ratio": 0.0,
"completions/max_length": 825.0,
"completions/max_terminated_length": 825.0,
"completions/mean_length": 691.875,
"completions/mean_terminated_length": 691.875,
"completions/min_length": 462.0,
"completions/min_terminated_length": 462.0,
"entropy": 0.11744047561660409,
"epoch": 0.00106,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3921552300453186,
"kl": 0.42071591690182686,
"learning_rate": 9.999999464816261e-05,
"loss": 0.0037,
"num_tokens": 2721107.0,
"reward": 4.605119705200195,
"reward_std": 12.441184997558594,
"rewards/rollout_reward_func/mean": 4.605119228363037,
"rewards/rollout_reward_func/std": 14.067066192626953,
"sampling/importance_sampling_ratio/max": 1.3290151357650757,
"sampling/importance_sampling_ratio/mean": 0.9739052057266235,
"sampling/importance_sampling_ratio/min": 0.38011765480041504,
"sampling/sampling_logp_difference/max": 0.929356575012207,
"sampling/sampling_logp_difference/mean": 0.010732135735452175,
"step": 53,
"step_time": 30.069013398000834
},
{
"clip_ratio/high_max": 0.02651515230536461,
"clip_ratio/high_mean": 0.006628788076341152,
"clip_ratio/low_mean": 0.022904830053448677,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.029533618362620473,
"entropy": 0.10777218686416745,
"epoch": 0.00108,
"grad_norm": 0.23905742168426514,
"kl": 0.5194222312420607,
"learning_rate": 9.999999400001624e-05,
"loss": 0.002,
"step": 54,
"step_time": 7.081706939000014
},
{
"clip_ratio/high_max": 0.0052083334885537624,
"clip_ratio/high_mean": 0.0013020833721384406,
"clip_ratio/low_mean": 0.0014204545877873898,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0027225379599258304,
"completions/clipped_ratio": 0.0,
"completions/max_length": 824.0,
"completions/max_terminated_length": 824.0,
"completions/mean_length": 702.796875,
"completions/mean_terminated_length": 702.796875,
"completions/min_length": 614.0,
"completions/min_terminated_length": 614.0,
"entropy": 0.11523706745356321,
"epoch": 0.0011,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.500625491142273,
"kl": 0.5581346470862627,
"learning_rate": 9.999999331483292e-05,
"loss": -0.0203,
"num_tokens": 2818643.0,
"reward": 3.496170997619629,
"reward_std": 14.47857666015625,
"rewards/rollout_reward_func/mean": 3.496170997619629,
"rewards/rollout_reward_func/std": 14.920737266540527,
"sampling/importance_sampling_ratio/max": 1.5530641078948975,
"sampling/importance_sampling_ratio/mean": 1.0201001167297363,
"sampling/importance_sampling_ratio/min": 0.5336768627166748,
"sampling/sampling_logp_difference/max": 0.6660118103027344,
"sampling/sampling_logp_difference/mean": 0.013495232909917831,
"step": 55,
"step_time": 28.81458637300034
},
{
"clip_ratio/high_max": 0.036931819282472134,
"clip_ratio/high_mean": 0.011837121681310236,
"clip_ratio/low_mean": 0.02758049312978983,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03941761504393071,
"entropy": 0.10888301394879818,
"epoch": 0.00112,
"grad_norm": 0.29490140080451965,
"kl": 0.5603756010532379,
"learning_rate": 9.999999259261268e-05,
"loss": -0.0253,
"step": 56,
"step_time": 8.193311973000164
},
{
"clip_ratio/high_max": 0.010416666977107525,
"clip_ratio/high_mean": 0.0026041667442768812,
"clip_ratio/low_mean": 0.0026041667442768812,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0052083334885537624,
"completions/clipped_ratio": 0.0,
"completions/max_length": 832.0,
"completions/max_terminated_length": 832.0,
"completions/mean_length": 693.90625,
"completions/mean_terminated_length": 693.90625,
"completions/min_length": 494.0,
"completions/min_terminated_length": 494.0,
"entropy": 0.12224696017801762,
"epoch": 0.00114,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4371468722820282,
"kl": 0.5304271820932627,
"learning_rate": 9.99999918333555e-05,
"loss": 0.0189,
"num_tokens": 2916279.0,
"reward": 3.36903715133667,
"reward_std": 12.011173248291016,
"rewards/rollout_reward_func/mean": 3.369036912918091,
"rewards/rollout_reward_func/std": 12.399989128112793,
"sampling/importance_sampling_ratio/max": 1.8561766147613525,
"sampling/importance_sampling_ratio/mean": 1.0033948421478271,
"sampling/importance_sampling_ratio/min": 0.3815801441669464,
"sampling/sampling_logp_difference/max": 0.957763671875,
"sampling/sampling_logp_difference/mean": 0.011768012307584286,
"step": 57,
"step_time": 28.027659202000223
},
{
"clip_ratio/high_max": 0.05255681974813342,
"clip_ratio/high_mean": 0.015743371564894915,
"clip_ratio/low_mean": 0.019767993013374507,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0355113644618541,
"entropy": 0.12679382599890232,
"epoch": 0.00116,
"grad_norm": 0.3022422790527344,
"kl": 0.5225307196378708,
"learning_rate": 9.999999103706142e-05,
"loss": 0.015,
"step": 58,
"step_time": 8.72335070799977
},
{
"clip_ratio/high_max": 0.0052083334885537624,
"clip_ratio/high_mean": 0.0013020833721384406,
"clip_ratio/low_mean": 0.0026041667442768812,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003906250116415322,
"completions/clipped_ratio": 0.0,
"completions/max_length": 844.0,
"completions/max_terminated_length": 844.0,
"completions/mean_length": 681.265625,
"completions/mean_terminated_length": 681.265625,
"completions/min_length": 373.0,
"completions/min_terminated_length": 373.0,
"entropy": 0.12399047752842307,
"epoch": 0.00118,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6583297848701477,
"kl": 0.5364211667329073,
"learning_rate": 9.999999020373037e-05,
"loss": 0.0117,
"num_tokens": 3012934.0,
"reward": 2.8170366287231445,
"reward_std": 12.926514625549316,
"rewards/rollout_reward_func/mean": 2.8170366287231445,
"rewards/rollout_reward_func/std": 13.227665901184082,
"sampling/importance_sampling_ratio/max": 2.4036612510681152,
"sampling/importance_sampling_ratio/mean": 0.9975829720497131,
"sampling/importance_sampling_ratio/min": 0.6259334683418274,
"sampling/sampling_logp_difference/max": 0.720775842666626,
"sampling/sampling_logp_difference/mean": 0.010457618162035942,
"step": 59,
"step_time": 28.9596313010004
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.011718750349245965,
"clip_ratio/low_mean": 0.03385416732635349,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04557291779201478,
"entropy": 0.11637644609436393,
"epoch": 0.0012,
"grad_norm": 1.9307663440704346,
"kl": 1.8184253200888634,
"learning_rate": 9.999998933336241e-05,
"loss": 0.0213,
"step": 60,
"step_time": 7.307322721999981
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0014204545877873898,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0014204545877873898,
"completions/clipped_ratio": 0.0,
"completions/max_length": 816.0,
"completions/max_terminated_length": 816.0,
"completions/mean_length": 685.78125,
"completions/mean_terminated_length": 685.78125,
"completions/min_length": 194.0,
"completions/min_terminated_length": 194.0,
"entropy": 0.11613691644743085,
"epoch": 0.00122,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.44358986616134644,
"kl": 0.5193471424281597,
"learning_rate": 9.999998842595753e-05,
"loss": -0.0024,
"num_tokens": 3109806.0,
"reward": 4.651793479919434,
"reward_std": 12.063810348510742,
"rewards/rollout_reward_func/mean": 4.651793479919434,
"rewards/rollout_reward_func/std": 12.754688262939453,
"sampling/importance_sampling_ratio/max": 1.6620776653289795,
"sampling/importance_sampling_ratio/mean": 0.9981948137283325,
"sampling/importance_sampling_ratio/min": 0.6313586831092834,
"sampling/sampling_logp_difference/max": 0.4394187927246094,
"sampling/sampling_logp_difference/mean": 0.009169764816761017,
"step": 61,
"step_time": 30.805930039000714
},
{
"clip_ratio/high_max": 0.026988637167960405,
"clip_ratio/high_mean": 0.01065340917557478,
"clip_ratio/low_mean": 0.020951705169864,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03160511434543878,
"entropy": 0.12027787417173386,
"epoch": 0.00124,
"grad_norm": 0.3839333951473236,
"kl": 0.5386558780446649,
"learning_rate": 9.999998748151572e-05,
"loss": -0.0001,
"step": 62,
"step_time": 7.061834261000513
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0013020833721384406,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0013020833721384406,
"completions/clipped_ratio": 0.0,
"completions/max_length": 807.0,
"completions/max_terminated_length": 807.0,
"completions/mean_length": 694.46875,
"completions/mean_terminated_length": 694.46875,
"completions/min_length": 465.0,
"completions/min_terminated_length": 465.0,
"entropy": 0.13281571818515658,
"epoch": 0.00126,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.47250673174858093,
"kl": 0.5621049534529448,
"learning_rate": 9.999998650003696e-05,
"loss": -0.0068,
"num_tokens": 3207160.0,
"reward": 4.072500705718994,
"reward_std": 12.934675216674805,
"rewards/rollout_reward_func/mean": 4.072500705718994,
"rewards/rollout_reward_func/std": 13.5437650680542,
"sampling/importance_sampling_ratio/max": 1.4505815505981445,
"sampling/importance_sampling_ratio/mean": 1.0127054452896118,
"sampling/importance_sampling_ratio/min": 0.644386887550354,
"sampling/sampling_logp_difference/max": 0.46297478675842285,
"sampling/sampling_logp_difference/mean": 0.01112096942961216,
"step": 63,
"step_time": 27.765410665000445
},
{
"clip_ratio/high_max": 0.03645833441987634,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.025236743153072894,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03565340966451913,
"entropy": 0.11304981098510325,
"epoch": 0.00128,
"grad_norm": 0.23461361229419708,
"kl": 0.707372922450304,
"learning_rate": 9.999998548152131e-05,
"loss": -0.0107,
"step": 64,
"step_time": 9.65409977299987
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0013020833721384406,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0013020833721384406,
"completions/clipped_ratio": 0.0,
"completions/max_length": 827.0,
"completions/max_terminated_length": 827.0,
"completions/mean_length": 715.78125,
"completions/mean_terminated_length": 715.78125,
"completions/min_length": 618.0,
"completions/min_terminated_length": 618.0,
"entropy": 0.11744949175044894,
"epoch": 0.0013,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5356135368347168,
"kl": 2.483667228370905,
"learning_rate": 9.999998442596872e-05,
"loss": 0.0155,
"num_tokens": 3305784.0,
"reward": 3.657202959060669,
"reward_std": 10.959955215454102,
"rewards/rollout_reward_func/mean": 3.657203197479248,
"rewards/rollout_reward_func/std": 12.17599105834961,
"sampling/importance_sampling_ratio/max": 1.36454439163208,
"sampling/importance_sampling_ratio/mean": 1.0064573287963867,
"sampling/importance_sampling_ratio/min": 0.6259024739265442,
"sampling/sampling_logp_difference/max": 0.4463231563568115,
"sampling/sampling_logp_difference/mean": 0.008584607392549515,
"step": 65,
"step_time": 29.066453170999694
},
{
"clip_ratio/high_max": 0.031250000931322575,
"clip_ratio/high_mean": 0.010416666860692203,
"clip_ratio/low_mean": 0.02178030402865261,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.032196971122175455,
"entropy": 0.14252985129132867,
"epoch": 0.00132,
"grad_norm": 0.2563531696796417,
"kl": 0.6245546955615282,
"learning_rate": 9.999998333337922e-05,
"loss": -0.0004,
"step": 66,
"step_time": 8.02741467600049
},
{
"clip_ratio/high_max": 0.005681818351149559,
"clip_ratio/high_mean": 0.0014204545877873898,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0014204545877873898,
"completions/clipped_ratio": 0.0,
"completions/max_length": 833.0,
"completions/max_terminated_length": 833.0,
"completions/mean_length": 678.890625,
"completions/mean_terminated_length": 678.890625,
"completions/min_length": 413.0,
"completions/min_terminated_length": 413.0,
"entropy": 0.15250376611948013,
"epoch": 0.00134,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5203387141227722,
"kl": 0.695558762177825,
"learning_rate": 9.999998220375278e-05,
"loss": -0.0145,
"num_tokens": 3401864.0,
"reward": 1.054423451423645,
"reward_std": 11.31953239440918,
"rewards/rollout_reward_func/mean": 1.054423451423645,
"rewards/rollout_reward_func/std": 12.172701835632324,
"sampling/importance_sampling_ratio/max": 1.2194569110870361,
"sampling/importance_sampling_ratio/mean": 0.9876125454902649,
"sampling/importance_sampling_ratio/min": 0.550414502620697,
"sampling/sampling_logp_difference/max": 0.5297477841377258,
"sampling/sampling_logp_difference/mean": 0.008570928126573563,
"step": 67,
"step_time": 30.060426205000795
},
{
"clip_ratio/high_max": 0.05965909268707037,
"clip_ratio/high_mean": 0.018939394736662507,
"clip_ratio/low_mean": 0.03338068269658834,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.05232007708400488,
"entropy": 0.17216729745268822,
"epoch": 0.00136,
"grad_norm": 0.27647653222084045,
"kl": 0.6303851045668125,
"learning_rate": 9.999998103708944e-05,
"loss": -0.0169,
"step": 68,
"step_time": 7.55158718300072
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0015625000232830644,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0015625000232830644,
"completions/clipped_ratio": 0.0,
"completions/max_length": 818.0,
"completions/max_terminated_length": 818.0,
"completions/mean_length": 692.171875,
"completions/mean_terminated_length": 692.171875,
"completions/min_length": 387.0,
"completions/min_terminated_length": 387.0,
"entropy": 0.17608004808425903,
"epoch": 0.00138,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.40714362263679504,
"kl": 0.5837149824947119,
"learning_rate": 9.999997983338918e-05,
"loss": 0.0075,
"num_tokens": 3498494.0,
"reward": 4.154041290283203,
"reward_std": 15.997432708740234,
"rewards/rollout_reward_func/mean": 4.154041290283203,
"rewards/rollout_reward_func/std": 18.081926345825195,
"sampling/importance_sampling_ratio/max": 1.240838885307312,
"sampling/importance_sampling_ratio/mean": 0.9964578747749329,
"sampling/importance_sampling_ratio/min": 0.756720781326294,
"sampling/sampling_logp_difference/max": 0.326712965965271,
"sampling/sampling_logp_difference/mean": 0.009719014167785645,
"step": 69,
"step_time": 28.760111235000295
},
{
"clip_ratio/high_max": 0.04450757708400488,
"clip_ratio/high_mean": 0.013731061248108745,
"clip_ratio/low_mean": 0.016698232851922512,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.030429294100031257,
"entropy": 0.1826375536620617,
"epoch": 0.0014,
"grad_norm": 0.4711000919342041,
"kl": 0.5743975602090359,
"learning_rate": 9.999997859265198e-05,
"loss": 0.0045,
"step": 70,
"step_time": 8.15129353600014
},
{
"clip_ratio/high_max": 0.0052083334885537624,
"clip_ratio/high_mean": 0.0013020833721384406,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0013020833721384406,
"completions/clipped_ratio": 0.0,
"completions/max_length": 835.0,
"completions/max_terminated_length": 835.0,
"completions/mean_length": 695.796875,
"completions/mean_terminated_length": 695.796875,
"completions/min_length": 208.0,
"completions/min_terminated_length": 208.0,
"entropy": 0.20647307951003313,
"epoch": 0.00142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3142479956150055,
"kl": 0.5285101179033518,
"learning_rate": 9.999997731487787e-05,
"loss": -0.0177,
"num_tokens": 3595387.0,
"reward": 2.402292490005493,
"reward_std": 13.013188362121582,
"rewards/rollout_reward_func/mean": 2.402292251586914,
"rewards/rollout_reward_func/std": 13.636407852172852,
"sampling/importance_sampling_ratio/max": 1.3384240865707397,
"sampling/importance_sampling_ratio/mean": 1.011613368988037,
"sampling/importance_sampling_ratio/min": 0.776378870010376,
"sampling/sampling_logp_difference/max": 0.2462749481201172,
"sampling/sampling_logp_difference/mean": 0.009866164065897465,
"step": 71,
"step_time": 28.09002854999926
},
{
"clip_ratio/high_max": 0.043560607358813286,
"clip_ratio/high_mean": 0.013494318816810846,
"clip_ratio/low_mean": 0.012428977759554982,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.025923296343535185,
"entropy": 0.20543431770056486,
"epoch": 0.00144,
"grad_norm": 0.23259234428405762,
"kl": 0.5239685252308846,
"learning_rate": 9.999997600006685e-05,
"loss": -0.0218,
"step": 72,
"step_time": 8.672955195000668
},
{
"clip_ratio/high_max": 0.010416666977107525,
"clip_ratio/high_mean": 0.0026041667442768812,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0026041667442768812,
"completions/clipped_ratio": 0.0,
"completions/max_length": 832.0,
"completions/max_terminated_length": 832.0,
"completions/mean_length": 704.984375,
"completions/mean_terminated_length": 704.984375,
"completions/min_length": 533.0,
"completions/min_terminated_length": 533.0,
"entropy": 0.20190842729061842,
"epoch": 0.00146,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3707256019115448,
"kl": 0.5400361772626638,
"learning_rate": 9.999997464821892e-05,
"loss": 0.006,
"num_tokens": 3692772.0,
"reward": 2.049668073654175,
"reward_std": 15.488001823425293,
"rewards/rollout_reward_func/mean": 2.049668073654175,
"rewards/rollout_reward_func/std": 15.380194664001465,
"sampling/importance_sampling_ratio/max": 1.1559480428695679,
"sampling/importance_sampling_ratio/mean": 0.970598578453064,
"sampling/importance_sampling_ratio/min": 0.6524748802185059,
"sampling/sampling_logp_difference/max": 0.35463929176330566,
"sampling/sampling_logp_difference/mean": 0.009403295814990997,
"step": 73,
"step_time": 28.82742140900018
},
{
"clip_ratio/high_max": 0.06818182021379471,
"clip_ratio/high_mean": 0.018347538425587118,
"clip_ratio/low_mean": 0.02402935700956732,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04237689543515444,
"entropy": 0.2009204039350152,
"epoch": 0.00148,
"grad_norm": 0.2297271341085434,
"kl": 0.5404210295528173,
"learning_rate": 9.999997325933408e-05,
"loss": 0.001,
"step": 74,
"step_time": 7.489119195001422
},
{
"clip_ratio/high_max": 0.010416666977107525,
"clip_ratio/high_mean": 0.0026041667442768812,
"clip_ratio/low_mean": 0.0015625000232830644,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004166666767559946,
"completions/clipped_ratio": 0.0,
"completions/max_length": 810.0,
"completions/max_terminated_length": 810.0,
"completions/mean_length": 688.1875,
"completions/mean_terminated_length": 688.1875,
"completions/min_length": 607.0,
"completions/min_terminated_length": 607.0,
"entropy": 0.19008919596672058,
"epoch": 0.0015,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.39288049936294556,
"kl": 0.5065996870398521,
"learning_rate": 9.999997183341232e-05,
"loss": -0.0174,
"num_tokens": 3789251.0,
"reward": 5.712855339050293,
"reward_std": 12.491518020629883,
"rewards/rollout_reward_func/mean": 5.712855339050293,
"rewards/rollout_reward_func/std": 13.803718566894531,
"sampling/importance_sampling_ratio/max": 1.3862897157669067,
"sampling/importance_sampling_ratio/mean": 0.9820230007171631,
"sampling/importance_sampling_ratio/min": 0.7251328825950623,
"sampling/sampling_logp_difference/max": 0.38344359397888184,
"sampling/sampling_logp_difference/mean": 0.011255129240453243,
"step": 75,
"step_time": 29.904527067999425
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.014322916977107525,
"clip_ratio/low_mean": 0.03125000069849193,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04557291744276881,
"entropy": 0.1766198892146349,
"epoch": 0.00152,
"grad_norm": 0.24856555461883545,
"kl": 0.5580815225839615,
"learning_rate": 9.999997037045364e-05,
"loss": -0.0236,
"step": 76,
"step_time": 7.936869918999946
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 829.0,
"completions/max_terminated_length": 829.0,
"completions/mean_length": 704.921875,
"completions/mean_terminated_length": 704.921875,
"completions/min_length": 290.0,
"completions/min_terminated_length": 290.0,
"entropy": 0.1587599003687501,
"epoch": 0.00154,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5212773680686951,
"kl": 0.5196739248931408,
"learning_rate": 9.999996887045807e-05,
"loss": -0.0035,
"num_tokens": 3886377.0,
"reward": 4.441685199737549,
"reward_std": 10.929279327392578,
"rewards/rollout_reward_func/mean": 4.441685676574707,
"rewards/rollout_reward_func/std": 12.737987518310547,
"sampling/importance_sampling_ratio/max": 1.4177803993225098,
"sampling/importance_sampling_ratio/mean": 0.9960745573043823,
"sampling/importance_sampling_ratio/min": 0.6403241157531738,
"sampling/sampling_logp_difference/max": 0.35891127586364746,
"sampling/sampling_logp_difference/mean": 0.009403642266988754,
"step": 77,
"step_time": 29.23442492700042
},
{
"clip_ratio/high_max": 0.03787878900766373,
"clip_ratio/high_mean": 0.009469697251915932,
"clip_ratio/low_mean": 0.02260890230536461,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03207859944086522,
"entropy": 0.14319165889173746,
"epoch": 0.00156,
"grad_norm": 0.21223606169223785,
"kl": 0.6083459779620171,
"learning_rate": 9.999996733342559e-05,
"loss": -0.0046,
"step": 78,
"step_time": 9.08587798599865
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0052083334885537624,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0052083334885537624,
"completions/clipped_ratio": 0.0,
"completions/max_length": 827.0,
"completions/max_terminated_length": 827.0,
"completions/mean_length": 699.296875,
"completions/mean_terminated_length": 699.296875,
"completions/min_length": 445.0,
"completions/min_terminated_length": 445.0,
"entropy": 0.13262670719996095,
"epoch": 0.00158,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3686180114746094,
"kl": 0.5813394356518984,
"learning_rate": 9.99999657593562e-05,
"loss": 0.0239,
"num_tokens": 3983088.0,
"reward": 4.565882682800293,
"reward_std": 10.690776824951172,
"rewards/rollout_reward_func/mean": 4.565882682800293,
"rewards/rollout_reward_func/std": 10.94388484954834,
"sampling/importance_sampling_ratio/max": 2.301131010055542,
"sampling/importance_sampling_ratio/mean": 1.038649559020996,
"sampling/importance_sampling_ratio/min": 0.6781718730926514,
"sampling/sampling_logp_difference/max": 0.7350552082061768,
"sampling/sampling_logp_difference/mean": 0.009047108702361584,
"step": 79,
"step_time": 29.03609049600027
},
{
"clip_ratio/high_max": 0.0691287899389863,
"clip_ratio/high_mean": 0.02249053120613098,
"clip_ratio/low_mean": 0.017282197484746575,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03977272880729288,
"entropy": 0.1365647497586906,
"epoch": 0.0016,
"grad_norm": 0.26296547055244446,
"kl": 0.5871373657137156,
"learning_rate": 9.99999641482499e-05,
"loss": 0.0196,
"step": 80,
"step_time": 8.78226529199901
},
{
"clip_ratio/high_max": 0.0052083334885537624,
"clip_ratio/high_mean": 0.0013020833721384406,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0013020833721384406,
"completions/clipped_ratio": 0.0,
"completions/max_length": 817.0,
"completions/max_terminated_length": 817.0,
"completions/mean_length": 704.28125,
"completions/mean_terminated_length": 704.28125,
"completions/min_length": 194.0,
"completions/min_terminated_length": 194.0,
"entropy": 0.13410852942615747,
"epoch": 0.00162,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6391101479530334,
"kl": 0.49469868279993534,
"learning_rate": 9.999996250010672e-05,
"loss": -0.0038,
"num_tokens": 4080648.0,
"reward": 5.768660545349121,
"reward_std": 10.985546112060547,
"rewards/rollout_reward_func/mean": 5.768660068511963,
"rewards/rollout_reward_func/std": 11.962743759155273,
"sampling/importance_sampling_ratio/max": 1.4244225025177002,
"sampling/importance_sampling_ratio/mean": 1.0141850709915161,
"sampling/importance_sampling_ratio/min": 0.6568657755851746,
"sampling/sampling_logp_difference/max": 0.3986610174179077,
"sampling/sampling_logp_difference/mean": 0.009017249569296837,
"step": 81,
"step_time": 29.058386802999394
},
{
"clip_ratio/high_max": 0.06250000186264515,
"clip_ratio/high_mean": 0.016927083721384406,
"clip_ratio/low_mean": 0.025213068933226168,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04214015288744122,
"entropy": 0.1406740453094244,
"epoch": 0.00164,
"grad_norm": 0.3618878424167633,
"kl": 0.5326054207980633,
"learning_rate": 9.99999608149266e-05,
"loss": -0.0092,
"step": 82,
"step_time": 7.4923771910011965
},
{
"clip_ratio/high_max": 0.0052083334885537624,
"clip_ratio/high_mean": 0.0013020833721384406,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0013020833721384406,
"completions/clipped_ratio": 0.0,
"completions/max_length": 827.0,
"completions/max_terminated_length": 827.0,
"completions/mean_length": 690.765625,
"completions/mean_terminated_length": 690.765625,
"completions/min_length": 619.0,
"completions/min_terminated_length": 619.0,
"entropy": 0.13702308759093285,
"epoch": 0.00166,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7054314017295837,
"kl": 0.5327184200286865,
"learning_rate": 9.999995909270962e-05,
"loss": 0.0131,
"num_tokens": 4176944.0,
"reward": 6.398343563079834,
"reward_std": 12.486600875854492,
"rewards/rollout_reward_func/mean": 6.398343563079834,
"rewards/rollout_reward_func/std": 13.118927955627441,
"sampling/importance_sampling_ratio/max": 1.1626012325286865,
"sampling/importance_sampling_ratio/mean": 0.9923787117004395,
"sampling/importance_sampling_ratio/min": 0.6767197847366333,
"sampling/sampling_logp_difference/max": 0.27681541442871094,
"sampling/sampling_logp_difference/mean": 0.007814774289727211,
"step": 83,
"step_time": 30.334892443000626
},
{
"clip_ratio/high_max": 0.052083334885537624,
"clip_ratio/high_mean": 0.014322917093522847,
"clip_ratio/low_mean": 0.02935606148093939,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04367897834163159,
"entropy": 0.13890184368938208,
"epoch": 0.00168,
"grad_norm": 0.23295485973358154,
"kl": 0.583111148327589,
"learning_rate": 9.999995733345573e-05,
"loss": 0.0096,
"step": 84,
"step_time": 8.188888645999668
},
{
"clip_ratio/high_max": 0.005681818351149559,
"clip_ratio/high_mean": 0.0014204545877873898,
"clip_ratio/low_mean": 0.0013020833721384406,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0027225379599258304,
"completions/clipped_ratio": 0.0,
"completions/max_length": 847.0,
"completions/max_terminated_length": 847.0,
"completions/mean_length": 709.140625,
"completions/mean_terminated_length": 709.140625,
"completions/min_length": 385.0,
"completions/min_terminated_length": 385.0,
"entropy": 0.1646800385788083,
"epoch": 0.0017,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.721032977104187,
"kl": 0.5625268556177616,
"learning_rate": 9.999995553716494e-05,
"loss": -0.003,
"num_tokens": 4273965.0,
"reward": 5.8386735916137695,
"reward_std": 13.300103187561035,
"rewards/rollout_reward_func/mean": 5.8386735916137695,
"rewards/rollout_reward_func/std": 13.629975318908691,
"sampling/importance_sampling_ratio/max": 1.314743995666504,
"sampling/importance_sampling_ratio/mean": 1.0051491260528564,
"sampling/importance_sampling_ratio/min": 0.7047513127326965,
"sampling/sampling_logp_difference/max": 0.2584061622619629,
"sampling/sampling_logp_difference/mean": 0.009669218212366104,
"step": 85,
"step_time": 28.418585942000846
},
{
"clip_ratio/high_max": 0.06912878947332501,
"clip_ratio/high_mean": 0.019886364112608135,
"clip_ratio/low_mean": 0.04139046813361347,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.061276831780560315,
"entropy": 0.16476231161504984,
"epoch": 0.00172,
"grad_norm": 0.3757534921169281,
"kl": 0.6113345008343458,
"learning_rate": 9.999995370383726e-05,
"loss": -0.0069,
"step": 86,
"step_time": 8.756163650000417
},
{
"clip_ratio/high_max": 0.011363636702299118,
"clip_ratio/high_mean": 0.0028409091755747795,
"clip_ratio/low_mean": 0.0013020833721384406,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00414299254771322,
"completions/clipped_ratio": 0.0,
"completions/max_length": 823.0,
"completions/max_terminated_length": 823.0,
"completions/mean_length": 684.84375,
"completions/mean_terminated_length": 684.84375,
"completions/min_length": 619.0,
"completions/min_terminated_length": 619.0,
"entropy": 0.15467680245637894,
"epoch": 0.00174,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.39151129126548767,
"kl": 0.5371765866875648,
"learning_rate": 9.999995183347267e-05,
"loss": 0.0105,
"num_tokens": 4369299.0,
"reward": 5.963912010192871,
"reward_std": 12.684013366699219,
"rewards/rollout_reward_func/mean": 5.963912010192871,
"rewards/rollout_reward_func/std": 13.017167091369629,
"sampling/importance_sampling_ratio/max": 1.2570720911026,
"sampling/importance_sampling_ratio/mean": 1.0000150203704834,
"sampling/importance_sampling_ratio/min": 0.6576955914497375,
"sampling/sampling_logp_difference/max": 0.23494529724121094,
"sampling/sampling_logp_difference/mean": 0.009037286043167114,
"step": 87,
"step_time": 27.959497561998433
},
{
"clip_ratio/high_max": 0.04876894038170576,
"clip_ratio/high_mean": 0.014796401956118643,
"clip_ratio/low_mean": 0.030184660223312676,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04498106241226196,
"entropy": 0.1502314694225788,
"epoch": 0.00176,
"grad_norm": 0.24433566629886627,
"kl": 0.518398828804493,
"learning_rate": 9.999994992607121e-05,
"loss": 0.0052,
"step": 88,
"step_time": 6.98385682199978
},
{
"clip_ratio/high_max": 0.0052083334885537624,
"clip_ratio/high_mean": 0.0013020833721384406,
"clip_ratio/low_mean": 0.0013020833721384406,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0026041667442768812,
"completions/clipped_ratio": 0.0,
"completions/max_length": 808.0,
"completions/max_terminated_length": 808.0,
"completions/mean_length": 674.84375,
"completions/mean_terminated_length": 674.84375,
"completions/min_length": 286.0,
"completions/min_terminated_length": 286.0,
"entropy": 0.1636304627172649,
"epoch": 0.00178,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4070056080818176,
"kl": 0.44829913787543774,
"learning_rate": 9.999994798163285e-05,
"loss": 0.0028,
"num_tokens": 4464636.0,
"reward": 4.596271991729736,
"reward_std": 12.002615928649902,
"rewards/rollout_reward_func/mean": 4.5962724685668945,
"rewards/rollout_reward_func/std": 12.03700065612793,
"sampling/importance_sampling_ratio/max": 1.8310773372650146,
"sampling/importance_sampling_ratio/mean": 1.0015285015106201,
"sampling/importance_sampling_ratio/min": 0.6802361011505127,
"sampling/sampling_logp_difference/max": 0.63387131690979,
"sampling/sampling_logp_difference/mean": 0.01002519205212593,
"step": 89,
"step_time": 29.20652451100068
},
{
"clip_ratio/high_max": 0.053503789473325014,
"clip_ratio/high_mean": 0.014678030740469694,
"clip_ratio/low_mean": 0.014914773171767592,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02959280402865261,
"entropy": 0.16931697819381952,
"epoch": 0.0018,
"grad_norm": 0.22567316889762878,
"kl": 0.45854073390364647,
"learning_rate": 9.999994600015763e-05,
"loss": -0.0044,
"step": 90,
"step_time": 7.806028198999684
},
{
"clip_ratio/high_max": 0.010416666977107525,
"clip_ratio/high_mean": 0.0026041667442768812,
"clip_ratio/low_mean": 0.0013020833721384406,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003906250116415322,
"completions/clipped_ratio": 0.0,
"completions/max_length": 818.0,
"completions/max_terminated_length": 818.0,
"completions/mean_length": 690.921875,
"completions/mean_terminated_length": 690.921875,
"completions/min_length": 573.0,
"completions/min_terminated_length": 573.0,
"entropy": 0.17555938381701708,
"epoch": 0.00182,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6369052529335022,
"kl": 0.5130380634218454,
"learning_rate": 9.99999439816455e-05,
"loss": 0.0097,
"num_tokens": 4560466.0,
"reward": 4.266380786895752,
"reward_std": 8.932316780090332,
"rewards/rollout_reward_func/mean": 4.266380786895752,
"rewards/rollout_reward_func/std": 9.506205558776855,
"sampling/importance_sampling_ratio/max": 1.4317197799682617,
"sampling/importance_sampling_ratio/mean": 0.9800074100494385,
"sampling/importance_sampling_ratio/min": 0.6640469431877136,
"sampling/sampling_logp_difference/max": 0.39695852994918823,
"sampling/sampling_logp_difference/mean": 0.011851027607917786,
"step": 91,
"step_time": 30.03264877599986
},
{
"clip_ratio/high_max": 0.07812500186264515,
"clip_ratio/high_mean": 0.027343750349245965,
"clip_ratio/low_mean": 0.02604166732635349,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.05338541814126074,
"entropy": 0.16226398199796677,
"epoch": 0.00184,
"grad_norm": 0.49021783471107483,
"kl": 0.678026232868433,
"learning_rate": 9.999994192609649e-05,
"loss": 0.0008,
"step": 92,
"step_time": 9.04058756600034
},
{
"clip_ratio/high_max": 0.0052083334885537624,
"clip_ratio/high_mean": 0.0013020833721384406,
"clip_ratio/low_mean": 0.0026041667442768812,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003906250116415322,
"completions/clipped_ratio": 0.0,
"completions/max_length": 827.0,
"completions/max_terminated_length": 827.0,
"completions/mean_length": 706.40625,
"completions/mean_terminated_length": 706.40625,
"completions/min_length": 297.0,
"completions/min_terminated_length": 297.0,
"entropy": 0.1653224742040038,
"epoch": 0.00186,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5930750966072083,
"kl": 0.533378497697413,
"learning_rate": 9.999993983351059e-05,
"loss": 0.0049,
"num_tokens": 4657400.0,
"reward": 4.687631607055664,
"reward_std": 12.176762580871582,
"rewards/rollout_reward_func/mean": 4.687631607055664,
"rewards/rollout_reward_func/std": 13.946465492248535,
"sampling/importance_sampling_ratio/max": 2.0640549659729004,
"sampling/importance_sampling_ratio/mean": 1.0510772466659546,
"sampling/importance_sampling_ratio/min": 0.6677830219268799,
"sampling/sampling_logp_difference/max": 0.5909380912780762,
"sampling/sampling_logp_difference/mean": 0.01191171444952488,
"step": 93,
"step_time": 28.226474250999672
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.011718750349245965,
"clip_ratio/low_mean": 0.022135417442768812,
"clip_ratio/low_min": 0.0052083334885537624,
"clip_ratio/region_mean": 0.033854167675599456,
"entropy": 0.15958264330402017,
"epoch": 0.00188,
"grad_norm": 0.35930758714675903,
"kl": 0.7466034032404423,
"learning_rate": 9.999993770388783e-05,
"loss": 0.0032,
"step": 94,
"step_time": 8.234778083000037
},
{
"clip_ratio/high_max": 0.0052083334885537624,
"clip_ratio/high_mean": 0.0013020833721384406,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0013020833721384406,
"completions/clipped_ratio": 0.0,
"completions/max_length": 824.0,
"completions/max_terminated_length": 824.0,
"completions/mean_length": 697.828125,
"completions/mean_terminated_length": 697.828125,
"completions/min_length": 623.0,
"completions/min_terminated_length": 623.0,
"entropy": 0.16395951714366674,
"epoch": 0.0019,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.30130699276924133,
"kl": 0.4953720346093178,
"learning_rate": 9.99999355372282e-05,
"loss": 0.0087,
"num_tokens": 4753836.0,
"reward": 4.204550743103027,
"reward_std": 11.951547622680664,
"rewards/rollout_reward_func/mean": 4.204550743103027,
"rewards/rollout_reward_func/std": 13.192495346069336,
"sampling/importance_sampling_ratio/max": 1.7262465953826904,
"sampling/importance_sampling_ratio/mean": 1.0100435018539429,
"sampling/importance_sampling_ratio/min": 0.6937407851219177,
"sampling/sampling_logp_difference/max": 0.5034514665603638,
"sampling/sampling_logp_difference/mean": 0.008402319625020027,
"step": 95,
"step_time": 29.812369647001105
},
{
"clip_ratio/high_max": 0.03645833441987634,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.009114583604969084,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01953125058207661,
"entropy": 0.18544823909178376,
"epoch": 0.00192,
"grad_norm": 0.2023509442806244,
"kl": 0.44245083443820477,
"learning_rate": 9.999993333353168e-05,
"loss": 0.0061,
"step": 96,
"step_time": 7.195571093998296
},
{
"clip_ratio/high_max": 0.0052083334885537624,
"clip_ratio/high_mean": 0.0026041667442768812,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0026041667442768812,
"completions/clipped_ratio": 0.0,
"completions/max_length": 829.0,
"completions/max_terminated_length": 829.0,
"completions/mean_length": 687.921875,
"completions/mean_terminated_length": 687.921875,
"completions/min_length": 596.0,
"completions/min_terminated_length": 596.0,
"entropy": 0.20835321862250566,
"epoch": 0.00194,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.35837119817733765,
"kl": 0.4389466196298599,
"learning_rate": 9.999993109279828e-05,
"loss": 0.0044,
"num_tokens": 4849131.0,
"reward": 3.880918264389038,
"reward_std": 8.090033531188965,
"rewards/rollout_reward_func/mean": 3.880918264389038,
"rewards/rollout_reward_func/std": 9.26294231414795,
"sampling/importance_sampling_ratio/max": 1.2344332933425903,
"sampling/importance_sampling_ratio/mean": 0.9644654989242554,
"sampling/importance_sampling_ratio/min": 0.7370292544364929,
"sampling/sampling_logp_difference/max": 0.29116082191467285,
"sampling/sampling_logp_difference/mean": 0.009601429104804993,
"step": 97,
"step_time": 30.270599251999556
},
{
"clip_ratio/high_max": 0.052083334885537624,
"clip_ratio/high_mean": 0.016927083721384406,
"clip_ratio/low_mean": 0.014559659757651389,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.031486743479035795,
"entropy": 0.21354177221655846,
"epoch": 0.00196,
"grad_norm": 0.20351360738277435,
"kl": 0.43841097690165043,
"learning_rate": 9.999992881502804e-05,
"loss": 0.0004,
"step": 98,
"step_time": 7.506363271999817
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 824.0,
"completions/max_terminated_length": 824.0,
"completions/mean_length": 706.140625,
"completions/mean_terminated_length": 706.140625,
"completions/min_length": 391.0,
"completions/min_terminated_length": 391.0,
"entropy": 0.21307788416743279,
"epoch": 0.00198,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4080103039741516,
"kl": 0.5334641952067614,
"learning_rate": 9.99999265002209e-05,
"loss": -0.003,
"num_tokens": 4945915.0,
"reward": 5.200403213500977,
"reward_std": 14.344334602355957,
"rewards/rollout_reward_func/mean": 5.200403213500977,
"rewards/rollout_reward_func/std": 14.294367790222168,
"sampling/importance_sampling_ratio/max": 1.2247880697250366,
"sampling/importance_sampling_ratio/mean": 1.0129998922348022,
"sampling/importance_sampling_ratio/min": 0.7771543860435486,
"sampling/sampling_logp_difference/max": 0.23006606101989746,
"sampling/sampling_logp_difference/mean": 0.00854739174246788,
"step": 99,
"step_time": 29.33993570499979
},
{
"clip_ratio/high_max": 0.015625000465661287,
"clip_ratio/high_mean": 0.006510416744276881,
"clip_ratio/low_mean": 0.023555872030556202,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.030066288774833083,
"entropy": 0.21015852224081755,
"epoch": 0.002,
"grad_norm": 0.2297798991203308,
"kl": 0.5835338849574327,
"learning_rate": 9.999992414837691e-05,
"loss": -0.008,
"step": 100,
"step_time": 8.775622698999086
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 837.0,
"completions/max_terminated_length": 837.0,
"completions/mean_length": 711.671875,
"completions/mean_terminated_length": 711.671875,
"completions/min_length": 616.0,
"completions/min_terminated_length": 616.0,
"entropy": 0.2137407148256898,
"epoch": 0.00202,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4429977834224701,
"kl": 0.4639507979154587,
"learning_rate": 9.999992175949606e-05,
"loss": -0.0173,
"num_tokens": 5042733.0,
"reward": 3.351179838180542,
"reward_std": 8.503268241882324,
"rewards/rollout_reward_func/mean": 3.351179838180542,
"rewards/rollout_reward_func/std": 8.948554039001465,
"sampling/importance_sampling_ratio/max": 1.328324556350708,
"sampling/importance_sampling_ratio/mean": 1.0001481771469116,
"sampling/importance_sampling_ratio/min": 0.5792597532272339,
"sampling/sampling_logp_difference/max": 0.4302701950073242,
"sampling/sampling_logp_difference/mean": 0.008802896365523338,
"step": 101,
"step_time": 29.50366010900052
},
{
"clip_ratio/high_max": 0.0572916679084301,
"clip_ratio/high_mean": 0.02083333407063037,
"clip_ratio/low_mean": 0.021188447950407863,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04202178155537695,
"entropy": 0.19818230718374252,
"epoch": 0.00204,
"grad_norm": 0.228831484913826,
"kl": 0.523833503946662,
"learning_rate": 9.999991933357836e-05,
"loss": -0.0238,
"step": 102,
"step_time": 7.743058271999871
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 819.0,
"completions/max_terminated_length": 819.0,
"completions/mean_length": 679.5,
"completions/mean_terminated_length": 679.5,
"completions/min_length": 393.0,
"completions/min_terminated_length": 393.0,
"entropy": 0.16811883123591542,
"epoch": 0.00206,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2717866897583008,
"kl": 0.5116975158452988,
"learning_rate": 9.999991687062378e-05,
"loss": 0.0026,
"num_tokens": 5137485.0,
"reward": 3.233732223510742,
"reward_std": 12.289377212524414,
"rewards/rollout_reward_func/mean": 3.233732223510742,
"rewards/rollout_reward_func/std": 14.167500495910645,
"sampling/importance_sampling_ratio/max": 1.1452041864395142,
"sampling/importance_sampling_ratio/mean": 0.9949536323547363,
"sampling/importance_sampling_ratio/min": 0.8263934254646301,
"sampling/sampling_logp_difference/max": 0.11179852485656738,
"sampling/sampling_logp_difference/mean": 0.00560589786618948,
"step": 103,
"step_time": 28.410943980999036
},
{
"clip_ratio/high_max": 0.03645833441987634,
"clip_ratio/high_mean": 0.009114583604969084,
"clip_ratio/low_mean": 0.036576704937033355,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04569128877483308,
"entropy": 0.1520394361577928,
"epoch": 0.00208,
"grad_norm": 0.1855272352695465,
"kl": 0.548751313239336,
"learning_rate": 9.999991437063234e-05,
"loss": -0.0007,
"step": 104,
"step_time": 7.630572153999765
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0013020833721384406,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0013020833721384406,
"completions/clipped_ratio": 0.0,
"completions/max_length": 821.0,
"completions/max_terminated_length": 821.0,
"completions/mean_length": 689.109375,
"completions/mean_terminated_length": 689.109375,
"completions/min_length": 379.0,
"completions/min_terminated_length": 379.0,
"entropy": 0.15672127809375525,
"epoch": 0.0021,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.45729750394821167,
"kl": 0.6510039251297712,
"learning_rate": 9.999991183360407e-05,
"loss": -0.011,
"num_tokens": 5232831.0,
"reward": 4.220555305480957,
"reward_std": 10.952154159545898,
"rewards/rollout_reward_func/mean": 4.220555305480957,
"rewards/rollout_reward_func/std": 11.161866188049316,
"sampling/importance_sampling_ratio/max": 1.289732813835144,
"sampling/importance_sampling_ratio/mean": 0.9952840209007263,
"sampling/importance_sampling_ratio/min": 0.6639890074729919,
"sampling/sampling_logp_difference/max": 0.4248615503311157,
"sampling/sampling_logp_difference/mean": 0.009283961728215218,
"step": 105,
"step_time": 29.269957731999057
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.006510416860692203,
"clip_ratio/low_mean": 0.015625000232830644,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.022135417093522847,
"entropy": 0.15183987142518163,
"epoch": 0.00212,
"grad_norm": 0.19675187766551971,
"kl": 0.7388164456933737,
"learning_rate": 9.999990925953892e-05,
"loss": -0.0165,
"step": 106,
"step_time": 7.576425396001014
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0013020833721384406,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0013020833721384406,
"completions/clipped_ratio": 0.0,
"completions/max_length": 817.0,
"completions/max_terminated_length": 817.0,
"completions/mean_length": 695.0625,
"completions/mean_terminated_length": 695.0625,
"completions/min_length": 619.0,
"completions/min_terminated_length": 619.0,
"entropy": 0.14494483266025782,
"epoch": 0.00214,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4538170397281647,
"kl": 0.678084384649992,
"learning_rate": 9.999990664843695e-05,
"loss": 0.0147,
"num_tokens": 5328578.0,
"reward": 9.525361061096191,
"reward_std": 13.358152389526367,
"rewards/rollout_reward_func/mean": 9.525361061096191,
"rewards/rollout_reward_func/std": 14.251992225646973,
"sampling/importance_sampling_ratio/max": 1.1812809705734253,
"sampling/importance_sampling_ratio/mean": 0.9926539659500122,
"sampling/importance_sampling_ratio/min": 0.7029387950897217,
"sampling/sampling_logp_difference/max": 0.35564422607421875,
"sampling/sampling_logp_difference/mean": 0.007083391770720482,
"step": 107,
"step_time": 28.04022229299926
},
{
"clip_ratio/high_max": 0.046875000931322575,
"clip_ratio/high_mean": 0.015625000349245965,
"clip_ratio/low_mean": 0.015861742896959186,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03148674312978983,
"entropy": 0.1589709185063839,
"epoch": 0.00216,
"grad_norm": 0.22844459116458893,
"kl": 0.6251159347593784,
"learning_rate": 9.999990400029812e-05,
"loss": 0.0106,
"step": 108,
"step_time": 8.196292393000022
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0013020833721384406,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0013020833721384406,
"completions/clipped_ratio": 0.0,
"completions/max_length": 832.0,
"completions/max_terminated_length": 832.0,
"completions/mean_length": 703.75,
"completions/mean_terminated_length": 703.75,
"completions/min_length": 522.0,
"completions/min_terminated_length": 522.0,
"entropy": 0.17329717054963112,
"epoch": 0.00218,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3682776987552643,
"kl": 0.6051198206841946,
"learning_rate": 9.999990131512245e-05,
"loss": 0.0061,
"num_tokens": 5424927.0,
"reward": 6.206368923187256,
"reward_std": 10.578010559082031,
"rewards/rollout_reward_func/mean": 6.206368923187256,
"rewards/rollout_reward_func/std": 11.067666053771973,
"sampling/importance_sampling_ratio/max": 1.4831089973449707,
"sampling/importance_sampling_ratio/mean": 1.002763271331787,
"sampling/importance_sampling_ratio/min": 0.7234499454498291,
"sampling/sampling_logp_difference/max": 0.3583219051361084,
"sampling/sampling_logp_difference/mean": 0.007746794261038303,
"step": 109,
"step_time": 27.809638274999543
},
{
"clip_ratio/high_max": 0.026988637167960405,
"clip_ratio/high_mean": 0.010653409641236067,
"clip_ratio/low_mean": 0.014441288309171796,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.025094697950407863,
"entropy": 0.17460143100470304,
"epoch": 0.0022,
"grad_norm": 0.1794712245464325,
"kl": 0.6243367586284876,
"learning_rate": 9.999989859290995e-05,
"loss": 0.0027,
"step": 110,
"step_time": 7.0755484739993335
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 820.0,
"completions/max_terminated_length": 820.0,
"completions/mean_length": 697.578125,
"completions/mean_terminated_length": 697.578125,
"completions/min_length": 285.0,
"completions/min_terminated_length": 285.0,
"entropy": 0.17424820829182863,
"epoch": 0.00222,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.42331627011299133,
"kl": 0.586581215262413,
"learning_rate": 9.99998958336606e-05,
"loss": -0.0044,
"num_tokens": 5520852.0,
"reward": 3.5279414653778076,
"reward_std": 14.582866668701172,
"rewards/rollout_reward_func/mean": 3.5279414653778076,
"rewards/rollout_reward_func/std": 15.890913963317871,
"sampling/importance_sampling_ratio/max": 1.2239214181900024,
"sampling/importance_sampling_ratio/mean": 0.9994624853134155,
"sampling/importance_sampling_ratio/min": 0.6852503418922424,
"sampling/sampling_logp_difference/max": 0.31956130266189575,
"sampling/sampling_logp_difference/mean": 0.006933148950338364,
"step": 111,
"step_time": 29.204085013999247
},
{
"clip_ratio/high_max": 0.02651515230536461,
"clip_ratio/high_mean": 0.006628788076341152,
"clip_ratio/low_mean": 0.018129006726667285,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02475779491942376,
"entropy": 0.1639441135339439,
"epoch": 0.00224,
"grad_norm": 0.19418354332447052,
"kl": 0.650929281488061,
"learning_rate": 9.999989303737441e-05,
"loss": -0.0109,
"step": 112,
"step_time": 7.643361527999332
},
{
"clip_ratio/high_max": 0.0052083334885537624,
"clip_ratio/high_mean": 0.0013020833721384406,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0013020833721384406,
"completions/clipped_ratio": 0.0,
"completions/max_length": 813.0,
"completions/max_terminated_length": 813.0,
"completions/mean_length": 693.09375,
"completions/mean_terminated_length": 693.09375,
"completions/min_length": 630.0,
"completions/min_terminated_length": 630.0,
"entropy": 0.14537212159484625,
"epoch": 0.00226,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4737316071987152,
"kl": 0.669768800958991,
"learning_rate": 9.99998902040514e-05,
"loss": 0.0169,
"num_tokens": 5616460.0,
"reward": 3.8732333183288574,
"reward_std": 9.794268608093262,
"rewards/rollout_reward_func/mean": 3.8732333183288574,
"rewards/rollout_reward_func/std": 10.40365982055664,
"sampling/importance_sampling_ratio/max": 1.1869113445281982,
"sampling/importance_sampling_ratio/mean": 0.9964576959609985,
"sampling/importance_sampling_ratio/min": 0.5200645923614502,
"sampling/sampling_logp_difference/max": 0.6150112152099609,
"sampling/sampling_logp_difference/mean": 0.007128065451979637,
"step": 113,
"step_time": 27.900884353000038
},
{
"clip_ratio/high_max": 0.042140152771025896,
"clip_ratio/high_mean": 0.011837121681310236,
"clip_ratio/low_mean": 0.006510416860692203,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01834753854200244,
"entropy": 0.14897123211994767,
"epoch": 0.00228,
"grad_norm": 0.20660799741744995,
"kl": 0.7189689762890339,
"learning_rate": 9.999988733369157e-05,
"loss": 0.0137,
"step": 114,
"step_time": 7.532232160000149
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 828.0,
"completions/max_terminated_length": 828.0,
"completions/mean_length": 689.0625,
"completions/mean_terminated_length": 689.0625,
"completions/min_length": 291.0,
"completions/min_terminated_length": 291.0,
"entropy": 0.16697307769209146,
"epoch": 0.0023,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.37319666147232056,
"kl": 0.6085000336170197,
"learning_rate": 9.999988442629488e-05,
"loss": -0.015,
"num_tokens": 5711756.0,
"reward": 3.845529079437256,
"reward_std": 9.702705383300781,
"rewards/rollout_reward_func/mean": 3.845529079437256,
"rewards/rollout_reward_func/std": 9.905435562133789,
"sampling/importance_sampling_ratio/max": 1.3216222524642944,
"sampling/importance_sampling_ratio/mean": 1.0128694772720337,
"sampling/importance_sampling_ratio/min": 0.7146333456039429,
"sampling/sampling_logp_difference/max": 0.3742462396621704,
"sampling/sampling_logp_difference/mean": 0.006911748554557562,
"step": 115,
"step_time": 29.116642522001257
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.011718750349245965,
"clip_ratio/low_mean": 0.020951705169864,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.032670455519109964,
"entropy": 0.17214004416018724,
"epoch": 0.00232,
"grad_norm": 0.19411630928516388,
"kl": 0.6454576198011637,
"learning_rate": 9.99998814818614e-05,
"loss": -0.0191,
"step": 116,
"step_time": 7.846242159999747
},
{
"clip_ratio/high_max": 0.0052083334885537624,
"clip_ratio/high_mean": 0.0013020833721384406,
"clip_ratio/low_mean": 0.0013020833721384406,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0026041667442768812,
"completions/clipped_ratio": 0.0,
"completions/max_length": 815.0,
"completions/max_terminated_length": 815.0,
"completions/mean_length": 674.78125,
"completions/mean_terminated_length": 674.78125,
"completions/min_length": 273.0,
"completions/min_terminated_length": 273.0,
"entropy": 0.16358821745961905,
"epoch": 0.00234,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3246734142303467,
"kl": 0.5800395030528307,
"learning_rate": 9.999987850039107e-05,
"loss": 0.0099,
"num_tokens": 5806145.0,
"reward": 1.2733659744262695,
"reward_std": 12.069713592529297,
"rewards/rollout_reward_func/mean": 1.2733662128448486,
"rewards/rollout_reward_func/std": 12.829185485839844,
"sampling/importance_sampling_ratio/max": 1.306739330291748,
"sampling/importance_sampling_ratio/mean": 1.0012977123260498,
"sampling/importance_sampling_ratio/min": 0.8135073781013489,
"sampling/sampling_logp_difference/max": 0.19866454601287842,
"sampling/sampling_logp_difference/mean": 0.006336529273539782,
"step": 117,
"step_time": 27.930649275999258
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.006510416860692203,
"clip_ratio/low_mean": 0.013020833721384406,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01953125058207661,
"entropy": 0.16348634008318186,
"epoch": 0.00236,
"grad_norm": 0.11877016723155975,
"kl": 0.587722685188055,
"learning_rate": 9.999987548188396e-05,
"loss": 0.0055,
"step": 118,
"step_time": 7.173724952000157
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 834.0,
"completions/max_terminated_length": 834.0,
"completions/mean_length": 685.921875,
"completions/mean_terminated_length": 685.921875,
"completions/min_length": 307.0,
"completions/min_terminated_length": 307.0,
"entropy": 0.17666231095790863,
"epoch": 0.00238,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2972959578037262,
"kl": 0.5733677446842194,
"learning_rate": 9.999987242634001e-05,
"loss": 0.0156,
"num_tokens": 5901319.0,
"reward": 6.098433494567871,
"reward_std": 11.96851921081543,
"rewards/rollout_reward_func/mean": 6.098433494567871,
"rewards/rollout_reward_func/std": 14.112695693969727,
"sampling/importance_sampling_ratio/max": 1.2103277444839478,
"sampling/importance_sampling_ratio/mean": 1.0073938369750977,
"sampling/importance_sampling_ratio/min": 0.7692804932594299,
"sampling/sampling_logp_difference/max": 0.13658356666564941,
"sampling/sampling_logp_difference/mean": 0.0063937013037502766,
"step": 119,
"step_time": 28.342584406000242
},
{
"clip_ratio/high_max": 0.010416666977107525,
"clip_ratio/high_mean": 0.0026041667442768812,
"clip_ratio/low_mean": 0.009114583488553762,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011718750232830644,
"entropy": 0.16604932164773345,
"epoch": 0.0024,
"grad_norm": 0.23078079521656036,
"kl": 0.5974587891250849,
"learning_rate": 9.999986933375924e-05,
"loss": 0.0105,
"step": 120,
"step_time": 7.440147934999914
},
{
"clip_ratio/high_max": 0.005681818351149559,
"clip_ratio/high_mean": 0.0014204545877873898,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0014204545877873898,
"completions/clipped_ratio": 0.0,
"completions/max_length": 827.0,
"completions/max_terminated_length": 827.0,
"completions/mean_length": 677.953125,
"completions/mean_terminated_length": 677.953125,
"completions/min_length": 197.0,
"completions/min_terminated_length": 197.0,
"entropy": 0.13314053160138428,
"epoch": 0.00242,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2688275873661041,
"kl": 0.6813949979841709,
"learning_rate": 9.999986620414167e-05,
"loss": -0.0055,
"num_tokens": 5995970.0,
"reward": 4.1811299324035645,
"reward_std": 11.76725959777832,
"rewards/rollout_reward_func/mean": 4.1811299324035645,
"rewards/rollout_reward_func/std": 12.213129997253418,
"sampling/importance_sampling_ratio/max": 1.4055489301681519,
"sampling/importance_sampling_ratio/mean": 1.0007095336914062,
"sampling/importance_sampling_ratio/min": 0.7907775640487671,
"sampling/sampling_logp_difference/max": 0.2328205108642578,
"sampling/sampling_logp_difference/mean": 0.0057580312713980675,
"step": 121,
"step_time": 25.73195371799966
},
{
"clip_ratio/high_max": 0.03172348579391837,
"clip_ratio/high_mean": 0.007930871448479593,
"clip_ratio/low_mean": 0.007812500232830644,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015743371681310236,
"entropy": 0.12837151251733303,
"epoch": 0.00244,
"grad_norm": 0.19652943313121796,
"kl": 0.6755912862718105,
"learning_rate": 9.99998630374873e-05,
"loss": -0.0109,
"step": 122,
"step_time": 7.963045050999881
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 777.0,
"completions/max_terminated_length": 777.0,
"completions/mean_length": 676.75,
"completions/mean_terminated_length": 676.75,
"completions/min_length": 277.0,
"completions/min_terminated_length": 277.0,
"entropy": 0.14288373803719878,
"epoch": 0.00246,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5129408836364746,
"kl": 0.6468502469360828,
"learning_rate": 9.999985983379613e-05,
"loss": -0.002,
"num_tokens": 6090409.0,
"reward": 5.090976238250732,
"reward_std": 8.817068099975586,
"rewards/rollout_reward_func/mean": 5.090975761413574,
"rewards/rollout_reward_func/std": 9.348170280456543,
"sampling/importance_sampling_ratio/max": 1.2873598337173462,
"sampling/importance_sampling_ratio/mean": 0.9989021420478821,
"sampling/importance_sampling_ratio/min": 0.8453167676925659,
"sampling/sampling_logp_difference/max": 0.1934504508972168,
"sampling/sampling_logp_difference/mean": 0.0064778015948832035,
"step": 123,
"step_time": 28.174620942000274
},
{
"clip_ratio/high_max": 0.026041667442768812,
"clip_ratio/high_mean": 0.006510416860692203,
"clip_ratio/low_mean": 0.02367424312978983,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03018465987406671,
"entropy": 0.12851850083097816,
"epoch": 0.00248,
"grad_norm": 0.170461967587471,
"kl": 0.6984463054686785,
"learning_rate": 9.999985659306817e-05,
"loss": -0.0077,
"step": 124,
"step_time": 6.415902794999965
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 823.0,
"completions/max_terminated_length": 823.0,
"completions/mean_length": 686.765625,
"completions/mean_terminated_length": 686.765625,
"completions/min_length": 274.0,
"completions/min_terminated_length": 274.0,
"entropy": 0.12949980096891522,
"epoch": 0.0025,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4389054477214813,
"kl": 0.8373041488230228,
"learning_rate": 9.999985331530339e-05,
"loss": -0.0001,
"num_tokens": 6185533.0,
"reward": 6.523627281188965,
"reward_std": 12.731056213378906,
"rewards/rollout_reward_func/mean": 6.523627281188965,
"rewards/rollout_reward_func/std": 13.220861434936523,
"sampling/importance_sampling_ratio/max": 1.4951905012130737,
"sampling/importance_sampling_ratio/mean": 1.0012614727020264,
"sampling/importance_sampling_ratio/min": 0.7251157760620117,
"sampling/sampling_logp_difference/max": 0.39764922857284546,
"sampling/sampling_logp_difference/mean": 0.006425045896321535,
"step": 125,
"step_time": 27.883570014999805
},
{
"clip_ratio/high_max": 0.03645833441987634,
"clip_ratio/high_mean": 0.009114583604969084,
"clip_ratio/low_mean": 0.02854567370377481,
"clip_ratio/low_min": 0.0052083334885537624,
"clip_ratio/region_mean": 0.03766025695949793,
"entropy": 0.11564141698181629,
"epoch": 0.00252,
"grad_norm": 0.24558016657829285,
"kl": 1.0033343844115734,
"learning_rate": 9.999985000050182e-05,
"loss": -0.0041,
"step": 126,
"step_time": 6.9546678629999406
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0013020833721384406,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0013020833721384406,
"completions/clipped_ratio": 0.0,
"completions/max_length": 837.0,
"completions/max_terminated_length": 837.0,
"completions/mean_length": 683.515625,
"completions/mean_terminated_length": 683.515625,
"completions/min_length": 617.0,
"completions/min_terminated_length": 617.0,
"entropy": 0.10271549178287387,
"epoch": 0.00254,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.48136869072914124,
"kl": 0.8534571155905724,
"learning_rate": 9.999984664866347e-05,
"loss": 0.0132,
"num_tokens": 6280443.0,
"reward": 4.674668788909912,
"reward_std": 11.713541030883789,
"rewards/rollout_reward_func/mean": 4.67466926574707,
"rewards/rollout_reward_func/std": 13.705061912536621,
"sampling/importance_sampling_ratio/max": 1.1515135765075684,
"sampling/importance_sampling_ratio/mean": 0.9829530715942383,
"sampling/importance_sampling_ratio/min": 0.6125902533531189,
"sampling/sampling_logp_difference/max": 0.4248628616333008,
"sampling/sampling_logp_difference/mean": 0.00649910606443882,
"step": 127,
"step_time": 27.126788691001366
},
{
"clip_ratio/high_max": 0.03645833441987634,
"clip_ratio/high_mean": 0.009114583604969084,
"clip_ratio/low_mean": 0.015625000465661287,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02473958395421505,
"entropy": 0.10252567520365119,
"epoch": 0.00256,
"grad_norm": 0.25049537420272827,
"kl": 0.9629664830863476,
"learning_rate": 9.999984325978833e-05,
"loss": 0.0108,
"step": 128,
"step_time": 7.304040701002123
},
{
"clip_ratio/high_max": 0.004166666883975267,
"clip_ratio/high_mean": 0.0010416667209938169,
"clip_ratio/low_mean": 0.0010416667209938169,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0020833334419876337,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1019.0,
"completions/max_terminated_length": 1019.0,
"completions/mean_length": 959.40625,
"completions/mean_terminated_length": 959.40625,
"completions/min_length": 910.0,
"completions/min_terminated_length": 910.0,
"entropy": 0.13085902528837323,
"epoch": 0.00258,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6536189913749695,
"kl": 0.8716370463371277,
"learning_rate": 9.99998398338764e-05,
"loss": 0.0229,
"num_tokens": 6393090.0,
"reward": 5.6695556640625,
"reward_std": 11.05074405670166,
"rewards/rollout_reward_func/mean": 5.669555187225342,
"rewards/rollout_reward_func/std": 12.366477966308594,
"sampling/importance_sampling_ratio/max": 1.2895963191986084,
"sampling/importance_sampling_ratio/mean": 1.023085355758667,
"sampling/importance_sampling_ratio/min": 0.7725162506103516,
"sampling/sampling_logp_difference/max": 0.30040407180786133,
"sampling/sampling_logp_difference/mean": 0.008372966200113297,
"step": 129,
"step_time": 33.28244163999989
},
{
"clip_ratio/high_max": 0.07559524197131395,
"clip_ratio/high_mean": 0.028273811331018806,
"clip_ratio/low_mean": 0.02299107296857983,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.051264884416013956,
"entropy": 0.14083249866962433,
"epoch": 0.0026,
"grad_norm": 0.30618196725845337,
"kl": 0.9511819295585155,
"learning_rate": 9.999983637092769e-05,
"loss": 0.0154,
"step": 130,
"step_time": 8.346246693000012
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1034.0,
"completions/max_terminated_length": 1034.0,
"completions/mean_length": 954.09375,
"completions/mean_terminated_length": 954.09375,
"completions/min_length": 446.0,
"completions/min_terminated_length": 446.0,
"entropy": 0.1475105220451951,
"epoch": 0.00262,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5288283228874207,
"kl": 0.912773609161377,
"learning_rate": 9.999983287094222e-05,
"loss": -0.0212,
"num_tokens": 6505385.0,
"reward": 7.057158470153809,
"reward_std": 10.295648574829102,
"rewards/rollout_reward_func/mean": 7.057158470153809,
"rewards/rollout_reward_func/std": 10.425559997558594,
"sampling/importance_sampling_ratio/max": 1.3561307191848755,
"sampling/importance_sampling_ratio/mean": 0.9847082495689392,
"sampling/importance_sampling_ratio/min": 0.6632312536239624,
"sampling/sampling_logp_difference/max": 0.22558808326721191,
"sampling/sampling_logp_difference/mean": 0.00731184845790267,
"step": 131,
"step_time": 33.95278312799974
},
{
"clip_ratio/high_max": 0.06726190773770213,
"clip_ratio/high_mean": 0.02313988225068897,
"clip_ratio/low_mean": 0.03020833560731262,
"clip_ratio/low_min": 0.004166666883975267,
"clip_ratio/region_mean": 0.0533482184400782,
"entropy": 0.16657310537993908,
"epoch": 0.00264,
"grad_norm": 0.29056552052497864,
"kl": 0.7771002501249313,
"learning_rate": 9.999982933391997e-05,
"loss": -0.0284,
"step": 132,
"step_time": 7.300914149999244
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0031250001629814506,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0031250001629814506,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1041.0,
"completions/max_terminated_length": 1041.0,
"completions/mean_length": 929.96875,
"completions/mean_terminated_length": 929.96875,
"completions/min_length": 190.0,
"completions/min_terminated_length": 190.0,
"entropy": 0.18474403023719788,
"epoch": 0.00266,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6026266813278198,
"kl": 0.7070890348404646,
"learning_rate": 9.999982575986094e-05,
"loss": -0.0,
"num_tokens": 6616176.0,
"reward": 3.4366226196289062,
"reward_std": 14.906189918518066,
"rewards/rollout_reward_func/mean": 3.436622381210327,
"rewards/rollout_reward_func/std": 16.053083419799805,
"sampling/importance_sampling_ratio/max": 1.3064157962799072,
"sampling/importance_sampling_ratio/mean": 1.0054032802581787,
"sampling/importance_sampling_ratio/min": 0.5862367749214172,
"sampling/sampling_logp_difference/max": 0.5461184978485107,
"sampling/sampling_logp_difference/mean": 0.010052897036075592,
"step": 133,
"step_time": 32.881761665999875
},
{
"clip_ratio/high_max": 0.06369047937914729,
"clip_ratio/high_mean": 0.022172620403580368,
"clip_ratio/low_mean": 0.03557477821595967,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.05774739931803197,
"entropy": 0.18857589829713106,
"epoch": 0.00268,
"grad_norm": 0.2586621344089508,
"kl": 0.8268643505871296,
"learning_rate": 9.999982214876515e-05,
"loss": -0.0078,
"step": 134,
"step_time": 7.700307692000479
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1035.0,
"completions/max_terminated_length": 1035.0,
"completions/mean_length": 949.171875,
"completions/mean_terminated_length": 949.171875,
"completions/min_length": 820.0,
"completions/min_terminated_length": 820.0,
"entropy": 0.21391641069203615,
"epoch": 0.0027,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5073988437652588,
"kl": 0.669338870793581,
"learning_rate": 9.999981850063262e-05,
"loss": -0.0078,
"num_tokens": 6728116.0,
"reward": 5.17537784576416,
"reward_std": 13.093953132629395,
"rewards/rollout_reward_func/mean": 5.175378322601318,
"rewards/rollout_reward_func/std": 13.309264183044434,
"sampling/importance_sampling_ratio/max": 1.3000229597091675,
"sampling/importance_sampling_ratio/mean": 0.9869031310081482,
"sampling/importance_sampling_ratio/min": 0.7261144518852234,
"sampling/sampling_logp_difference/max": 0.1514453887939453,
"sampling/sampling_logp_difference/mean": 0.008218428120017052,
"step": 135,
"step_time": 32.26767973099959
},
{
"clip_ratio/high_max": 0.06815476482734084,
"clip_ratio/high_mean": 0.022321430151350796,
"clip_ratio/low_mean": 0.04136904957704246,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.06369047961197793,
"entropy": 0.21292453352361917,
"epoch": 0.00272,
"grad_norm": 0.3758169710636139,
"kl": 0.6697604712098837,
"learning_rate": 9.99998148154633e-05,
"loss": -0.0147,
"step": 136,
"step_time": 8.984081079998305
},
{
"clip_ratio/high_max": 0.004166666883975267,
"clip_ratio/high_mean": 0.0010416667209938169,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0010416667209938169,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1033.0,
"completions/max_terminated_length": 1033.0,
"completions/mean_length": 941.34375,
"completions/mean_terminated_length": 941.34375,
"completions/min_length": 618.0,
"completions/min_terminated_length": 618.0,
"entropy": 0.23286819364875555,
"epoch": 0.00274,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6691973805427551,
"kl": 0.6865591164678335,
"learning_rate": 9.999981109325724e-05,
"loss": 0.0217,
"num_tokens": 6839571.0,
"reward": 4.762706756591797,
"reward_std": 11.42410659790039,
"rewards/rollout_reward_func/mean": 4.762706756591797,
"rewards/rollout_reward_func/std": 11.434100151062012,
"sampling/importance_sampling_ratio/max": 1.5375083684921265,
"sampling/importance_sampling_ratio/mean": 1.0140312910079956,
"sampling/importance_sampling_ratio/min": 0.691352128982544,
"sampling/sampling_logp_difference/max": 0.24680709838867188,
"sampling/sampling_logp_difference/mean": 0.010121582075953484,
"step": 137,
"step_time": 31.738008715999968
},
{
"clip_ratio/high_max": 0.07113095559179783,
"clip_ratio/high_mean": 0.022098215762525797,
"clip_ratio/low_mean": 0.0486922818236053,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.07079049723688513,
"entropy": 0.2127716289833188,
"epoch": 0.00276,
"grad_norm": 0.30709579586982727,
"kl": 0.6930392682552338,
"learning_rate": 9.999980733401442e-05,
"loss": 0.0087,
"step": 138,
"step_time": 8.016800426000827
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1036.0,
"completions/max_terminated_length": 1036.0,
"completions/mean_length": 978.6875,
"completions/mean_terminated_length": 978.6875,
"completions/min_length": 407.0,
"completions/min_terminated_length": 407.0,
"entropy": 0.19525799248367548,
"epoch": 0.00278,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7576553225517273,
"kl": 0.7210239768028259,
"learning_rate": 9.999980353773486e-05,
"loss": 0.0087,
"num_tokens": 6953628.0,
"reward": 7.9931640625,
"reward_std": 14.572214126586914,
"rewards/rollout_reward_func/mean": 7.993164539337158,
"rewards/rollout_reward_func/std": 15.543896675109863,
"sampling/importance_sampling_ratio/max": 1.4368523359298706,
"sampling/importance_sampling_ratio/mean": 1.020465612411499,
"sampling/importance_sampling_ratio/min": 0.6616964340209961,
"sampling/sampling_logp_difference/max": 0.3545997142791748,
"sampling/sampling_logp_difference/mean": 0.009700989350676537,
"step": 139,
"step_time": 31.6874715999993
},
{
"clip_ratio/high_max": 0.07712912419810891,
"clip_ratio/high_mean": 0.02553228137549013,
"clip_ratio/low_mean": 0.053521828493103385,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.07905411045067012,
"entropy": 0.1893756091594696,
"epoch": 0.0028,
"grad_norm": 0.31711098551750183,
"kl": 0.786970479413867,
"learning_rate": 9.999979970441856e-05,
"loss": -0.0032,
"step": 140,
"step_time": 8.092264081999474
},
{
"clip_ratio/high_max": 0.008333333767950535,
"clip_ratio/high_mean": 0.0020833334419876337,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0020833334419876337,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1037.0,
"completions/max_terminated_length": 1037.0,
"completions/mean_length": 955.375,
"completions/mean_terminated_length": 955.375,
"completions/min_length": 198.0,
"completions/min_terminated_length": 198.0,
"entropy": 0.17851338349282742,
"epoch": 0.00282,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5795699954032898,
"kl": 0.7661695275455713,
"learning_rate": 9.999979583406551e-05,
"loss": -0.0028,
"num_tokens": 7066060.0,
"reward": 5.970464706420898,
"reward_std": 14.057101249694824,
"rewards/rollout_reward_func/mean": 5.970464706420898,
"rewards/rollout_reward_func/std": 15.589529991149902,
"sampling/importance_sampling_ratio/max": 1.2306140661239624,
"sampling/importance_sampling_ratio/mean": 1.0006752014160156,
"sampling/importance_sampling_ratio/min": 0.7063568830490112,
"sampling/sampling_logp_difference/max": 0.24321842193603516,
"sampling/sampling_logp_difference/mean": 0.008507179096341133,
"step": 141,
"step_time": 31.519457149999653
},
{
"clip_ratio/high_max": 0.10007440904155374,
"clip_ratio/high_mean": 0.03335193661041558,
"clip_ratio/low_mean": 0.04136905015911907,
"clip_ratio/low_min": 0.004166666883975267,
"clip_ratio/region_mean": 0.07472098711878061,
"entropy": 0.16169621469452977,
"epoch": 0.00284,
"grad_norm": 0.21583755314350128,
"kl": 0.8030649330466986,
"learning_rate": 9.999979192667573e-05,
"loss": -0.0127,
"step": 142,
"step_time": 8.37791394099986
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1048.0,
"completions/max_terminated_length": 1048.0,
"completions/mean_length": 965.875,
"completions/mean_terminated_length": 965.875,
"completions/min_length": 397.0,
"completions/min_terminated_length": 397.0,
"entropy": 0.1351936119608581,
"epoch": 0.00286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6277215480804443,
"kl": 0.6056302916258574,
"learning_rate": 9.999978798224921e-05,
"loss": -0.0037,
"num_tokens": 7179154.0,
"reward": 7.006319046020508,
"reward_std": 16.71393394470215,
"rewards/rollout_reward_func/mean": 7.006319522857666,
"rewards/rollout_reward_func/std": 17.009944915771484,
"sampling/importance_sampling_ratio/max": 1.4720120429992676,
"sampling/importance_sampling_ratio/mean": 1.0313916206359863,
"sampling/importance_sampling_ratio/min": 0.8535375595092773,
"sampling/sampling_logp_difference/max": 0.33231019973754883,
"sampling/sampling_logp_difference/mean": 0.007416378241032362,
"step": 143,
"step_time": 31.253298032000657
},
{
"clip_ratio/high_max": 0.03363095410168171,
"clip_ratio/high_mean": 0.010565476841293275,
"clip_ratio/low_mean": 0.022564054117538035,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03312953084241599,
"entropy": 0.12869372498244047,
"epoch": 0.00288,
"grad_norm": 0.3357137143611908,
"kl": 0.6441880892962217,
"learning_rate": 9.999978400078598e-05,
"loss": -0.011,
"step": 144,
"step_time": 8.612604698998894
},
{
"clip_ratio/high_max": 0.004166666883975267,
"clip_ratio/high_mean": 0.0010416667209938169,
"clip_ratio/low_mean": 0.0010416667209938169,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0020833334419876337,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1033.0,
"completions/max_terminated_length": 1033.0,
"completions/mean_length": 981.234375,
"completions/mean_terminated_length": 981.234375,
"completions/min_length": 922.0,
"completions/min_terminated_length": 922.0,
"entropy": 0.14574182452633977,
"epoch": 0.0029,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6256417632102966,
"kl": 0.680026089772582,
"learning_rate": 9.9999779982286e-05,
"loss": 0.0066,
"num_tokens": 7293276.0,
"reward": 9.345416069030762,
"reward_std": 12.761893272399902,
"rewards/rollout_reward_func/mean": 9.345417022705078,
"rewards/rollout_reward_func/std": 14.231216430664062,
"sampling/importance_sampling_ratio/max": 1.2866383790969849,
"sampling/importance_sampling_ratio/mean": 0.9946876764297485,
"sampling/importance_sampling_ratio/min": 0.7063043117523193,
"sampling/sampling_logp_difference/max": 0.3008323907852173,
"sampling/sampling_logp_difference/mean": 0.007616790477186441,
"step": 145,
"step_time": 31.996783762999257
},
{
"clip_ratio/high_max": 0.05424107378348708,
"clip_ratio/high_mean": 0.020851935259997845,
"clip_ratio/low_mean": 0.03377976384945214,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.05463169957511127,
"entropy": 0.14952234365046024,
"epoch": 0.00292,
"grad_norm": 0.3231852948665619,
"kl": 0.7531629204750061,
"learning_rate": 9.999977592674931e-05,
"loss": -0.0032,
"step": 146,
"step_time": 8.073437064001155
},
{
"clip_ratio/high_max": 0.012500000651925802,
"clip_ratio/high_mean": 0.0031250001629814506,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0031250001629814506,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1024.0,
"completions/mean_length": 951.1875,
"completions/mean_terminated_length": 951.1875,
"completions/min_length": 193.0,
"completions/min_terminated_length": 193.0,
"entropy": 0.14304543379694223,
"epoch": 0.00294,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4544009566307068,
"kl": 0.6561761032789946,
"learning_rate": 9.999977183417592e-05,
"loss": -0.0136,
"num_tokens": 7405394.0,
"reward": 9.592363357543945,
"reward_std": 11.82339859008789,
"rewards/rollout_reward_func/mean": 9.592363357543945,
"rewards/rollout_reward_func/std": 12.213863372802734,
"sampling/importance_sampling_ratio/max": 1.3994261026382446,
"sampling/importance_sampling_ratio/mean": 0.9877851009368896,
"sampling/importance_sampling_ratio/min": 0.5693183541297913,
"sampling/sampling_logp_difference/max": 0.5401673913002014,
"sampling/sampling_logp_difference/mean": 0.007635599002242088,
"step": 147,
"step_time": 31.870756492000055
},
{
"clip_ratio/high_max": 0.054166669491678476,
"clip_ratio/high_mean": 0.013541667372919619,
"clip_ratio/low_mean": 0.036681550089269876,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.05022321757860482,
"entropy": 0.14903255039826035,
"epoch": 0.00296,
"grad_norm": 0.34076768159866333,
"kl": 0.6760309524834156,
"learning_rate": 9.99997677045658e-05,
"loss": -0.0174,
"step": 148,
"step_time": 8.03263958799971
},
{
"clip_ratio/high_max": 0.008333333767950535,
"clip_ratio/high_mean": 0.0020833334419876337,
"clip_ratio/low_mean": 0.0022435898426920176,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004326923284679651,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1022.0,
"completions/max_terminated_length": 1022.0,
"completions/mean_length": 950.578125,
"completions/mean_terminated_length": 950.578125,
"completions/min_length": 673.0,
"completions/min_terminated_length": 673.0,
"entropy": 0.16968106850981712,
"epoch": 0.00298,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5638662576675415,
"kl": 0.6232388503849506,
"learning_rate": 9.999976353791898e-05,
"loss": -0.0115,
"num_tokens": 7517436.0,
"reward": 6.506036281585693,
"reward_std": 12.593399047851562,
"rewards/rollout_reward_func/mean": 6.506035804748535,
"rewards/rollout_reward_func/std": 13.552786827087402,
"sampling/importance_sampling_ratio/max": 1.6476225852966309,
"sampling/importance_sampling_ratio/mean": 0.9991188645362854,
"sampling/importance_sampling_ratio/min": 0.5213066935539246,
"sampling/sampling_logp_difference/max": 0.576519250869751,
"sampling/sampling_logp_difference/mean": 0.01059242058545351,
"step": 149,
"step_time": 30.528242389000752
},
{
"clip_ratio/high_max": 0.05000000260770321,
"clip_ratio/high_mean": 0.01458333432674408,
"clip_ratio/low_mean": 0.03889938397333026,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.05348271911498159,
"entropy": 0.17780038248747587,
"epoch": 0.003,
"grad_norm": 0.5385463833808899,
"kl": 0.8597960155457258,
"learning_rate": 9.999975933423545e-05,
"loss": -0.0172,
"step": 150,
"step_time": 8.0192518380004
},
{
"clip_ratio/high_max": 0.004166666883975267,
"clip_ratio/high_mean": 0.0010416667209938169,
"clip_ratio/low_mean": 0.002157738199457526,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003199404920451343,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1020.0,
"completions/max_terminated_length": 1020.0,
"completions/mean_length": 953.953125,
"completions/mean_terminated_length": 953.953125,
"completions/min_length": 664.0,
"completions/min_terminated_length": 664.0,
"entropy": 0.1825277367606759,
"epoch": 0.00302,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6920294165611267,
"kl": 0.6721424907445908,
"learning_rate": 9.999975509351522e-05,
"loss": -0.0165,
"num_tokens": 7629697.0,
"reward": 6.279596328735352,
"reward_std": 13.454200744628906,
"rewards/rollout_reward_func/mean": 6.279596328735352,
"rewards/rollout_reward_func/std": 15.490900039672852,
"sampling/importance_sampling_ratio/max": 1.2544176578521729,
"sampling/importance_sampling_ratio/mean": 0.9968298673629761,
"sampling/importance_sampling_ratio/min": 0.5891286730766296,
"sampling/sampling_logp_difference/max": 0.36822509765625,
"sampling/sampling_logp_difference/mean": 0.009644769132137299,
"step": 151,
"step_time": 30.041253716999563
},
{
"clip_ratio/high_max": 0.06250000279396772,
"clip_ratio/high_mean": 0.02187500149011612,
"clip_ratio/low_mean": 0.027847783756442368,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0497227858286351,
"entropy": 0.19313342962414026,
"epoch": 0.00304,
"grad_norm": 0.3150973320007324,
"kl": 0.6543413959443569,
"learning_rate": 9.99997508157583e-05,
"loss": -0.0263,
"step": 152,
"step_time": 8.048088266000377
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 993.0,
"completions/max_terminated_length": 993.0,
"completions/mean_length": 933.640625,
"completions/mean_terminated_length": 933.640625,
"completions/min_length": 191.0,
"completions/min_terminated_length": 191.0,
"entropy": 0.1851256461814046,
"epoch": 0.00306,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6191554665565491,
"kl": 0.5646015591919422,
"learning_rate": 9.999974650096467e-05,
"loss": -0.0157,
"num_tokens": 7740640.0,
"reward": 7.951285362243652,
"reward_std": 13.322220802307129,
"rewards/rollout_reward_func/mean": 7.951285362243652,
"rewards/rollout_reward_func/std": 15.29836654663086,
"sampling/importance_sampling_ratio/max": 1.1902070045471191,
"sampling/importance_sampling_ratio/mean": 0.9911805987358093,
"sampling/importance_sampling_ratio/min": 0.6955353617668152,
"sampling/sampling_logp_difference/max": 0.37529921531677246,
"sampling/sampling_logp_difference/mean": 0.007848689332604408,
"step": 153,
"step_time": 30.541750664000574
},
{
"clip_ratio/high_max": 0.04301470750942826,
"clip_ratio/high_mean": 0.013878677156753838,
"clip_ratio/low_mean": 0.039536832249723375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.053415509522892535,
"entropy": 0.16637779865413904,
"epoch": 0.00308,
"grad_norm": 0.3494158089160919,
"kl": 0.6059492044150829,
"learning_rate": 9.999974214913437e-05,
"loss": -0.0231,
"step": 154,
"step_time": 8.139173758999277
},
{
"clip_ratio/high_max": 0.004166666883975267,
"clip_ratio/high_mean": 0.0010416667209938169,
"clip_ratio/low_mean": 0.0010416667209938169,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0020833334419876337,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1041.0,
"completions/max_terminated_length": 1041.0,
"completions/mean_length": 972.640625,
"completions/mean_terminated_length": 972.640625,
"completions/min_length": 935.0,
"completions/min_terminated_length": 935.0,
"entropy": 0.1503364727832377,
"epoch": 0.0031,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6759209036827087,
"kl": 0.6219805851578712,
"learning_rate": 9.999973776026739e-05,
"loss": 0.0152,
"num_tokens": 7854154.0,
"reward": 5.902735710144043,
"reward_std": 12.42209243774414,
"rewards/rollout_reward_func/mean": 5.902735710144043,
"rewards/rollout_reward_func/std": 12.867145538330078,
"sampling/importance_sampling_ratio/max": 1.4259474277496338,
"sampling/importance_sampling_ratio/mean": 1.0006431341171265,
"sampling/importance_sampling_ratio/min": 0.6987265348434448,
"sampling/sampling_logp_difference/max": 0.35797882080078125,
"sampling/sampling_logp_difference/mean": 0.008803295902907848,
"step": 155,
"step_time": 31.54653142600091
},
{
"clip_ratio/high_max": 0.054464288521558046,
"clip_ratio/high_mean": 0.018824405618943274,
"clip_ratio/low_mean": 0.0364583358168602,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.05528274178504944,
"entropy": 0.1241895561106503,
"epoch": 0.00312,
"grad_norm": 0.955508828163147,
"kl": 0.9998617265373468,
"learning_rate": 9.999973333436372e-05,
"loss": 0.017,
"step": 156,
"step_time": 7.910055370999544
},
{
"clip_ratio/high_max": 0.004166666883975267,
"clip_ratio/high_mean": 0.0010416667209938169,
"clip_ratio/low_mean": 0.0006127451197244227,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0016544118407182395,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1040.0,
"completions/max_terminated_length": 1040.0,
"completions/mean_length": 972.546875,
"completions/mean_terminated_length": 972.546875,
"completions/min_length": 305.0,
"completions/min_terminated_length": 305.0,
"entropy": 0.11084589222446084,
"epoch": 0.00314,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9052144885063171,
"kl": 0.9162529278546572,
"learning_rate": 9.999972887142338e-05,
"loss": 0.0236,
"num_tokens": 7967770.0,
"reward": 10.1655855178833,
"reward_std": 15.845230102539062,
"rewards/rollout_reward_func/mean": 10.1655855178833,
"rewards/rollout_reward_func/std": 17.717178344726562,
"sampling/importance_sampling_ratio/max": 1.5550763607025146,
"sampling/importance_sampling_ratio/mean": 1.0152667760849,
"sampling/importance_sampling_ratio/min": 0.6825421452522278,
"sampling/sampling_logp_difference/max": 0.38708627223968506,
"sampling/sampling_logp_difference/mean": 0.006948791444301605,
"step": 157,
"step_time": 30.977979516999312
},
{
"clip_ratio/high_max": 0.041964287869632244,
"clip_ratio/high_mean": 0.013616072130389512,
"clip_ratio/low_mean": 0.019929535686969757,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03354560805018991,
"entropy": 0.11029910668730736,
"epoch": 0.00316,
"grad_norm": 0.3586527705192566,
"kl": 0.996163547039032,
"learning_rate": 9.999972437144637e-05,
"loss": 0.018,
"step": 158,
"step_time": 8.73399685899949
},
{
"clip_ratio/high_max": 0.004166666883975267,
"clip_ratio/high_mean": 0.0010416667209938169,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0010416667209938169,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1026.0,
"completions/max_terminated_length": 1026.0,
"completions/mean_length": 954.4375,
"completions/mean_terminated_length": 954.4375,
"completions/min_length": 686.0,
"completions/min_terminated_length": 686.0,
"entropy": 0.14973071590065956,
"epoch": 0.00318,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7992783784866333,
"kl": 0.5131530929356813,
"learning_rate": 9.999971983443269e-05,
"loss": -0.0019,
"num_tokens": 8080082.0,
"reward": 5.8201141357421875,
"reward_std": 11.146739959716797,
"rewards/rollout_reward_func/mean": 5.8201141357421875,
"rewards/rollout_reward_func/std": 11.795808792114258,
"sampling/importance_sampling_ratio/max": 1.2158492803573608,
"sampling/importance_sampling_ratio/mean": 0.9923404455184937,
"sampling/importance_sampling_ratio/min": 0.623603343963623,
"sampling/sampling_logp_difference/max": 0.24274826049804688,
"sampling/sampling_logp_difference/mean": 0.007134515792131424,
"step": 159,
"step_time": 31.143712819999564
},
{
"clip_ratio/high_max": 0.06250000232830644,
"clip_ratio/high_mean": 0.017708334140479565,
"clip_ratio/low_mean": 0.028382036020047963,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.046090369927696884,
"entropy": 0.15424074092879891,
"epoch": 0.0032,
"grad_norm": 0.4114607274532318,
"kl": 0.5258241277188063,
"learning_rate": 9.999971526038235e-05,
"loss": -0.0105,
"step": 160,
"step_time": 7.376053459000104
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0020833334419876337,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0020833334419876337,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1066.0,
"completions/max_terminated_length": 1066.0,
"completions/mean_length": 964.390625,
"completions/mean_terminated_length": 964.390625,
"completions/min_length": 795.0,
"completions/min_terminated_length": 795.0,
"entropy": 0.14214739575982094,
"epoch": 0.00322,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6593955159187317,
"kl": 0.7137422636151314,
"learning_rate": 9.999971064929537e-05,
"loss": 0.0221,
"num_tokens": 8193063.0,
"reward": 7.681003093719482,
"reward_std": 11.441247940063477,
"rewards/rollout_reward_func/mean": 7.681002616882324,
"rewards/rollout_reward_func/std": 13.56708812713623,
"sampling/importance_sampling_ratio/max": 1.4164402484893799,
"sampling/importance_sampling_ratio/mean": 1.0107839107513428,
"sampling/importance_sampling_ratio/min": 0.6920035481452942,
"sampling/sampling_logp_difference/max": 0.3535594940185547,
"sampling/sampling_logp_difference/mean": 0.007559535559266806,
"step": 161,
"step_time": 32.16549203100021
},
{
"clip_ratio/high_max": 0.045833335258066654,
"clip_ratio/high_mean": 0.014657739084213972,
"clip_ratio/low_mean": 0.033670345321297646,
"clip_ratio/low_min": 0.004166666883975267,
"clip_ratio/region_mean": 0.04832808405626565,
"entropy": 0.1284659137018025,
"epoch": 0.00324,
"grad_norm": 0.44948309659957886,
"kl": 0.8788620755076408,
"learning_rate": 9.999970600117172e-05,
"loss": 0.0155,
"step": 162,
"step_time": 8.349364119001166
},
{
"clip_ratio/high_max": 0.012500000651925802,
"clip_ratio/high_mean": 0.0031250001629814506,
"clip_ratio/low_mean": 0.0010416667209938169,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004166666883975267,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1028.0,
"completions/max_terminated_length": 1028.0,
"completions/mean_length": 958.1875,
"completions/mean_terminated_length": 958.1875,
"completions/min_length": 688.0,
"completions/min_terminated_length": 688.0,
"entropy": 0.1297779600135982,
"epoch": 0.00326,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.45152774453163147,
"kl": 0.6032252982258797,
"learning_rate": 9.999970131601142e-05,
"loss": -0.007,
"num_tokens": 8305653.0,
"reward": 9.560303688049316,
"reward_std": 12.965145111083984,
"rewards/rollout_reward_func/mean": 9.560302734375,
"rewards/rollout_reward_func/std": 13.572053909301758,
"sampling/importance_sampling_ratio/max": 1.3970085382461548,
"sampling/importance_sampling_ratio/mean": 0.9942675828933716,
"sampling/importance_sampling_ratio/min": 0.5912600755691528,
"sampling/sampling_logp_difference/max": 0.43671131134033203,
"sampling/sampling_logp_difference/mean": 0.006968793459236622,
"step": 163,
"step_time": 29.62484441499919
},
{
"clip_ratio/high_max": 0.04534313944168389,
"clip_ratio/high_mean": 0.013419118302408606,
"clip_ratio/low_mean": 0.028385418467223644,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04180453671142459,
"entropy": 0.12798475893214345,
"epoch": 0.00328,
"grad_norm": 0.37086573243141174,
"kl": 0.5329502020031214,
"learning_rate": 9.99996965938145e-05,
"loss": -0.0114,
"step": 164,
"step_time": 9.19148286500058
},
{
"clip_ratio/high_max": 0.004166666883975267,
"clip_ratio/high_mean": 0.0010416667209938169,
"clip_ratio/low_mean": 0.0010416667209938169,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0020833334419876337,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1068.0,
"completions/max_terminated_length": 1068.0,
"completions/mean_length": 958.28125,
"completions/mean_terminated_length": 958.28125,
"completions/min_length": 714.0,
"completions/min_terminated_length": 714.0,
"entropy": 0.14121837774291635,
"epoch": 0.0033,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.445486456155777,
"kl": 0.6868670284748077,
"learning_rate": 9.999969183458092e-05,
"loss": 0.017,
"num_tokens": 8418180.0,
"reward": 6.036255836486816,
"reward_std": 14.006401062011719,
"rewards/rollout_reward_func/mean": 6.036255836486816,
"rewards/rollout_reward_func/std": 15.667006492614746,
"sampling/importance_sampling_ratio/max": 1.4084051847457886,
"sampling/importance_sampling_ratio/mean": 0.9844825267791748,
"sampling/importance_sampling_ratio/min": 0.6458684802055359,
"sampling/sampling_logp_difference/max": 0.35437726974487305,
"sampling/sampling_logp_difference/mean": 0.008984029293060303,
"step": 165,
"step_time": 30.86212910500126
},
{
"clip_ratio/high_max": 0.041964287869632244,
"clip_ratio/high_mean": 0.012574405525811017,
"clip_ratio/low_mean": 0.02604166802484542,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.038616073317825794,
"entropy": 0.13694474566727877,
"epoch": 0.00332,
"grad_norm": 0.2597510814666748,
"kl": 0.670884259045124,
"learning_rate": 9.999968703831071e-05,
"loss": 0.012,
"step": 166,
"step_time": 8.765868728999521
},
{
"clip_ratio/high_max": 0.008333333767950535,
"clip_ratio/high_mean": 0.0020833334419876337,
"clip_ratio/low_mean": 0.0020833334419876337,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004166666883975267,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1028.0,
"completions/max_terminated_length": 1028.0,
"completions/mean_length": 964.015625,
"completions/mean_terminated_length": 964.015625,
"completions/min_length": 773.0,
"completions/min_terminated_length": 773.0,
"entropy": 0.13714495720341802,
"epoch": 0.00334,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.742760181427002,
"kl": 0.5935596115887165,
"learning_rate": 9.999968220500386e-05,
"loss": 0.0264,
"num_tokens": 8531148.0,
"reward": 6.6519269943237305,
"reward_std": 14.873868942260742,
"rewards/rollout_reward_func/mean": 6.6519269943237305,
"rewards/rollout_reward_func/std": 15.216424942016602,
"sampling/importance_sampling_ratio/max": 1.4992643594741821,
"sampling/importance_sampling_ratio/mean": 1.0216107368469238,
"sampling/importance_sampling_ratio/min": 0.7036370635032654,
"sampling/sampling_logp_difference/max": 0.351947546005249,
"sampling/sampling_logp_difference/mean": 0.008944995701313019,
"step": 167,
"step_time": 30.057006109999747
},
{
"clip_ratio/high_max": 0.03750000195577741,
"clip_ratio/high_mean": 0.013541667489334941,
"clip_ratio/low_mean": 0.03437500225845724,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04791666998062283,
"entropy": 0.13065697345882654,
"epoch": 0.00336,
"grad_norm": 8.381538391113281,
"kl": 7.166379388421774,
"learning_rate": 9.999967733466041e-05,
"loss": 0.0808,
"step": 168,
"step_time": 8.213664751000124
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0010416667209938169,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0010416667209938169,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1016.0,
"completions/max_terminated_length": 1016.0,
"completions/mean_length": 943.078125,
"completions/mean_terminated_length": 943.078125,
"completions/min_length": 868.0,
"completions/min_terminated_length": 868.0,
"entropy": 0.13596792286261916,
"epoch": 0.00338,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6320606470108032,
"kl": 0.5089176166802645,
"learning_rate": 9.999967242728034e-05,
"loss": -0.0005,
"num_tokens": 8642652.0,
"reward": 9.83786392211914,
"reward_std": 12.724628448486328,
"rewards/rollout_reward_func/mean": 9.83786392211914,
"rewards/rollout_reward_func/std": 13.589927673339844,
"sampling/importance_sampling_ratio/max": 1.5156316757202148,
"sampling/importance_sampling_ratio/mean": 1.001371145248413,
"sampling/importance_sampling_ratio/min": 0.75341796875,
"sampling/sampling_logp_difference/max": 0.40897202491760254,
"sampling/sampling_logp_difference/mean": 0.006749385967850685,
"step": 169,
"step_time": 30.052868118000788
},
{
"clip_ratio/high_max": 0.020833334419876337,
"clip_ratio/high_mean": 0.007291667046956718,
"clip_ratio/low_mean": 0.03333333553746343,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04062500281725079,
"entropy": 0.13328771898522973,
"epoch": 0.0034,
"grad_norm": 0.27786943316459656,
"kl": 0.5417735707014799,
"learning_rate": 9.999966748286363e-05,
"loss": -0.004,
"step": 170,
"step_time": 7.808134698000686
},
{
"clip_ratio/high_max": 0.004166666883975267,
"clip_ratio/high_mean": 0.0010416667209938169,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0010416667209938169,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1033.0,
"completions/max_terminated_length": 1033.0,
"completions/mean_length": 977.46875,
"completions/mean_terminated_length": 977.46875,
"completions/min_length": 890.0,
"completions/min_terminated_length": 890.0,
"entropy": 0.14305478753522038,
"epoch": 0.00342,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.477180153131485,
"kl": 0.9006227869540453,
"learning_rate": 9.999966250141033e-05,
"loss": -0.016,
"num_tokens": 8756508.0,
"reward": 9.534229278564453,
"reward_std": 10.647237777709961,
"rewards/rollout_reward_func/mean": 9.534229278564453,
"rewards/rollout_reward_func/std": 11.566615104675293,
"sampling/importance_sampling_ratio/max": 1.4990143775939941,
"sampling/importance_sampling_ratio/mean": 1.0070048570632935,
"sampling/importance_sampling_ratio/min": 0.6254692077636719,
"sampling/sampling_logp_difference/max": 0.4892125129699707,
"sampling/sampling_logp_difference/mean": 0.008062894456088543,
"step": 171,
"step_time": 29.967204156000207
},
{
"clip_ratio/high_max": 0.03333333507180214,
"clip_ratio/high_mean": 0.009375000605359674,
"clip_ratio/low_mean": 0.03333333553746343,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.042708336492069066,
"entropy": 0.13219841895624995,
"epoch": 0.00344,
"grad_norm": 0.2979583740234375,
"kl": 0.9737532902508974,
"learning_rate": 9.999965748292042e-05,
"loss": -0.0247,
"step": 172,
"step_time": 8.450734508001005
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0031250001629814506,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0031250001629814506,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1043.0,
"completions/max_terminated_length": 1043.0,
"completions/mean_length": 977.578125,
"completions/mean_terminated_length": 977.578125,
"completions/min_length": 911.0,
"completions/min_terminated_length": 911.0,
"entropy": 0.13215081067755818,
"epoch": 0.00346,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.570915699005127,
"kl": 0.7707110401242971,
"learning_rate": 9.999965242739393e-05,
"loss": 0.0115,
"num_tokens": 8870395.0,
"reward": 7.963113784790039,
"reward_std": 12.185734748840332,
"rewards/rollout_reward_func/mean": 7.963113784790039,
"rewards/rollout_reward_func/std": 12.419037818908691,
"sampling/importance_sampling_ratio/max": 1.2637660503387451,
"sampling/importance_sampling_ratio/mean": 0.9871397614479065,
"sampling/importance_sampling_ratio/min": 0.6115806102752686,
"sampling/sampling_logp_difference/max": 0.3316690921783447,
"sampling/sampling_logp_difference/mean": 0.0069004204124212265,
"step": 173,
"step_time": 29.865270385998883
},
{
"clip_ratio/high_max": 0.05000000214204192,
"clip_ratio/high_mean": 0.013541667256504297,
"clip_ratio/low_mean": 0.025976563920266926,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03951823094394058,
"entropy": 0.1266618687659502,
"epoch": 0.00348,
"grad_norm": 0.3126421570777893,
"kl": 0.7724483050405979,
"learning_rate": 9.999964733483083e-05,
"loss": 0.0074,
"step": 174,
"step_time": 8.14785716599863
},
{
"clip_ratio/high_max": 0.004166666883975267,
"clip_ratio/high_mean": 0.0010416667209938169,
"clip_ratio/low_mean": 0.0010416667209938169,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0020833334419876337,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1040.0,
"completions/max_terminated_length": 1040.0,
"completions/mean_length": 986.78125,
"completions/mean_terminated_length": 986.78125,
"completions/min_length": 814.0,
"completions/min_terminated_length": 814.0,
"entropy": 0.12785040121525526,
"epoch": 0.0035,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4320923089981079,
"kl": 0.5314337890595198,
"learning_rate": 9.999964220523112e-05,
"loss": 0.0134,
"num_tokens": 8984945.0,
"reward": 11.988597869873047,
"reward_std": 11.876688957214355,
"rewards/rollout_reward_func/mean": 11.988597869873047,
"rewards/rollout_reward_func/std": 12.529437065124512,
"sampling/importance_sampling_ratio/max": 1.5641355514526367,
"sampling/importance_sampling_ratio/mean": 1.0155951976776123,
"sampling/importance_sampling_ratio/min": 0.7307262420654297,
"sampling/sampling_logp_difference/max": 0.28014975786209106,
"sampling/sampling_logp_difference/mean": 0.006405924912542105,
"step": 175,
"step_time": 30.800439373998415
},
{
"clip_ratio/high_max": 0.025000001303851604,
"clip_ratio/high_mean": 0.008333333767950535,
"clip_ratio/low_mean": 0.015625001047737896,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.023958335164934397,
"entropy": 0.12503943219780922,
"epoch": 0.00352,
"grad_norm": 0.25414347648620605,
"kl": 0.545308168977499,
"learning_rate": 9.999963703859485e-05,
"loss": 0.0068,
"step": 176,
"step_time": 8.294947108000088
},
{
"clip_ratio/high_max": 0.012500000651925802,
"clip_ratio/high_mean": 0.0031250001629814506,
"clip_ratio/low_mean": 0.0010416667209938169,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004166666883975267,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1032.0,
"completions/max_terminated_length": 1032.0,
"completions/mean_length": 947.78125,
"completions/mean_terminated_length": 947.78125,
"completions/min_length": 877.0,
"completions/min_terminated_length": 877.0,
"entropy": 0.11795077985152602,
"epoch": 0.00354,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5428488850593567,
"kl": 0.5484364293515682,
"learning_rate": 9.9999631834922e-05,
"loss": 0.0209,
"num_tokens": 9096764.0,
"reward": 7.462541580200195,
"reward_std": 9.003820419311523,
"rewards/rollout_reward_func/mean": 7.462541103363037,
"rewards/rollout_reward_func/std": 9.709749221801758,
"sampling/importance_sampling_ratio/max": 1.6056361198425293,
"sampling/importance_sampling_ratio/mean": 1.0011367797851562,
"sampling/importance_sampling_ratio/min": 0.6226766109466553,
"sampling/sampling_logp_difference/max": 0.48480892181396484,
"sampling/sampling_logp_difference/mean": 0.007405002135783434,
"step": 177,
"step_time": 30.438013943000442
},
{
"clip_ratio/high_max": 0.025000001303851604,
"clip_ratio/high_mean": 0.006250000325962901,
"clip_ratio/low_mean": 0.021875001140870154,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.028125001583248377,
"entropy": 0.11180919618345797,
"epoch": 0.00356,
"grad_norm": 1.0773159265518188,
"kl": 0.7693799175322056,
"learning_rate": 9.999962659421255e-05,
"loss": 0.0218,
"step": 178,
"step_time": 8.289468396000302
},
{
"clip_ratio/high_max": 0.012500000651925802,
"clip_ratio/high_mean": 0.0031250001629814506,
"clip_ratio/low_mean": 0.0020833334419876337,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005208333604969084,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1017.0,
"completions/max_terminated_length": 1017.0,
"completions/mean_length": 962.765625,
"completions/mean_terminated_length": 962.765625,
"completions/min_length": 893.0,
"completions/min_terminated_length": 893.0,
"entropy": 0.12420041672885418,
"epoch": 0.00358,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5452980399131775,
"kl": 0.5841826293617487,
"learning_rate": 9.999962131646658e-05,
"loss": 0.0223,
"num_tokens": 9209601.0,
"reward": 9.949074745178223,
"reward_std": 11.123800277709961,
"rewards/rollout_reward_func/mean": 9.949074745178223,
"rewards/rollout_reward_func/std": 11.492538452148438,
"sampling/importance_sampling_ratio/max": 1.846232295036316,
"sampling/importance_sampling_ratio/mean": 1.0060797929763794,
"sampling/importance_sampling_ratio/min": 0.692804217338562,
"sampling/sampling_logp_difference/max": 0.6036995649337769,
"sampling/sampling_logp_difference/mean": 0.0071367728523910046,
"step": 179,
"step_time": 29.633916566999687
},
{
"clip_ratio/high_max": 0.03750000195577741,
"clip_ratio/high_mean": 0.014583334093913436,
"clip_ratio/low_mean": 0.018824405735358596,
"clip_ratio/low_min": 0.004166666883975267,
"clip_ratio/region_mean": 0.033407740062102675,
"entropy": 0.11627750238403678,
"epoch": 0.0036,
"grad_norm": 0.38062411546707153,
"kl": 0.639982882887125,
"learning_rate": 9.999961600168402e-05,
"loss": 0.0192,
"step": 180,
"step_time": 8.508149862998835
},
{
"clip_ratio/high_max": 0.012500000651925802,
"clip_ratio/high_mean": 0.0031250001629814506,
"clip_ratio/low_mean": 0.0010416667209938169,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004166666883975267,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1031.0,
"completions/max_terminated_length": 1031.0,
"completions/mean_length": 964.46875,
"completions/mean_terminated_length": 964.46875,
"completions/min_length": 816.0,
"completions/min_terminated_length": 816.0,
"entropy": 0.10024931281805038,
"epoch": 0.00362,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7843199968338013,
"kl": 0.5290133021771908,
"learning_rate": 9.999961064986489e-05,
"loss": -0.0105,
"num_tokens": 9322591.0,
"reward": 9.743326187133789,
"reward_std": 11.718559265136719,
"rewards/rollout_reward_func/mean": 9.743326187133789,
"rewards/rollout_reward_func/std": 11.767054557800293,
"sampling/importance_sampling_ratio/max": 1.2395166158676147,
"sampling/importance_sampling_ratio/mean": 0.9893835783004761,
"sampling/importance_sampling_ratio/min": 0.7077917456626892,
"sampling/sampling_logp_difference/max": 0.36174678802490234,
"sampling/sampling_logp_difference/mean": 0.0061057801358401775,
"step": 181,
"step_time": 30.010152957000173
},
{
"clip_ratio/high_max": 0.04583333572372794,
"clip_ratio/high_mean": 0.01458333432674408,
"clip_ratio/low_mean": 0.019791668048128486,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.034375002374872565,
"entropy": 0.0898241214454174,
"epoch": 0.00364,
"grad_norm": 0.898304283618927,
"kl": 1.3444663938134909,
"learning_rate": 9.999960526100922e-05,
"loss": -0.0074,
"step": 182,
"step_time": 8.117577253999116
},
{
"clip_ratio/high_max": 0.012500000651925802,
"clip_ratio/high_mean": 0.0031250001629814506,
"clip_ratio/low_mean": 0.0011160714784637094,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00424107164144516,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1037.0,
"completions/max_terminated_length": 1037.0,
"completions/mean_length": 965.296875,
"completions/mean_terminated_length": 965.296875,
"completions/min_length": 887.0,
"completions/min_terminated_length": 887.0,
"entropy": 0.12110280524939299,
"epoch": 0.00366,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.47400349378585815,
"kl": 0.513819495216012,
"learning_rate": 9.999959983511699e-05,
"loss": 0.0011,
"num_tokens": 9435640.0,
"reward": 11.970619201660156,
"reward_std": 16.7136287689209,
"rewards/rollout_reward_func/mean": 11.970619201660156,
"rewards/rollout_reward_func/std": 17.193565368652344,
"sampling/importance_sampling_ratio/max": 1.4852927923202515,
"sampling/importance_sampling_ratio/mean": 0.9956411123275757,
"sampling/importance_sampling_ratio/min": 0.58425372838974,
"sampling/sampling_logp_difference/max": 0.4939703941345215,
"sampling/sampling_logp_difference/mean": 0.007358514238148928,
"step": 183,
"step_time": 30.018645907000064
},
{
"clip_ratio/high_max": 0.03750000195577741,
"clip_ratio/high_mean": 0.01041666732635349,
"clip_ratio/low_mean": 0.01875000086147338,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.029166668420657516,
"entropy": 0.12478661234490573,
"epoch": 0.00368,
"grad_norm": 0.29323798418045044,
"kl": 0.46843259409070015,
"learning_rate": 9.999959437218822e-05,
"loss": -0.0073,
"step": 184,
"step_time": 8.045792180003446
},
{
"clip_ratio/high_max": 0.004166666883975267,
"clip_ratio/high_mean": 0.0010416667209938169,
"clip_ratio/low_mean": 0.0010416667209938169,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0020833334419876337,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1026.0,
"completions/max_terminated_length": 1026.0,
"completions/mean_length": 959.875,
"completions/mean_terminated_length": 959.875,
"completions/min_length": 676.0,
"completions/min_terminated_length": 676.0,
"entropy": 0.12372714094817638,
"epoch": 0.0037,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.49018675088882446,
"kl": 0.5567406937479973,
"learning_rate": 9.999958887222293e-05,
"loss": -0.0266,
"num_tokens": 9548327.0,
"reward": 8.300872802734375,
"reward_std": 11.473505020141602,
"rewards/rollout_reward_func/mean": 8.300872802734375,
"rewards/rollout_reward_func/std": 13.137120246887207,
"sampling/importance_sampling_ratio/max": 1.3434193134307861,
"sampling/importance_sampling_ratio/mean": 1.0231890678405762,
"sampling/importance_sampling_ratio/min": 0.8001201748847961,
"sampling/sampling_logp_difference/max": 0.24235105514526367,
"sampling/sampling_logp_difference/mean": 0.006944713182747364,
"step": 185,
"step_time": 30.03380806199948
},
{
"clip_ratio/high_max": 0.058333336375653744,
"clip_ratio/high_mean": 0.01770833437331021,
"clip_ratio/low_mean": 0.012500000651925802,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03020833490882069,
"entropy": 0.13152629090473056,
"epoch": 0.00372,
"grad_norm": 0.23521849513053894,
"kl": 0.5634740013629198,
"learning_rate": 9.999958333522109e-05,
"loss": -0.0341,
"step": 186,
"step_time": 8.600791754000966
},
{
"clip_ratio/high_max": 0.004166666883975267,
"clip_ratio/high_mean": 0.0010416667209938169,
"clip_ratio/low_mean": 0.0020833334419876337,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0031250001629814506,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1045.0,
"completions/max_terminated_length": 1045.0,
"completions/mean_length": 945.21875,
"completions/mean_terminated_length": 945.21875,
"completions/min_length": 289.0,
"completions/min_terminated_length": 289.0,
"entropy": 0.1315653999336064,
"epoch": 0.00374,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.36500951647758484,
"kl": 0.5178995914757252,
"learning_rate": 9.999957776118273e-05,
"loss": -0.0136,
"num_tokens": 9660136.0,
"reward": 7.931632041931152,
"reward_std": 11.40542984008789,
"rewards/rollout_reward_func/mean": 7.931632041931152,
"rewards/rollout_reward_func/std": 12.151664733886719,
"sampling/importance_sampling_ratio/max": 1.7536835670471191,
"sampling/importance_sampling_ratio/mean": 1.001771092414856,
"sampling/importance_sampling_ratio/min": 0.7216951251029968,
"sampling/sampling_logp_difference/max": 0.5699708461761475,
"sampling/sampling_logp_difference/mean": 0.0067958529107272625,
"step": 187,
"step_time": 29.347854906000975
},
{
"clip_ratio/high_max": 0.054166669491678476,
"clip_ratio/high_mean": 0.01770833448972553,
"clip_ratio/low_mean": 0.025694445823319256,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04340278054587543,
"entropy": 0.13414463540539145,
"epoch": 0.00376,
"grad_norm": 0.21745486557483673,
"kl": 0.5746774040162563,
"learning_rate": 9.999957215010784e-05,
"loss": -0.019,
"step": 188,
"step_time": 8.856123159000163
},
{
"clip_ratio/high_max": 0.004166666883975267,
"clip_ratio/high_mean": 0.0010416667209938169,
"clip_ratio/low_mean": 0.0010416667209938169,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0020833334419876337,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1026.0,
"completions/max_terminated_length": 1026.0,
"completions/mean_length": 947.34375,
"completions/mean_terminated_length": 947.34375,
"completions/min_length": 216.0,
"completions/min_terminated_length": 216.0,
"entropy": 0.14527452224865556,
"epoch": 0.00378,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4758737087249756,
"kl": 0.6551676895469427,
"learning_rate": 9.999956650199645e-05,
"loss": -0.0064,
"num_tokens": 9771998.0,
"reward": 8.513150215148926,
"reward_std": 14.811095237731934,
"rewards/rollout_reward_func/mean": 8.513150215148926,
"rewards/rollout_reward_func/std": 15.769759178161621,
"sampling/importance_sampling_ratio/max": 1.4140323400497437,
"sampling/importance_sampling_ratio/mean": 1.0076611042022705,
"sampling/importance_sampling_ratio/min": 0.5691302418708801,
"sampling/sampling_logp_difference/max": 0.7131770253181458,
"sampling/sampling_logp_difference/mean": 0.009376442059874535,
"step": 189,
"step_time": 30.213357230003567
},
{
"clip_ratio/high_max": 0.054166669491678476,
"clip_ratio/high_mean": 0.014583334210328758,
"clip_ratio/low_mean": 0.0281250016996637,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.042708336375653744,
"entropy": 0.13438974926248193,
"epoch": 0.0038,
"grad_norm": 0.2324807345867157,
"kl": 0.737682543694973,
"learning_rate": 9.999956081684854e-05,
"loss": -0.0149,
"step": 190,
"step_time": 7.734431613998822
},
{
"clip_ratio/high_max": 0.004166666883975267,
"clip_ratio/high_mean": 0.0010416667209938169,
"clip_ratio/low_mean": 0.0010416667209938169,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0020833334419876337,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1019.0,
"completions/max_terminated_length": 1019.0,
"completions/mean_length": 962.203125,
"completions/mean_terminated_length": 962.203125,
"completions/min_length": 881.0,
"completions/min_terminated_length": 881.0,
"entropy": 0.1253855088725686,
"epoch": 0.00382,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.41439345479011536,
"kl": 0.709712341427803,
"learning_rate": 9.999955509466414e-05,
"loss": 0.0269,
"num_tokens": 9884808.0,
"reward": 9.057092666625977,
"reward_std": 9.098945617675781,
"rewards/rollout_reward_func/mean": 9.05709171295166,
"rewards/rollout_reward_func/std": 10.38012981414795,
"sampling/importance_sampling_ratio/max": 1.3585758209228516,
"sampling/importance_sampling_ratio/mean": 0.989570677280426,
"sampling/importance_sampling_ratio/min": 0.6827925443649292,
"sampling/sampling_logp_difference/max": 0.40184950828552246,
"sampling/sampling_logp_difference/mean": 0.00655590184032917,
"step": 191,
"step_time": 31.590866651999022
},
{
"clip_ratio/high_max": 0.03392857313156128,
"clip_ratio/high_mean": 0.010565476841293275,
"clip_ratio/low_mean": 0.0293154779355973,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0398809548933059,
"entropy": 0.11225170968100429,
"epoch": 0.00384,
"grad_norm": 0.23349761962890625,
"kl": 0.8278532009571791,
"learning_rate": 9.999954933544323e-05,
"loss": 0.0201,
"step": 192,
"step_time": 7.970918687003177
},
{
"clip_ratio/high_max": 0.004166666883975267,
"clip_ratio/high_mean": 0.0010416667209938169,
"clip_ratio/low_mean": 0.0020833334419876337,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0031250001629814506,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1031.0,
"completions/max_terminated_length": 1031.0,
"completions/mean_length": 977.734375,
"completions/mean_terminated_length": 977.734375,
"completions/min_length": 896.0,
"completions/min_terminated_length": 896.0,
"entropy": 0.11717891087755561,
"epoch": 0.00386,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4010028839111328,
"kl": 0.6346222888678312,
"learning_rate": 9.999954353918583e-05,
"loss": 0.0125,
"num_tokens": 9998710.0,
"reward": 12.752401351928711,
"reward_std": 15.009429931640625,
"rewards/rollout_reward_func/mean": 12.752399444580078,
"rewards/rollout_reward_func/std": 15.288240432739258,
"sampling/importance_sampling_ratio/max": 1.3140867948532104,
"sampling/importance_sampling_ratio/mean": 0.9636229276657104,
"sampling/importance_sampling_ratio/min": 0.5537927746772766,
"sampling/sampling_logp_difference/max": 0.36048221588134766,
"sampling/sampling_logp_difference/mean": 0.007171455770730972,
"step": 193,
"step_time": 30.459012025998163
},
{
"clip_ratio/high_max": 0.029166667722165585,
"clip_ratio/high_mean": 0.007291666930541396,
"clip_ratio/low_mean": 0.03020833502523601,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03750000218860805,
"entropy": 0.11490702140145004,
"epoch": 0.00388,
"grad_norm": 0.23535722494125366,
"kl": 0.6073946505784988,
"learning_rate": 9.999953770589194e-05,
"loss": 0.006,
"step": 194,
"step_time": 8.631377130000146
},
{
"clip_ratio/high_max": 0.008333333767950535,
"clip_ratio/high_mean": 0.0020833334419876337,
"clip_ratio/low_mean": 0.0010416667209938169,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0031250001629814506,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1033.0,
"completions/max_terminated_length": 1033.0,
"completions/mean_length": 970.203125,
"completions/mean_terminated_length": 970.203125,
"completions/min_length": 898.0,
"completions/min_terminated_length": 898.0,
"entropy": 0.11113500501960516,
"epoch": 0.0039,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.604935348033905,
"kl": 0.6790309809148312,
"learning_rate": 9.999953183556157e-05,
"loss": 0.0026,
"num_tokens": 10112081.0,
"reward": 7.972203731536865,
"reward_std": 13.011554718017578,
"rewards/rollout_reward_func/mean": 7.972204208374023,
"rewards/rollout_reward_func/std": 13.773921966552734,
"sampling/importance_sampling_ratio/max": 1.3542617559432983,
"sampling/importance_sampling_ratio/mean": 0.9855128526687622,
"sampling/importance_sampling_ratio/min": 0.597061276435852,
"sampling/sampling_logp_difference/max": 0.4635782241821289,
"sampling/sampling_logp_difference/mean": 0.006834958214312792,
"step": 195,
"step_time": 30.052685054003632
},
{
"clip_ratio/high_max": 0.029166668187826872,
"clip_ratio/high_mean": 0.007291667046956718,
"clip_ratio/low_mean": 0.015625000814907253,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.022916667978279293,
"entropy": 0.11238743201829493,
"epoch": 0.00392,
"grad_norm": 0.4268299341201782,
"kl": 0.700402544811368,
"learning_rate": 9.999952592819473e-05,
"loss": -0.0015,
"step": 196,
"step_time": 8.260044886000287
},
{
"clip_ratio/high_max": 0.008333333767950535,
"clip_ratio/high_mean": 0.0020833334419876337,
"clip_ratio/low_mean": 0.0032738096779212356,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005357143119908869,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1043.0,
"completions/max_terminated_length": 1043.0,
"completions/mean_length": 934.5,
"completions/mean_terminated_length": 934.5,
"completions/min_length": 878.0,
"completions/min_terminated_length": 878.0,
"entropy": 0.1125073074363172,
"epoch": 0.00394,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6487244963645935,
"kl": 0.6208249572664499,
"learning_rate": 9.99995199837914e-05,
"loss": 0.0046,
"num_tokens": 10223022.0,
"reward": 8.661399841308594,
"reward_std": 15.73376178741455,
"rewards/rollout_reward_func/mean": 8.661399841308594,
"rewards/rollout_reward_func/std": 15.457544326782227,
"sampling/importance_sampling_ratio/max": 1.324127435684204,
"sampling/importance_sampling_ratio/mean": 1.0008368492126465,
"sampling/importance_sampling_ratio/min": 0.6733382344245911,
"sampling/sampling_logp_difference/max": 0.35140299797058105,
"sampling/sampling_logp_difference/mean": 0.007979365065693855,
"step": 197,
"step_time": 31.166683005998493
},
{
"clip_ratio/high_max": 0.021130953449755907,
"clip_ratio/high_mean": 0.00840773864183575,
"clip_ratio/low_mean": 0.02730654936749488,
"clip_ratio/low_min": 0.004166666883975267,
"clip_ratio/region_mean": 0.035714288242161274,
"entropy": 0.11146878870204091,
"epoch": 0.00396,
"grad_norm": 0.5962705016136169,
"kl": 0.9501709761098027,
"learning_rate": 9.999951400235163e-05,
"loss": 0.004,
"step": 198,
"step_time": 8.287430281997331
},
{
"clip_ratio/high_max": 0.012797619681805372,
"clip_ratio/high_mean": 0.003199404920451343,
"clip_ratio/low_mean": 0.0020833334419876337,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005282738362438977,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1047.0,
"completions/max_terminated_length": 1047.0,
"completions/mean_length": 980.65625,
"completions/mean_terminated_length": 980.65625,
"completions/min_length": 902.0,
"completions/min_terminated_length": 902.0,
"entropy": 0.11779335234314203,
"epoch": 0.00398,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.523526668548584,
"kl": 0.5669353120028973,
"learning_rate": 9.999950798387541e-05,
"loss": 0.0049,
"num_tokens": 10337112.0,
"reward": 10.420181274414062,
"reward_std": 16.354602813720703,
"rewards/rollout_reward_func/mean": 10.420181274414062,
"rewards/rollout_reward_func/std": 17.055269241333008,
"sampling/importance_sampling_ratio/max": 1.23856782913208,
"sampling/importance_sampling_ratio/mean": 0.9714287519454956,
"sampling/importance_sampling_ratio/min": 0.7061982750892639,
"sampling/sampling_logp_difference/max": 0.447023868560791,
"sampling/sampling_logp_difference/mean": 0.00747651606798172,
"step": 199,
"step_time": 30.34761462899951
},
{
"clip_ratio/high_max": 0.029464287217706442,
"clip_ratio/high_mean": 0.010491072083823383,
"clip_ratio/low_mean": 0.02091703994665295,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.031408111681230366,
"entropy": 0.11395548144355416,
"epoch": 0.004,
"grad_norm": 0.3284382224082947,
"kl": 0.5632808655500412,
"learning_rate": 9.999950192836271e-05,
"loss": -0.001,
"step": 200,
"step_time": 8.547375084998748
},
{
"clip_ratio/high_max": 0.004166666883975267,
"clip_ratio/high_mean": 0.0010416667209938169,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0010416667209938169,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1034.0,
"completions/max_terminated_length": 1034.0,
"completions/mean_length": 971.3125,
"completions/mean_terminated_length": 971.3125,
"completions/min_length": 873.0,
"completions/min_terminated_length": 873.0,
"entropy": 0.1107462802901864,
"epoch": 0.00402,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.42466774582862854,
"kl": 0.504519259557128,
"learning_rate": 9.999949583581359e-05,
"loss": 0.0037,
"num_tokens": 10450565.0,
"reward": 12.199589729309082,
"reward_std": 12.77005672454834,
"rewards/rollout_reward_func/mean": 12.199588775634766,
"rewards/rollout_reward_func/std": 13.816198348999023,
"sampling/importance_sampling_ratio/max": 1.1825975179672241,
"sampling/importance_sampling_ratio/mean": 0.9908883571624756,
"sampling/importance_sampling_ratio/min": 0.6934873461723328,
"sampling/sampling_logp_difference/max": 0.3765444755554199,
"sampling/sampling_logp_difference/mean": 0.006183322053402662,
"step": 201,
"step_time": 30.161383785001817
},
{
"clip_ratio/high_max": 0.03750000195577741,
"clip_ratio/high_mean": 0.011458334047347307,
"clip_ratio/low_mean": 0.021875001257285476,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03333333553746343,
"entropy": 0.10510897357016802,
"epoch": 0.00404,
"grad_norm": 0.21419784426689148,
"kl": 0.5648845955729485,
"learning_rate": 9.999948970622802e-05,
"loss": -0.0012,
"step": 202,
"step_time": 8.714965140998174
},
{
"clip_ratio/high_max": 0.004166666883975267,
"clip_ratio/high_mean": 0.0010416667209938169,
"clip_ratio/low_mean": 0.0010416667209938169,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0020833334419876337,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1046.0,
"completions/max_terminated_length": 1046.0,
"completions/mean_length": 979.578125,
"completions/mean_terminated_length": 979.578125,
"completions/min_length": 294.0,
"completions/min_terminated_length": 294.0,
"entropy": 0.12641333835199475,
"epoch": 0.00406,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6306821703910828,
"kl": 0.5104430429637432,
"learning_rate": 9.9999483539606e-05,
"loss": -0.0021,
"num_tokens": 10564630.0,
"reward": 10.778827667236328,
"reward_std": 13.483461380004883,
"rewards/rollout_reward_func/mean": 10.778827667236328,
"rewards/rollout_reward_func/std": 14.313225746154785,
"sampling/importance_sampling_ratio/max": 1.4068244695663452,
"sampling/importance_sampling_ratio/mean": 0.9891500473022461,
"sampling/importance_sampling_ratio/min": 0.6753217577934265,
"sampling/sampling_logp_difference/max": 0.3969893455505371,
"sampling/sampling_logp_difference/mean": 0.007549474947154522,
"step": 203,
"step_time": 29.916299866999907
},
{
"clip_ratio/high_max": 0.04583333572372794,
"clip_ratio/high_mean": 0.013541667489334941,
"clip_ratio/low_mean": 0.03132440650369972,
"clip_ratio/low_min": 0.004166666883975267,
"clip_ratio/region_mean": 0.04486607445869595,
"entropy": 0.12076347460970283,
"epoch": 0.00408,
"grad_norm": 0.29815390706062317,
"kl": 0.5736292470246553,
"learning_rate": 9.999947733594757e-05,
"loss": -0.0096,
"step": 204,
"step_time": 7.709945141001299
},
{
"clip_ratio/high_max": 0.004166666883975267,
"clip_ratio/high_mean": 0.0010416667209938169,
"clip_ratio/low_mean": 0.006250000325962901,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007291667046956718,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1016.0,
"completions/max_terminated_length": 1016.0,
"completions/mean_length": 948.3125,
"completions/mean_terminated_length": 948.3125,
"completions/min_length": 878.0,
"completions/min_terminated_length": 878.0,
"entropy": 0.10954847000539303,
"epoch": 0.0041,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8904768228530884,
"kl": 0.5102124018594623,
"learning_rate": 9.999947109525271e-05,
"loss": 0.0269,
"num_tokens": 10676487.0,
"reward": 7.509866237640381,
"reward_std": 12.055532455444336,
"rewards/rollout_reward_func/mean": 7.509865760803223,
"rewards/rollout_reward_func/std": 12.425904273986816,
"sampling/importance_sampling_ratio/max": 2.821709156036377,
"sampling/importance_sampling_ratio/mean": 1.0446405410766602,
"sampling/importance_sampling_ratio/min": 0.6838214993476868,
"sampling/sampling_logp_difference/max": 0.6221010684967041,
"sampling/sampling_logp_difference/mean": 0.007641012314707041,
"step": 205,
"step_time": 32.10779399100011
},
{
"clip_ratio/high_max": 0.029166668187826872,
"clip_ratio/high_mean": 0.008333333767950535,
"clip_ratio/low_mean": 0.0238932310603559,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.032226564711891115,
"entropy": 0.09176747733727098,
"epoch": 0.00412,
"grad_norm": 0.5064001083374023,
"kl": 0.6276722047477961,
"learning_rate": 9.999946481752144e-05,
"loss": 0.0257,
"step": 206,
"step_time": 8.04664100899663
},
{
"clip_ratio/high_max": 0.004166666883975267,
"clip_ratio/high_mean": 0.0010416667209938169,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0010416667209938169,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1032.0,
"completions/max_terminated_length": 1032.0,
"completions/mean_length": 948.5,
"completions/mean_terminated_length": 948.5,
"completions/min_length": 695.0,
"completions/min_terminated_length": 695.0,
"entropy": 0.0839753916952759,
"epoch": 0.00414,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6526506543159485,
"kl": 0.5436345608904958,
"learning_rate": 9.999945850275377e-05,
"loss": -0.0066,
"num_tokens": 10788398.0,
"reward": 4.734495639801025,
"reward_std": 13.251731872558594,
"rewards/rollout_reward_func/mean": 4.734495639801025,
"rewards/rollout_reward_func/std": 15.050627708435059,
"sampling/importance_sampling_ratio/max": 1.249489426612854,
"sampling/importance_sampling_ratio/mean": 1.0017802715301514,
"sampling/importance_sampling_ratio/min": 0.5872460603713989,
"sampling/sampling_logp_difference/max": 0.5519323348999023,
"sampling/sampling_logp_difference/mean": 0.007509762421250343,
"step": 207,
"step_time": 30.502280216000145
},
{
"clip_ratio/high_max": 0.041666668839752674,
"clip_ratio/high_mean": 0.010416667209938169,
"clip_ratio/low_mean": 0.020126489107497036,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.030543157132342458,
"entropy": 0.0718412920832634,
"epoch": 0.00416,
"grad_norm": 0.9516690969467163,
"kl": 1.0864872355014086,
"learning_rate": 9.999945215094969e-05,
"loss": -0.0086,
"step": 208,
"step_time": 8.340999965001174
},
{
"clip_ratio/high_max": 0.008333333767950535,
"clip_ratio/high_mean": 0.0020833334419876337,
"clip_ratio/low_mean": 0.0020833334419876337,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004166666883975267,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1014.0,
"completions/max_terminated_length": 1014.0,
"completions/mean_length": 948.453125,
"completions/mean_terminated_length": 948.453125,
"completions/min_length": 656.0,
"completions/min_terminated_length": 656.0,
"entropy": 0.0782642443664372,
"epoch": 0.00418,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9959932565689087,
"kl": 0.5945225208997726,
"learning_rate": 9.99994457621092e-05,
"loss": 0.0171,
"num_tokens": 10900280.0,
"reward": 9.700709342956543,
"reward_std": 13.213409423828125,
"rewards/rollout_reward_func/mean": 9.700709342956543,
"rewards/rollout_reward_func/std": 14.225313186645508,
"sampling/importance_sampling_ratio/max": 1.3207358121871948,
"sampling/importance_sampling_ratio/mean": 0.968299150466919,
"sampling/importance_sampling_ratio/min": 0.3971961438655853,
"sampling/sampling_logp_difference/max": 0.8888199329376221,
"sampling/sampling_logp_difference/mean": 0.00841559562832117,
"step": 209,
"step_time": 29.9366263410011
},
{
"clip_ratio/high_max": 0.025000001303851604,
"clip_ratio/high_mean": 0.006250000325962901,
"clip_ratio/low_mean": 0.02656250144354999,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03281250165309757,
"entropy": 0.07524953898973763,
"epoch": 0.0042,
"grad_norm": 0.22927281260490417,
"kl": 0.5741278808563948,
"learning_rate": 9.999943933623233e-05,
"loss": 0.0142,
"step": 210,
"step_time": 8.610201505000987
},
{
"clip_ratio/high_max": 0.012500000651925802,
"clip_ratio/high_mean": 0.0031250001629814506,
"clip_ratio/low_mean": 0.0011160714784637094,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00424107164144516,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1033.0,
"completions/max_terminated_length": 1033.0,
"completions/mean_length": 961.625,
"completions/mean_terminated_length": 961.625,
"completions/min_length": 183.0,
"completions/min_terminated_length": 183.0,
"entropy": 0.08438117569312453,
"epoch": 0.00422,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6056447625160217,
"kl": 0.4765475448220968,
"learning_rate": 9.999943287331907e-05,
"loss": -0.0396,
"num_tokens": 11013133.0,
"reward": 6.143215179443359,
"reward_std": 9.006479263305664,
"rewards/rollout_reward_func/mean": 6.143215179443359,
"rewards/rollout_reward_func/std": 10.255783081054688,
"sampling/importance_sampling_ratio/max": 1.5546733140945435,
"sampling/importance_sampling_ratio/mean": 0.9941245913505554,
"sampling/importance_sampling_ratio/min": 0.5497701168060303,
"sampling/sampling_logp_difference/max": 0.6002916693687439,
"sampling/sampling_logp_difference/mean": 0.0072316620498895645,
"step": 211,
"step_time": 29.858850818000974
},
{
"clip_ratio/high_max": 0.020833334419876337,
"clip_ratio/high_mean": 0.006250000325962901,
"clip_ratio/low_mean": 0.01889881060924381,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02514881093520671,
"entropy": 0.08111793245188892,
"epoch": 0.00424,
"grad_norm": 0.3952238857746124,
"kl": 0.5354121858254075,
"learning_rate": 9.999942637336943e-05,
"loss": -0.0419,
"step": 212,
"step_time": 8.145115041997997
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0031250001629814506,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0031250001629814506,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1046.0,
"completions/max_terminated_length": 1046.0,
"completions/mean_length": 974.1875,
"completions/mean_terminated_length": 974.1875,
"completions/min_length": 853.0,
"completions/min_terminated_length": 853.0,
"entropy": 0.08407697454094887,
"epoch": 0.00426,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5582248568534851,
"kl": 0.5609004180878401,
"learning_rate": 9.999941983638342e-05,
"loss": -0.0096,
"num_tokens": 11126805.0,
"reward": 7.366800308227539,
"reward_std": 11.575126647949219,
"rewards/rollout_reward_func/mean": 7.366800785064697,
"rewards/rollout_reward_func/std": 12.478679656982422,
"sampling/importance_sampling_ratio/max": 1.7624305486679077,
"sampling/importance_sampling_ratio/mean": 1.0073318481445312,
"sampling/importance_sampling_ratio/min": 0.5805040001869202,
"sampling/sampling_logp_difference/max": 0.5259637832641602,
"sampling/sampling_logp_difference/mean": 0.007181447930634022,
"step": 213,
"step_time": 30.68919447299777
},
{
"clip_ratio/high_max": 0.012500000651925802,
"clip_ratio/high_mean": 0.005208333721384406,
"clip_ratio/low_mean": 0.02083333453629166,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.026041668257676065,
"entropy": 0.08252408797852695,
"epoch": 0.00428,
"grad_norm": 0.4972332715988159,
"kl": 0.8270881623029709,
"learning_rate": 9.999941326236106e-05,
"loss": -0.0102,
"step": 214,
"step_time": 8.636868058998516
},
{
"clip_ratio/high_max": 0.004166666883975267,
"clip_ratio/high_mean": 0.0010416667209938169,
"clip_ratio/low_mean": 0.0010416667209938169,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0020833334419876337,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1021.0,
"completions/max_terminated_length": 1021.0,
"completions/mean_length": 947.234375,
"completions/mean_terminated_length": 947.234375,
"completions/min_length": 827.0,
"completions/min_terminated_length": 827.0,
"entropy": 0.09235736005939543,
"epoch": 0.0043,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7149195075035095,
"kl": 0.6700677536427975,
"learning_rate": 9.999940665130233e-05,
"loss": 0.0269,
"num_tokens": 11238594.0,
"reward": 8.236663818359375,
"reward_std": 12.342934608459473,
"rewards/rollout_reward_func/mean": 8.236662864685059,
"rewards/rollout_reward_func/std": 13.346291542053223,
"sampling/importance_sampling_ratio/max": 1.3215135335922241,
"sampling/importance_sampling_ratio/mean": 1.0117213726043701,
"sampling/importance_sampling_ratio/min": 0.607474684715271,
"sampling/sampling_logp_difference/max": 0.3575429916381836,
"sampling/sampling_logp_difference/mean": 0.00792029220610857,
"step": 215,
"step_time": 30.079993358000138
},
{
"clip_ratio/high_max": 0.03333333507180214,
"clip_ratio/high_mean": 0.009375000605359674,
"clip_ratio/low_mean": 0.03020833502523601,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.039583335630595684,
"entropy": 0.09003956150263548,
"epoch": 0.00432,
"grad_norm": 0.22825075685977936,
"kl": 0.7063372246921062,
"learning_rate": 9.999940000320725e-05,
"loss": 0.0204,
"step": 216,
"step_time": 8.830438550000508
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0010416667209938169,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0010416667209938169,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1038.0,
"completions/max_terminated_length": 1038.0,
"completions/mean_length": 973.421875,
"completions/mean_terminated_length": 973.421875,
"completions/min_length": 899.0,
"completions/min_terminated_length": 899.0,
"entropy": 0.08580271410755813,
"epoch": 0.00434,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.544740617275238,
"kl": 0.8732884284108877,
"learning_rate": 9.999939331807582e-05,
"loss": -0.0038,
"num_tokens": 11352163.0,
"reward": 6.939154148101807,
"reward_std": 12.035371780395508,
"rewards/rollout_reward_func/mean": 6.939153671264648,
"rewards/rollout_reward_func/std": 12.4366455078125,
"sampling/importance_sampling_ratio/max": 1.316995620727539,
"sampling/importance_sampling_ratio/mean": 1.0068674087524414,
"sampling/importance_sampling_ratio/min": 0.7823165059089661,
"sampling/sampling_logp_difference/max": 0.2636311650276184,
"sampling/sampling_logp_difference/mean": 0.0060178861021995544,
"step": 217,
"step_time": 30.360063980000632
},
{
"clip_ratio/high_max": 0.025000001303851604,
"clip_ratio/high_mean": 0.006250000325962901,
"clip_ratio/low_mean": 0.018750000977888703,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.025000001420266926,
"entropy": 0.08710528793744743,
"epoch": 0.00436,
"grad_norm": 0.38059887290000916,
"kl": 0.908846540376544,
"learning_rate": 9.999938659590807e-05,
"loss": -0.0104,
"step": 218,
"step_time": 7.607007630000226
},
{
"clip_ratio/high_max": 0.008333333767950535,
"clip_ratio/high_mean": 0.0031250001629814506,
"clip_ratio/low_mean": 0.0030598959419876337,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006184896221384406,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1034.0,
"completions/max_terminated_length": 1034.0,
"completions/mean_length": 963.703125,
"completions/mean_terminated_length": 963.703125,
"completions/min_length": 890.0,
"completions/min_terminated_length": 890.0,
"entropy": 0.09868060098960996,
"epoch": 0.00438,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.48504236340522766,
"kl": 0.5343823749572039,
"learning_rate": 9.999937983670398e-05,
"loss": 0.0194,
"num_tokens": 11465042.0,
"reward": 6.008334636688232,
"reward_std": 13.721019744873047,
"rewards/rollout_reward_func/mean": 6.008334636688232,
"rewards/rollout_reward_func/std": 14.5517578125,
"sampling/importance_sampling_ratio/max": 1.471420407295227,
"sampling/importance_sampling_ratio/mean": 0.9755445718765259,
"sampling/importance_sampling_ratio/min": 0.5715925097465515,
"sampling/sampling_logp_difference/max": 0.46605920791625977,
"sampling/sampling_logp_difference/mean": 0.008881919085979462,
"step": 219,
"step_time": 31.71598935200018
},
{
"clip_ratio/high_max": 0.03750000195577741,
"clip_ratio/high_mean": 0.010416667209938169,
"clip_ratio/low_mean": 0.022851563524454832,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.033268230734393,
"entropy": 0.09918246325105429,
"epoch": 0.0044,
"grad_norm": 0.22379587590694427,
"kl": 0.5816311649978161,
"learning_rate": 9.999937304046355e-05,
"loss": 0.0147,
"step": 220,
"step_time": 8.254640479001864
},
{
"clip_ratio/high_max": 0.012500000651925802,
"clip_ratio/high_mean": 0.0031250001629814506,
"clip_ratio/low_mean": 0.0020833334419876337,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005208333604969084,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1030.0,
"completions/max_terminated_length": 1030.0,
"completions/mean_length": 955.53125,
"completions/mean_terminated_length": 955.53125,
"completions/min_length": 894.0,
"completions/min_terminated_length": 894.0,
"entropy": 0.09134439891204238,
"epoch": 0.00442,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7109507322311401,
"kl": 0.6976822856813669,
"learning_rate": 9.999936620718681e-05,
"loss": 0.0063,
"num_tokens": 11577407.0,
"reward": 7.297264099121094,
"reward_std": 9.222230911254883,
"rewards/rollout_reward_func/mean": 7.297264575958252,
"rewards/rollout_reward_func/std": 10.19138240814209,
"sampling/importance_sampling_ratio/max": 1.4536468982696533,
"sampling/importance_sampling_ratio/mean": 0.9984610080718994,
"sampling/importance_sampling_ratio/min": 0.7001582384109497,
"sampling/sampling_logp_difference/max": 0.37113046646118164,
"sampling/sampling_logp_difference/mean": 0.006036648992449045,
"step": 221,
"step_time": 29.85640163500102
},
{
"clip_ratio/high_max": 0.03333333507180214,
"clip_ratio/high_mean": 0.009375000488944352,
"clip_ratio/low_mean": 0.014583334210328758,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02395833469927311,
"entropy": 0.09828592231497169,
"epoch": 0.00444,
"grad_norm": 1.0275782346725464,
"kl": 0.5333473347127438,
"learning_rate": 9.999935933687375e-05,
"loss": 0.0064,
"step": 222,
"step_time": 8.916792385998633
},
{
"clip_ratio/high_max": 0.008333333767950535,
"clip_ratio/high_mean": 0.0020833334419876337,
"clip_ratio/low_mean": 0.0020833334419876337,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004166666883975267,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1024.0,
"completions/mean_length": 955.5625,
"completions/mean_terminated_length": 955.5625,
"completions/min_length": 864.0,
"completions/min_terminated_length": 864.0,
"entropy": 0.11322583490982652,
"epoch": 0.00446,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6080738306045532,
"kl": 0.4430428724735975,
"learning_rate": 9.999935242952441e-05,
"loss": 0.0136,
"num_tokens": 11689757.0,
"reward": 7.010858535766602,
"reward_std": 12.169811248779297,
"rewards/rollout_reward_func/mean": 7.010858535766602,
"rewards/rollout_reward_func/std": 12.808332443237305,
"sampling/importance_sampling_ratio/max": 1.297809362411499,
"sampling/importance_sampling_ratio/mean": 0.9824950695037842,
"sampling/importance_sampling_ratio/min": 0.6718153953552246,
"sampling/sampling_logp_difference/max": 0.3088874816894531,
"sampling/sampling_logp_difference/mean": 0.007251087576150894,
"step": 223,
"step_time": 31.12233561499943
},
{
"clip_ratio/high_max": 0.05000000260770321,
"clip_ratio/high_mean": 0.01770833437331021,
"clip_ratio/low_mean": 0.02285156410653144,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04055989871267229,
"entropy": 0.11228394089266658,
"epoch": 0.00448,
"grad_norm": 0.5497627258300781,
"kl": 0.6058794800192118,
"learning_rate": 9.999934548513874e-05,
"loss": 0.0127,
"step": 224,
"step_time": 8.354907415001435
},
{
"clip_ratio/high_max": 0.004166666883975267,
"clip_ratio/high_mean": 0.0010416667209938169,
"clip_ratio/low_mean": 0.0031250001629814506,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004166666883975267,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1039.0,
"completions/max_terminated_length": 1039.0,
"completions/mean_length": 941.9375,
"completions/mean_terminated_length": 941.9375,
"completions/min_length": 811.0,
"completions/min_terminated_length": 811.0,
"entropy": 0.10548029001802206,
"epoch": 0.0045,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5894492864608765,
"kl": 0.5073134936392307,
"learning_rate": 9.999933850371681e-05,
"loss": 0.0086,
"num_tokens": 11801157.0,
"reward": 6.051244258880615,
"reward_std": 9.136930465698242,
"rewards/rollout_reward_func/mean": 6.051244258880615,
"rewards/rollout_reward_func/std": 9.732189178466797,
"sampling/importance_sampling_ratio/max": 1.4082491397857666,
"sampling/importance_sampling_ratio/mean": 0.9974700212478638,
"sampling/importance_sampling_ratio/min": 0.6021063923835754,
"sampling/sampling_logp_difference/max": 0.5747667551040649,
"sampling/sampling_logp_difference/mean": 0.0068025123327970505,
"step": 225,
"step_time": 31.567075909998493
},
{
"clip_ratio/high_max": 0.025000001303851604,
"clip_ratio/high_mean": 0.006250000325962901,
"clip_ratio/low_mean": 0.027083334745839238,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.033333335421048105,
"entropy": 0.10574616026133299,
"epoch": 0.00452,
"grad_norm": 0.2798631489276886,
"kl": 0.7715174313634634,
"learning_rate": 9.999933148525857e-05,
"loss": 0.007,
"step": 226,
"step_time": 7.815335610000147
},
{
"clip_ratio/high_max": 0.012500000651925802,
"clip_ratio/high_mean": 0.0031250001629814506,
"clip_ratio/low_mean": 0.0020833334419876337,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005208333604969084,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1017.0,
"completions/max_terminated_length": 1017.0,
"completions/mean_length": 959.5625,
"completions/mean_terminated_length": 959.5625,
"completions/min_length": 894.0,
"completions/min_terminated_length": 894.0,
"entropy": 0.11399172944948077,
"epoch": 0.00454,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.568816602230072,
"kl": 0.5976129788905382,
"learning_rate": 9.999932442976408e-05,
"loss": -0.0166,
"num_tokens": 11913755.0,
"reward": 8.459915161132812,
"reward_std": 15.611612319946289,
"rewards/rollout_reward_func/mean": 8.459915161132812,
"rewards/rollout_reward_func/std": 15.882699012756348,
"sampling/importance_sampling_ratio/max": 1.710551381111145,
"sampling/importance_sampling_ratio/mean": 1.017797827720642,
"sampling/importance_sampling_ratio/min": 0.7439659833908081,
"sampling/sampling_logp_difference/max": 0.36053359508514404,
"sampling/sampling_logp_difference/mean": 0.008504325523972511,
"step": 227,
"step_time": 31.015096133000043
},
{
"clip_ratio/high_max": 0.03750000195577741,
"clip_ratio/high_mean": 0.013541667489334941,
"clip_ratio/low_mean": 0.020833334769122303,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03437500272411853,
"entropy": 0.11824841611087322,
"epoch": 0.00456,
"grad_norm": 0.31711068749427795,
"kl": 0.6181838270276785,
"learning_rate": 9.999931733723329e-05,
"loss": -0.0224,
"step": 228,
"step_time": 8.539993245000005
},
{
"clip_ratio/high_max": 0.008333333767950535,
"clip_ratio/high_mean": 0.0020833334419876337,
"clip_ratio/low_mean": 0.0020833334419876337,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004166666883975267,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1029.0,
"completions/max_terminated_length": 1029.0,
"completions/mean_length": 949.875,
"completions/mean_terminated_length": 949.875,
"completions/min_length": 829.0,
"completions/min_terminated_length": 829.0,
"entropy": 0.11835443088784814,
"epoch": 0.00458,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.510497510433197,
"kl": 0.562442360445857,
"learning_rate": 9.999931020766625e-05,
"loss": -0.0151,
"num_tokens": 12025731.0,
"reward": 7.8150177001953125,
"reward_std": 10.93730640411377,
"rewards/rollout_reward_func/mean": 7.8150177001953125,
"rewards/rollout_reward_func/std": 12.047165870666504,
"sampling/importance_sampling_ratio/max": 1.8081343173980713,
"sampling/importance_sampling_ratio/mean": 1.0230156183242798,
"sampling/importance_sampling_ratio/min": 0.5872366428375244,
"sampling/sampling_logp_difference/max": 0.5174302458763123,
"sampling/sampling_logp_difference/mean": 0.008099589496850967,
"step": 229,
"step_time": 30.196818825002993
},
{
"clip_ratio/high_max": 0.029166668187826872,
"clip_ratio/high_mean": 0.007291667046956718,
"clip_ratio/low_mean": 0.028125001466833055,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03541666886303574,
"entropy": 0.12235437287017703,
"epoch": 0.0046,
"grad_norm": 0.6880154013633728,
"kl": 0.5698418729007244,
"learning_rate": 9.999930304106295e-05,
"loss": -0.0198,
"step": 230,
"step_time": 9.264137213997856
},
{
"clip_ratio/high_max": 0.01666666753590107,
"clip_ratio/high_mean": 0.004166666883975267,
"clip_ratio/low_mean": 0.0031250001629814506,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007291667046956718,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1022.0,
"completions/max_terminated_length": 1022.0,
"completions/mean_length": 953.078125,
"completions/mean_terminated_length": 953.078125,
"completions/min_length": 641.0,
"completions/min_terminated_length": 641.0,
"entropy": 0.10955408262088895,
"epoch": 0.00462,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4517523944377899,
"kl": 0.562993137165904,
"learning_rate": 9.99992958374234e-05,
"loss": 0.0172,
"num_tokens": 12137928.0,
"reward": 6.407680034637451,
"reward_std": 12.907535552978516,
"rewards/rollout_reward_func/mean": 6.407680034637451,
"rewards/rollout_reward_func/std": 14.238213539123535,
"sampling/importance_sampling_ratio/max": 1.3800395727157593,
"sampling/importance_sampling_ratio/mean": 0.989479124546051,
"sampling/importance_sampling_ratio/min": 0.5886368155479431,
"sampling/sampling_logp_difference/max": 0.4858388900756836,
"sampling/sampling_logp_difference/mean": 0.007202588953077793,
"step": 231,
"step_time": 30.985224181000376
},
{
"clip_ratio/high_max": 0.04583333572372794,
"clip_ratio/high_mean": 0.012500000768341124,
"clip_ratio/low_mean": 0.023177084513008595,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03567708551418036,
"entropy": 0.09367383690550923,
"epoch": 0.00464,
"grad_norm": 0.3370003402233124,
"kl": 0.5898754354566336,
"learning_rate": 9.99992885967476e-05,
"loss": 0.0141,
"step": 232,
"step_time": 7.848031210001864
},
{
"clip_ratio/high_max": 0.008333333767950535,
"clip_ratio/high_mean": 0.0020833334419876337,
"clip_ratio/low_mean": 0.0031250001629814506,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005208333604969084,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1034.0,
"completions/max_terminated_length": 1034.0,
"completions/mean_length": 956.953125,
"completions/mean_terminated_length": 956.953125,
"completions/min_length": 643.0,
"completions/min_terminated_length": 643.0,
"entropy": 0.09109799051657319,
"epoch": 0.00466,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9314411878585815,
"kl": 0.6043513156473637,
"learning_rate": 9.999928131903557e-05,
"loss": -0.0214,
"num_tokens": 12250409.0,
"reward": 4.949494361877441,
"reward_std": 13.414144515991211,
"rewards/rollout_reward_func/mean": 4.949494361877441,
"rewards/rollout_reward_func/std": 14.449867248535156,
"sampling/importance_sampling_ratio/max": 1.7543931007385254,
"sampling/importance_sampling_ratio/mean": 1.0081617832183838,
"sampling/importance_sampling_ratio/min": 0.7344788908958435,
"sampling/sampling_logp_difference/max": 0.40094685554504395,
"sampling/sampling_logp_difference/mean": 0.007154828868806362,
"step": 233,
"step_time": 31.97192203099803
},
{
"clip_ratio/high_max": 0.06250000279396772,
"clip_ratio/high_mean": 0.018750001094304025,
"clip_ratio/low_mean": 0.026041668141260743,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04479166946839541,
"entropy": 0.07302908715792,
"epoch": 0.00468,
"grad_norm": 0.6798368692398071,
"kl": 1.063211616128683,
"learning_rate": 9.999927400428733e-05,
"loss": -0.0247,
"step": 234,
"step_time": 8.365730943999552
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1030.0,
"completions/max_terminated_length": 1030.0,
"completions/mean_length": 939.296875,
"completions/mean_terminated_length": 939.296875,
"completions/min_length": 455.0,
"completions/min_terminated_length": 455.0,
"entropy": 0.08065455732867122,
"epoch": 0.0047,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6933859586715698,
"kl": 0.5536066945642233,
"learning_rate": 9.999926665250286e-05,
"loss": -0.0262,
"num_tokens": 12361673.0,
"reward": 6.229083061218262,
"reward_std": 13.382326126098633,
"rewards/rollout_reward_func/mean": 6.229083061218262,
"rewards/rollout_reward_func/std": 14.236706733703613,
"sampling/importance_sampling_ratio/max": 1.6296361684799194,
"sampling/importance_sampling_ratio/mean": 0.9904996752738953,
"sampling/importance_sampling_ratio/min": 0.554724395275116,
"sampling/sampling_logp_difference/max": 0.5841927528381348,
"sampling/sampling_logp_difference/mean": 0.007334005553275347,
"step": 235,
"step_time": 30.556930122000267
},
{
"clip_ratio/high_max": 0.041666668839752674,
"clip_ratio/high_mean": 0.012500000768341124,
"clip_ratio/low_mean": 0.023177084629423916,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03567708586342633,
"entropy": 0.07718153693713248,
"epoch": 0.00472,
"grad_norm": 0.373847097158432,
"kl": 0.7903371974825859,
"learning_rate": 9.999925926368217e-05,
"loss": -0.0281,
"step": 236,
"step_time": 8.405578448001506
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0010416667209938169,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0010416667209938169,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1029.0,
"completions/max_terminated_length": 1029.0,
"completions/mean_length": 969.71875,
"completions/mean_terminated_length": 969.71875,
"completions/min_length": 882.0,
"completions/min_terminated_length": 882.0,
"entropy": 0.09033584129065275,
"epoch": 0.00474,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.672709584236145,
"kl": 0.4807140491902828,
"learning_rate": 9.999925183782528e-05,
"loss": 0.0206,
"num_tokens": 12475023.0,
"reward": 7.968780517578125,
"reward_std": 14.767425537109375,
"rewards/rollout_reward_func/mean": 7.968780517578125,
"rewards/rollout_reward_func/std": 15.451577186584473,
"sampling/importance_sampling_ratio/max": 1.446393370628357,
"sampling/importance_sampling_ratio/mean": 1.0078678131103516,
"sampling/importance_sampling_ratio/min": 0.7536318898200989,
"sampling/sampling_logp_difference/max": 0.36260342597961426,
"sampling/sampling_logp_difference/mean": 0.006248952820897102,
"step": 237,
"step_time": 30.60275455199826
},
{
"clip_ratio/high_max": 0.054166669491678476,
"clip_ratio/high_mean": 0.01770833448972553,
"clip_ratio/low_mean": 0.010416667209938169,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.028125002165324986,
"entropy": 0.10236532241106033,
"epoch": 0.00476,
"grad_norm": 0.1718801110982895,
"kl": 0.45644159242510796,
"learning_rate": 9.999924437493219e-05,
"loss": 0.0137,
"step": 238,
"step_time": 8.174696675001542
},
{
"clip_ratio/high_max": 0.008333333767950535,
"clip_ratio/high_mean": 0.0020833334419876337,
"clip_ratio/low_mean": 0.0020833334419876337,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004166666883975267,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1038.0,
"completions/max_terminated_length": 1038.0,
"completions/mean_length": 986.125,
"completions/mean_terminated_length": 986.125,
"completions/min_length": 910.0,
"completions/min_terminated_length": 910.0,
"entropy": 0.11793840350583196,
"epoch": 0.00478,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.403202623128891,
"kl": 0.5491157062351704,
"learning_rate": 9.99992368750029e-05,
"loss": 0.0205,
"num_tokens": 12589470.0,
"reward": 7.7764387130737305,
"reward_std": 10.855308532714844,
"rewards/rollout_reward_func/mean": 7.776438236236572,
"rewards/rollout_reward_func/std": 11.845745086669922,
"sampling/importance_sampling_ratio/max": 1.4125927686691284,
"sampling/importance_sampling_ratio/mean": 1.0074553489685059,
"sampling/importance_sampling_ratio/min": 0.6787428855895996,
"sampling/sampling_logp_difference/max": 0.36117464303970337,
"sampling/sampling_logp_difference/mean": 0.007153394166380167,
"step": 239,
"step_time": 31.446214477003195
},
{
"clip_ratio/high_max": 0.06614583590999246,
"clip_ratio/high_mean": 0.02070312586147338,
"clip_ratio/low_mean": 0.018750000977888703,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03945312695577741,
"entropy": 0.12702699471265078,
"epoch": 0.0048,
"grad_norm": 0.28333526849746704,
"kl": 0.5486433319747448,
"learning_rate": 9.999922933803743e-05,
"loss": 0.0157,
"step": 240,
"step_time": 8.18167577299937
},
{
"clip_ratio/high_max": 0.012500000651925802,
"clip_ratio/high_mean": 0.004101562546566129,
"clip_ratio/low_mean": 0.0015997024602256715,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0057012650067918,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1028.0,
"completions/max_terminated_length": 1028.0,
"completions/mean_length": 947.078125,
"completions/mean_terminated_length": 947.078125,
"completions/min_length": 194.0,
"completions/min_terminated_length": 194.0,
"entropy": 0.15819989750161767,
"epoch": 0.00482,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5497854948043823,
"kl": 0.6653936766088009,
"learning_rate": 9.999922176403578e-05,
"loss": 0.0274,
"num_tokens": 12701387.0,
"reward": 4.367884635925293,
"reward_std": 14.958491325378418,
"rewards/rollout_reward_func/mean": 4.367884635925293,
"rewards/rollout_reward_func/std": 15.79384708404541,
"sampling/importance_sampling_ratio/max": 1.7386236190795898,
"sampling/importance_sampling_ratio/mean": 1.0037915706634521,
"sampling/importance_sampling_ratio/min": 1.5100153958014693e-17,
"sampling/sampling_logp_difference/max": 32.36700439453125,
"sampling/sampling_logp_difference/mean": 0.050702136009931564,
"step": 241,
"step_time": 30.312135679001585
},
{
"clip_ratio/high_max": 0.0713541698642075,
"clip_ratio/high_mean": 0.02304687607102096,
"clip_ratio/low_mean": 0.019182722782716155,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.042229598737321794,
"entropy": 0.1654459210112691,
"epoch": 0.00484,
"grad_norm": 0.2647717595100403,
"kl": 0.6706695519387722,
"learning_rate": 9.999921415299796e-05,
"loss": 0.0208,
"step": 242,
"step_time": 8.75198459999956
},
{
"clip_ratio/high_max": 0.008333333767950535,
"clip_ratio/high_mean": 0.0020833334419876337,
"clip_ratio/low_mean": 0.0010416667209938169,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0031250001629814506,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1034.0,
"completions/max_terminated_length": 1034.0,
"completions/mean_length": 975.015625,
"completions/mean_terminated_length": 975.015625,
"completions/min_length": 829.0,
"completions/min_terminated_length": 829.0,
"entropy": 0.1656077685765922,
"epoch": 0.00486,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5207899808883667,
"kl": 0.4836368393152952,
"learning_rate": 9.999920650492399e-05,
"loss": -0.0058,
"num_tokens": 12815104.0,
"reward": 8.474200248718262,
"reward_std": 13.949186325073242,
"rewards/rollout_reward_func/mean": 8.474200248718262,
"rewards/rollout_reward_func/std": 15.287591934204102,
"sampling/importance_sampling_ratio/max": 1.3931519985198975,
"sampling/importance_sampling_ratio/mean": 0.9963239431381226,
"sampling/importance_sampling_ratio/min": 7.17475301392767e-10,
"sampling/sampling_logp_difference/max": 14.08260726928711,
"sampling/sampling_logp_difference/mean": 0.0297236330807209,
"step": 243,
"step_time": 29.95098099400184
},
{
"clip_ratio/high_max": 0.04583333572372794,
"clip_ratio/high_mean": 0.01562500116415322,
"clip_ratio/low_mean": 0.019308037008158863,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.034933038405142725,
"entropy": 0.17500293161720037,
"epoch": 0.00488,
"grad_norm": 0.2083873599767685,
"kl": 0.4789597373455763,
"learning_rate": 9.999919881981386e-05,
"loss": -0.0127,
"step": 244,
"step_time": 9.603725530998418
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0005580357392318547,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0005580357392318547,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1032.0,
"completions/max_terminated_length": 1032.0,
"completions/mean_length": 960.71875,
"completions/mean_terminated_length": 960.71875,
"completions/min_length": 870.0,
"completions/min_terminated_length": 870.0,
"entropy": 0.1825911095365882,
"epoch": 0.0049,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5889225602149963,
"kl": 0.9096355475485325,
"learning_rate": 9.999919109766759e-05,
"loss": 0.0086,
"num_tokens": 12927807.0,
"reward": 4.357412338256836,
"reward_std": 10.907012939453125,
"rewards/rollout_reward_func/mean": 4.357412338256836,
"rewards/rollout_reward_func/std": 11.52568531036377,
"sampling/importance_sampling_ratio/max": 1.709380030632019,
"sampling/importance_sampling_ratio/mean": 1.0086195468902588,
"sampling/importance_sampling_ratio/min": 0.7435536980628967,
"sampling/sampling_logp_difference/max": 0.24706459045410156,
"sampling/sampling_logp_difference/mean": 0.007025801111012697,
"step": 245,
"step_time": 31.461640581997926
},
{
"clip_ratio/high_max": 0.04583333572372794,
"clip_ratio/high_mean": 0.014583334210328758,
"clip_ratio/low_mean": 0.02460007555782795,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.039183410233817995,
"entropy": 0.1923405658453703,
"epoch": 0.00492,
"grad_norm": 0.2593821585178375,
"kl": 0.6401933804154396,
"learning_rate": 9.999918333848517e-05,
"loss": -0.0009,
"step": 246,
"step_time": 7.859529026000018
},
{
"clip_ratio/high_max": 0.004464285913854837,
"clip_ratio/high_mean": 0.0011160714784637094,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0011160714784637094,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1032.0,
"completions/max_terminated_length": 1032.0,
"completions/mean_length": 962.265625,
"completions/mean_terminated_length": 962.265625,
"completions/min_length": 699.0,
"completions/min_terminated_length": 699.0,
"entropy": 0.21913561783730984,
"epoch": 0.00494,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5455278158187866,
"kl": 0.5414405167102814,
"learning_rate": 9.999917554226662e-05,
"loss": 0.0071,
"num_tokens": 13040672.0,
"reward": 8.484970092773438,
"reward_std": 13.802679061889648,
"rewards/rollout_reward_func/mean": 8.484970092773438,
"rewards/rollout_reward_func/std": 13.872236251831055,
"sampling/importance_sampling_ratio/max": 1.3358259201049805,
"sampling/importance_sampling_ratio/mean": 0.9915522336959839,
"sampling/importance_sampling_ratio/min": 0.004653692711144686,
"sampling/sampling_logp_difference/max": 4.521495819091797,
"sampling/sampling_logp_difference/mean": 0.016463816165924072,
"step": 247,
"step_time": 32.296578387999034
},
{
"clip_ratio/high_max": 0.07113095559179783,
"clip_ratio/high_mean": 0.025074406410567462,
"clip_ratio/low_mean": 0.03020833490882069,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.05528274131938815,
"entropy": 0.2346064280718565,
"epoch": 0.00496,
"grad_norm": 0.28457146883010864,
"kl": 0.5182771291583776,
"learning_rate": 9.999916770901196e-05,
"loss": 0.0003,
"step": 248,
"step_time": 8.322071146998496
},
{
"clip_ratio/high_max": 0.008333333767950535,
"clip_ratio/high_mean": 0.0020833334419876337,
"clip_ratio/low_mean": 0.0010416667209938169,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0031250001629814506,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1038.0,
"completions/max_terminated_length": 1038.0,
"completions/mean_length": 947.125,
"completions/mean_terminated_length": 947.125,
"completions/min_length": 887.0,
"completions/min_terminated_length": 887.0,
"entropy": 0.22256971709430218,
"epoch": 0.00498,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4911050498485565,
"kl": 0.5106718242168427,
"learning_rate": 9.999915983872117e-05,
"loss": 0.0185,
"num_tokens": 13152426.0,
"reward": 8.859310150146484,
"reward_std": 14.251840591430664,
"rewards/rollout_reward_func/mean": 8.859310150146484,
"rewards/rollout_reward_func/std": 15.995503425598145,
"sampling/importance_sampling_ratio/max": 1.1877168416976929,
"sampling/importance_sampling_ratio/mean": 1.0040578842163086,
"sampling/importance_sampling_ratio/min": 0.7722747921943665,
"sampling/sampling_logp_difference/max": 0.2686450481414795,
"sampling/sampling_logp_difference/mean": 0.008781258016824722,
"step": 249,
"step_time": 31.257320647998313
},
{
"clip_ratio/high_max": 0.058333335909992456,
"clip_ratio/high_mean": 0.019791667815297842,
"clip_ratio/low_mean": 0.02544642984867096,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04523809743113816,
"entropy": 0.22759789694100618,
"epoch": 0.005,
"grad_norm": 0.32039502263069153,
"kl": 0.5074543356895447,
"learning_rate": 9.999915193139428e-05,
"loss": 0.0067,
"step": 250,
"step_time": 8.727711221999925
},
{
"clip_ratio/high_max": 0.004166666883975267,
"clip_ratio/high_mean": 0.0010416667209938169,
"clip_ratio/low_mean": 0.0010416667209938169,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0020833334419876337,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1033.0,
"completions/max_terminated_length": 1033.0,
"completions/mean_length": 972.3125,
"completions/mean_terminated_length": 972.3125,
"completions/min_length": 885.0,
"completions/min_terminated_length": 885.0,
"entropy": 0.2637898661196232,
"epoch": 0.00502,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6115661263465881,
"kl": 0.5041398257017136,
"learning_rate": 9.999914398703127e-05,
"loss": 0.0222,
"num_tokens": 13265926.0,
"reward": 8.36276626586914,
"reward_std": 12.634754180908203,
"rewards/rollout_reward_func/mean": 8.36276626586914,
"rewards/rollout_reward_func/std": 13.79938793182373,
"sampling/importance_sampling_ratio/max": 1.3860008716583252,
"sampling/importance_sampling_ratio/mean": 0.9989358186721802,
"sampling/importance_sampling_ratio/min": 0.6789365410804749,
"sampling/sampling_logp_difference/max": 0.4403858184814453,
"sampling/sampling_logp_difference/mean": 0.011640775017440319,
"step": 251,
"step_time": 30.78323078500125
},
{
"clip_ratio/high_max": 0.08333333721384406,
"clip_ratio/high_mean": 0.026041668141260743,
"clip_ratio/low_mean": 0.024479167768731713,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.05052083625923842,
"entropy": 0.25762353744357824,
"epoch": 0.00504,
"grad_norm": 0.33068326115608215,
"kl": 0.5315965916961432,
"learning_rate": 9.99991360056322e-05,
"loss": 0.0108,
"step": 252,
"step_time": 8.417674423999415
},
{
"clip_ratio/high_max": 0.004166666883975267,
"clip_ratio/high_mean": 0.0010416667209938169,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0010416667209938169,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1039.0,
"completions/max_terminated_length": 1039.0,
"completions/mean_length": 962.484375,
"completions/mean_terminated_length": 962.484375,
"completions/min_length": 640.0,
"completions/min_terminated_length": 640.0,
"entropy": 0.2320685014128685,
"epoch": 0.00506,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.606693685054779,
"kl": 0.5243742056190968,
"learning_rate": 9.999912798719702e-05,
"loss": 0.0154,
"num_tokens": 13378838.0,
"reward": 5.471320152282715,
"reward_std": 16.305179595947266,
"rewards/rollout_reward_func/mean": 5.471320629119873,
"rewards/rollout_reward_func/std": 16.513338088989258,
"sampling/importance_sampling_ratio/max": 1.3763768672943115,
"sampling/importance_sampling_ratio/mean": 0.9975243806838989,
"sampling/importance_sampling_ratio/min": 0.706875205039978,
"sampling/sampling_logp_difference/max": 0.28901320695877075,
"sampling/sampling_logp_difference/mean": 0.009924216195940971,
"step": 253,
"step_time": 31.570553302000008
},
{
"clip_ratio/high_max": 0.058333336375653744,
"clip_ratio/high_mean": 0.018750001094304025,
"clip_ratio/low_mean": 0.026041667792014778,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.044791669119149446,
"entropy": 0.21940706949681044,
"epoch": 0.00508,
"grad_norm": 0.3425885736942291,
"kl": 0.6501965597271919,
"learning_rate": 9.999911993172577e-05,
"loss": 0.0077,
"step": 254,
"step_time": 8.264053588000024
},
{
"clip_ratio/high_max": 0.004166666883975267,
"clip_ratio/high_mean": 0.0010416667209938169,
"clip_ratio/low_mean": 0.0020833334419876337,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0031250001629814506,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1031.0,
"completions/max_terminated_length": 1031.0,
"completions/mean_length": 953.46875,
"completions/mean_terminated_length": 953.46875,
"completions/min_length": 900.0,
"completions/min_terminated_length": 900.0,
"entropy": 0.21140480507165194,
"epoch": 0.0051,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5503631234169006,
"kl": 0.5327232480049133,
"learning_rate": 9.999911183921846e-05,
"loss": 0.0038,
"num_tokens": 13491042.0,
"reward": 8.259774208068848,
"reward_std": 10.848722457885742,
"rewards/rollout_reward_func/mean": 8.259774208068848,
"rewards/rollout_reward_func/std": 11.306256294250488,
"sampling/importance_sampling_ratio/max": 1.4545994997024536,
"sampling/importance_sampling_ratio/mean": 0.9899890422821045,
"sampling/importance_sampling_ratio/min": 0.6251944303512573,
"sampling/sampling_logp_difference/max": 0.42076706886291504,
"sampling/sampling_logp_difference/mean": 0.01122802309691906,
"step": 255,
"step_time": 31.768314117000045
},
{
"clip_ratio/high_max": 0.05476190708577633,
"clip_ratio/high_mean": 0.022023811121471226,
"clip_ratio/low_mean": 0.022916668327525258,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04494047968182713,
"entropy": 0.21271847933530807,
"epoch": 0.00512,
"grad_norm": 1.165165662765503,
"kl": 0.5640581175684929,
"learning_rate": 9.999910370967507e-05,
"loss": -0.0008,
"step": 256,
"step_time": 8.655397303001337
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0008680555620230734,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0008680555620230734,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1343.0,
"completions/max_terminated_length": 1343.0,
"completions/mean_length": 1241.921875,
"completions/mean_terminated_length": 1241.921875,
"completions/min_length": 1084.0,
"completions/min_terminated_length": 1084.0,
"entropy": 0.2426544101908803,
"epoch": 0.00514,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8530539870262146,
"kl": 0.5651203468441963,
"learning_rate": 9.999909554309565e-05,
"loss": 0.0047,
"num_tokens": 13621717.0,
"reward": 4.329623699188232,
"reward_std": 14.394445419311523,
"rewards/rollout_reward_func/mean": 4.329623222351074,
"rewards/rollout_reward_func/std": 14.93822193145752,
"sampling/importance_sampling_ratio/max": 1.2321135997772217,
"sampling/importance_sampling_ratio/mean": 0.9581992626190186,
"sampling/importance_sampling_ratio/min": 0.2660026550292969,
"sampling/sampling_logp_difference/max": 1.2229857444763184,
"sampling/sampling_logp_difference/mean": 0.013221165165305138,
"step": 257,
"step_time": 37.82865672800108
},
{
"clip_ratio/high_max": 0.06597222317941487,
"clip_ratio/high_mean": 0.027732091082725674,
"clip_ratio/low_mean": 0.038194445020053536,
"clip_ratio/low_min": 0.0034722222480922937,
"clip_ratio/region_mean": 0.06592653610277921,
"entropy": 0.2270987592637539,
"epoch": 0.00516,
"grad_norm": 0.43574994802474976,
"kl": 0.6589642316102982,
"learning_rate": 9.999908733948017e-05,
"loss": -0.0093,
"step": 258,
"step_time": 10.512357729995529
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0008680555620230734,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0026041666860692203,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1366.0,
"completions/max_terminated_length": 1366.0,
"completions/mean_length": 1250.96875,
"completions/mean_terminated_length": 1250.96875,
"completions/min_length": 735.0,
"completions/min_terminated_length": 735.0,
"entropy": 0.213697855360806,
"epoch": 0.00518,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6505308747291565,
"kl": 0.5941296126693487,
"learning_rate": 9.999907909882866e-05,
"loss": -0.0204,
"num_tokens": 13753091.0,
"reward": 6.001728057861328,
"reward_std": 15.827871322631836,
"rewards/rollout_reward_func/mean": 6.001728057861328,
"rewards/rollout_reward_func/std": 16.02460479736328,
"sampling/importance_sampling_ratio/max": 1.3350346088409424,
"sampling/importance_sampling_ratio/mean": 0.9674654006958008,
"sampling/importance_sampling_ratio/min": 0.5470981001853943,
"sampling/sampling_logp_difference/max": 0.5512038469314575,
"sampling/sampling_logp_difference/mean": 0.011880462057888508,
"step": 259,
"step_time": 38.35317581399886
},
{
"clip_ratio/high_max": 0.05208333395421505,
"clip_ratio/high_mean": 0.015625000174622983,
"clip_ratio/low_mean": 0.04037990275537595,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.056004903570283204,
"entropy": 0.1957033844664693,
"epoch": 0.0052,
"grad_norm": 0.45430490374565125,
"kl": 0.781089099124074,
"learning_rate": 9.999907082114112e-05,
"loss": -0.0313,
"step": 260,
"step_time": 9.020615039000404
},
{
"clip_ratio/high_max": 0.016812865156680346,
"clip_ratio/high_mean": 0.004203216289170086,
"clip_ratio/low_mean": 0.0026041666860692203,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006807382975239307,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1335.0,
"completions/max_terminated_length": 1335.0,
"completions/mean_length": 1213.078125,
"completions/mean_terminated_length": 1213.078125,
"completions/min_length": 626.0,
"completions/min_terminated_length": 626.0,
"entropy": 0.18847014661878347,
"epoch": 0.00522,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6835371851921082,
"kl": 0.542348800227046,
"learning_rate": 9.999906250641758e-05,
"loss": 0.0145,
"num_tokens": 13881882.0,
"reward": 5.304417610168457,
"reward_std": 14.105676651000977,
"rewards/rollout_reward_func/mean": 5.304417133331299,
"rewards/rollout_reward_func/std": 14.791868209838867,
"sampling/importance_sampling_ratio/max": 1.3720983266830444,
"sampling/importance_sampling_ratio/mean": 0.9722362756729126,
"sampling/importance_sampling_ratio/min": 8.055465437370129e-20,
"sampling/sampling_logp_difference/max": 38.39814376831055,
"sampling/sampling_logp_difference/mean": 0.05052501708269119,
"step": 261,
"step_time": 39.094199752998065
},
{
"clip_ratio/high_max": 0.05559855583123863,
"clip_ratio/high_mean": 0.018239916767925024,
"clip_ratio/low_mean": 0.02690972271375358,
"clip_ratio/low_min": 0.0034722222480922937,
"clip_ratio/region_mean": 0.04514963936526328,
"entropy": 0.1835553077980876,
"epoch": 0.00524,
"grad_norm": 0.266696572303772,
"kl": 0.5767297390848398,
"learning_rate": 9.9999054154658e-05,
"loss": 0.0025,
"step": 262,
"step_time": 9.272368914001163
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.0026041666860692203,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0026041666860692203,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1344.0,
"completions/max_terminated_length": 1344.0,
"completions/mean_length": 1241.4375,
"completions/mean_terminated_length": 1241.4375,
"completions/min_length": 902.0,
"completions/min_terminated_length": 902.0,
"entropy": 0.1920191366225481,
"epoch": 0.00526,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8278535604476929,
"kl": 0.5627734903246164,
"learning_rate": 9.999904576586242e-05,
"loss": -0.0123,
"num_tokens": 14012567.0,
"reward": 3.131194591522217,
"reward_std": 12.649508476257324,
"rewards/rollout_reward_func/mean": 3.131195068359375,
"rewards/rollout_reward_func/std": 12.768006324768066,
"sampling/importance_sampling_ratio/max": 1.5249695777893066,
"sampling/importance_sampling_ratio/mean": 1.0084636211395264,
"sampling/importance_sampling_ratio/min": 0.6291685700416565,
"sampling/sampling_logp_difference/max": 0.48067259788513184,
"sampling/sampling_logp_difference/mean": 0.011031190864741802,
"step": 263,
"step_time": 37.94770654900185
},
{
"clip_ratio/high_max": 0.06597222364507616,
"clip_ratio/high_mean": 0.02170138922519982,
"clip_ratio/low_mean": 0.027777778508607298,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04947916854871437,
"entropy": 0.1821621311828494,
"epoch": 0.00528,
"grad_norm": 0.30830591917037964,
"kl": 0.6204142663627863,
"learning_rate": 9.999903734003084e-05,
"loss": -0.0238,
"step": 264,
"step_time": 9.788196208000045
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.0026041666860692203,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004340277810115367,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1354.0,
"completions/max_terminated_length": 1354.0,
"completions/mean_length": 1215.875,
"completions/mean_terminated_length": 1215.875,
"completions/min_length": 194.0,
"completions/min_terminated_length": 194.0,
"entropy": 0.18026093766093254,
"epoch": 0.0053,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7985097765922546,
"kl": 0.5250384621322155,
"learning_rate": 9.999902887716329e-05,
"loss": -0.0455,
"num_tokens": 14141610.0,
"reward": 2.849423408508301,
"reward_std": 12.35162353515625,
"rewards/rollout_reward_func/mean": 2.849423408508301,
"rewards/rollout_reward_func/std": 12.910691261291504,
"sampling/importance_sampling_ratio/max": 1.7354861497879028,
"sampling/importance_sampling_ratio/mean": 0.9913997650146484,
"sampling/importance_sampling_ratio/min": 0.53452068567276,
"sampling/sampling_logp_difference/max": 0.5425161123275757,
"sampling/sampling_logp_difference/mean": 0.012209449894726276,
"step": 265,
"step_time": 38.11352587100009
},
{
"clip_ratio/high_max": 0.06311274622566998,
"clip_ratio/high_mean": 0.01925040880450979,
"clip_ratio/low_mean": 0.030831291631329805,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.050081700494047254,
"entropy": 0.1743807177990675,
"epoch": 0.00532,
"grad_norm": 0.9081993103027344,
"kl": 1.4623642209917307,
"learning_rate": 9.999902037725976e-05,
"loss": -0.0483,
"step": 266,
"step_time": 9.7086338709978
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.0026041666860692203,
"clip_ratio/low_mean": 0.005259395460598171,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007863562146667391,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1349.0,
"completions/max_terminated_length": 1349.0,
"completions/mean_length": 1235.953125,
"completions/mean_terminated_length": 1235.953125,
"completions/min_length": 769.0,
"completions/min_terminated_length": 769.0,
"entropy": 0.1784328306093812,
"epoch": 0.00534,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8215274810791016,
"kl": 0.49557984061539173,
"learning_rate": 9.999901184032026e-05,
"loss": 0.0099,
"num_tokens": 14271910.0,
"reward": 6.570675849914551,
"reward_std": 11.428293228149414,
"rewards/rollout_reward_func/mean": 6.570675849914551,
"rewards/rollout_reward_func/std": 11.919609069824219,
"sampling/importance_sampling_ratio/max": 1.5103188753128052,
"sampling/importance_sampling_ratio/mean": 1.018727421760559,
"sampling/importance_sampling_ratio/min": 1.0843930725359919e-15,
"sampling/sampling_logp_difference/max": 27.71844482421875,
"sampling/sampling_logp_difference/mean": 0.04110131412744522,
"step": 267,
"step_time": 40.345479872002215
},
{
"clip_ratio/high_max": 0.08261846494860947,
"clip_ratio/high_mean": 0.025862949551083148,
"clip_ratio/low_mean": 0.025904605397954583,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.05176755564752966,
"entropy": 0.18014734331518412,
"epoch": 0.00536,
"grad_norm": 0.3775624632835388,
"kl": 0.5100179798901081,
"learning_rate": 9.99990032663448e-05,
"loss": -0.0005,
"step": 268,
"step_time": 8.8038962849987
},
{
"clip_ratio/high_max": 0.010620915098115802,
"clip_ratio/high_mean": 0.0026552287745289505,
"clip_ratio/low_mean": 0.001787173212505877,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0044424019870348275,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1348.0,
"completions/max_terminated_length": 1348.0,
"completions/mean_length": 1209.5625,
"completions/mean_terminated_length": 1209.5625,
"completions/min_length": 423.0,
"completions/min_terminated_length": 423.0,
"entropy": 0.19260858092457056,
"epoch": 0.00538,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9969385266304016,
"kl": 0.48063670098781586,
"learning_rate": 9.999899465533337e-05,
"loss": -0.0145,
"num_tokens": 14400520.0,
"reward": 4.870312690734863,
"reward_std": 12.755669593811035,
"rewards/rollout_reward_func/mean": 4.870312690734863,
"rewards/rollout_reward_func/std": 12.786203384399414,
"sampling/importance_sampling_ratio/max": 1.391904354095459,
"sampling/importance_sampling_ratio/mean": 0.9837595224380493,
"sampling/importance_sampling_ratio/min": 0.5111071467399597,
"sampling/sampling_logp_difference/max": 0.6141395568847656,
"sampling/sampling_logp_difference/mean": 0.012509889900684357,
"step": 269,
"step_time": 39.08969898599935
},
{
"clip_ratio/high_max": 0.07679738639853895,
"clip_ratio/high_mean": 0.025275735883042216,
"clip_ratio/low_mean": 0.03416053985711187,
"clip_ratio/low_min": 0.0034722222480922937,
"clip_ratio/region_mean": 0.05943627591477707,
"entropy": 0.190420214086771,
"epoch": 0.0054,
"grad_norm": 2.133584499359131,
"kl": 1.2963667679578066,
"learning_rate": 9.999898600728599e-05,
"loss": -0.0154,
"step": 270,
"step_time": 10.067689283000618
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0008680555620230734,
"clip_ratio/low_mean": 0.0008680555620230734,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1356.0,
"completions/max_terminated_length": 1356.0,
"completions/mean_length": 1241.09375,
"completions/mean_terminated_length": 1241.09375,
"completions/min_length": 641.0,
"completions/min_terminated_length": 641.0,
"entropy": 0.17894164565950632,
"epoch": 0.00542,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.759871780872345,
"kl": 0.4996040966361761,
"learning_rate": 9.999897732220269e-05,
"loss": -0.0452,
"num_tokens": 14531215.0,
"reward": 6.598260879516602,
"reward_std": 12.557649612426758,
"rewards/rollout_reward_func/mean": 6.598260402679443,
"rewards/rollout_reward_func/std": 12.858835220336914,
"sampling/importance_sampling_ratio/max": 1.6857366561889648,
"sampling/importance_sampling_ratio/mean": 1.0335665941238403,
"sampling/importance_sampling_ratio/min": 0.555221676826477,
"sampling/sampling_logp_difference/max": 0.583274245262146,
"sampling/sampling_logp_difference/mean": 0.010182222351431847,
"step": 271,
"step_time": 38.82532325500051
},
{
"clip_ratio/high_max": 0.03513071942143142,
"clip_ratio/high_mean": 0.009650735417380929,
"clip_ratio/low_mean": 0.020450367941521108,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.030101103708148003,
"entropy": 0.17972450237721205,
"epoch": 0.00544,
"grad_norm": 0.39173194766044617,
"kl": 0.5205750651657581,
"learning_rate": 9.999896860008347e-05,
"loss": -0.052,
"step": 272,
"step_time": 10.316601943999558
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0026041666860692203,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004340277810115367,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1340.0,
"completions/max_terminated_length": 1340.0,
"completions/mean_length": 1249.921875,
"completions/mean_terminated_length": 1249.921875,
"completions/min_length": 663.0,
"completions/min_terminated_length": 663.0,
"entropy": 0.18446057755500078,
"epoch": 0.00546,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5517368912696838,
"kl": 0.5441189091652632,
"learning_rate": 9.999895984092831e-05,
"loss": 0.0131,
"num_tokens": 14662474.0,
"reward": 5.8217644691467285,
"reward_std": 11.078777313232422,
"rewards/rollout_reward_func/mean": 5.8217644691467285,
"rewards/rollout_reward_func/std": 11.748489379882812,
"sampling/importance_sampling_ratio/max": 2.5323374271392822,
"sampling/importance_sampling_ratio/mean": 0.9794174432754517,
"sampling/importance_sampling_ratio/min": 1.1464784742225287e-13,
"sampling/sampling_logp_difference/max": 23.46839714050293,
"sampling/sampling_logp_difference/mean": 0.03605649992823601,
"step": 273,
"step_time": 38.64782374399874
},
{
"clip_ratio/high_max": 0.041483918437734246,
"clip_ratio/high_mean": 0.01210709079168737,
"clip_ratio/low_mean": 0.049096201779320836,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.06120329239638522,
"entropy": 0.17116414476186037,
"epoch": 0.00548,
"grad_norm": 0.3144800662994385,
"kl": 0.6644695494323969,
"learning_rate": 9.999895104473725e-05,
"loss": 0.0043,
"step": 274,
"step_time": 8.841590996999912
},
{
"clip_ratio/high_max": 0.01756535959430039,
"clip_ratio/high_mean": 0.006127451022621244,
"clip_ratio/low_mean": 0.0026552287745289505,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008782679797150195,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1370.0,
"completions/max_terminated_length": 1370.0,
"completions/mean_length": 1246.375,
"completions/mean_terminated_length": 1246.375,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.17010682448744774,
"epoch": 0.0055,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5666040182113647,
"kl": 0.5793958213180304,
"learning_rate": 9.99989422115103e-05,
"loss": 0.0183,
"num_tokens": 14793449.0,
"reward": 2.9848508834838867,
"reward_std": 12.649776458740234,
"rewards/rollout_reward_func/mean": 2.984851121902466,
"rewards/rollout_reward_func/std": 13.012813568115234,
"sampling/importance_sampling_ratio/max": 1.5569446086883545,
"sampling/importance_sampling_ratio/mean": 0.9847633838653564,
"sampling/importance_sampling_ratio/min": 0.6424822807312012,
"sampling/sampling_logp_difference/max": 0.4623146057128906,
"sampling/sampling_logp_difference/mean": 0.00930742733180523,
"step": 275,
"step_time": 39.34428709400072
},
{
"clip_ratio/high_max": 0.054125817492604256,
"clip_ratio/high_mean": 0.016135621059220284,
"clip_ratio/low_mean": 0.0301164222182706,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.046252043335698545,
"entropy": 0.16701707802712917,
"epoch": 0.00552,
"grad_norm": 0.6509947180747986,
"kl": 0.6457913182675838,
"learning_rate": 9.999893334124744e-05,
"loss": 0.0127,
"step": 276,
"step_time": 9.492375128998901
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.0026041666860692203,
"clip_ratio/low_mean": 0.0008680555620230734,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1342.0,
"completions/max_terminated_length": 1342.0,
"completions/mean_length": 1211.75,
"completions/mean_terminated_length": 1211.75,
"completions/min_length": 786.0,
"completions/min_terminated_length": 786.0,
"entropy": 0.16395934531465173,
"epoch": 0.00554,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6039373874664307,
"kl": 0.6709765158593655,
"learning_rate": 9.999892443394869e-05,
"loss": -0.0199,
"num_tokens": 14922202.0,
"reward": 9.567506790161133,
"reward_std": 12.886774063110352,
"rewards/rollout_reward_func/mean": 9.567506790161133,
"rewards/rollout_reward_func/std": 14.272911071777344,
"sampling/importance_sampling_ratio/max": 1.3702117204666138,
"sampling/importance_sampling_ratio/mean": 0.9926258325576782,
"sampling/importance_sampling_ratio/min": 2.4751771812714374e-13,
"sampling/sampling_logp_difference/max": 22.218292236328125,
"sampling/sampling_logp_difference/mean": 0.03479118272662163,
"step": 277,
"step_time": 38.66672314299831
},
{
"clip_ratio/high_max": 0.056832108180969954,
"clip_ratio/high_mean": 0.015944138227496296,
"clip_ratio/low_mean": 0.026416234264615923,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04236037301598117,
"entropy": 0.16752836294472218,
"epoch": 0.00556,
"grad_norm": 0.2952568829059601,
"kl": 0.7055745627731085,
"learning_rate": 9.999891548961409e-05,
"loss": -0.0283,
"step": 278,
"step_time": 10.249573535998024
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0008680555620230734,
"clip_ratio/low_mean": 0.0026552287745289505,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003523284336552024,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1344.0,
"completions/max_terminated_length": 1344.0,
"completions/mean_length": 1206.921875,
"completions/mean_terminated_length": 1206.921875,
"completions/min_length": 193.0,
"completions/min_terminated_length": 193.0,
"entropy": 0.16296257637441158,
"epoch": 0.00558,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6338427662849426,
"kl": 0.5948988310992718,
"learning_rate": 9.99989065082436e-05,
"loss": -0.0193,
"num_tokens": 15050626.0,
"reward": 6.859679222106934,
"reward_std": 13.335336685180664,
"rewards/rollout_reward_func/mean": 6.859679222106934,
"rewards/rollout_reward_func/std": 13.806427955627441,
"sampling/importance_sampling_ratio/max": 1.73651123046875,
"sampling/importance_sampling_ratio/mean": 1.0235867500305176,
"sampling/importance_sampling_ratio/min": 0.6915313005447388,
"sampling/sampling_logp_difference/max": 0.3299523591995239,
"sampling/sampling_logp_difference/mean": 0.00790142547339201,
"step": 279,
"step_time": 37.97661670400066
},
{
"clip_ratio/high_max": 0.04227941203862429,
"clip_ratio/high_mean": 0.01235702628036961,
"clip_ratio/low_mean": 0.023852379759773612,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.036209406214766204,
"entropy": 0.17103052884340286,
"epoch": 0.0056,
"grad_norm": 0.310857355594635,
"kl": 0.6127588897943497,
"learning_rate": 9.999889748983726e-05,
"loss": -0.0289,
"step": 280,
"step_time": 9.740956478998669
},
{
"clip_ratio/high_max": 0.007148692850023508,
"clip_ratio/high_mean": 0.001787173212505877,
"clip_ratio/low_mean": 0.0035807291860692203,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005367902398575097,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1347.0,
"completions/max_terminated_length": 1347.0,
"completions/mean_length": 1242.203125,
"completions/mean_terminated_length": 1242.203125,
"completions/min_length": 1071.0,
"completions/min_terminated_length": 1071.0,
"entropy": 0.18003392685204744,
"epoch": 0.00562,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5553699731826782,
"kl": 0.5920679531991482,
"learning_rate": 9.999888843439508e-05,
"loss": 0.0235,
"num_tokens": 15181392.0,
"reward": 3.9797325134277344,
"reward_std": 11.883782386779785,
"rewards/rollout_reward_func/mean": 3.9797325134277344,
"rewards/rollout_reward_func/std": 12.557183265686035,
"sampling/importance_sampling_ratio/max": 2.3216447830200195,
"sampling/importance_sampling_ratio/mean": 1.0259038209915161,
"sampling/importance_sampling_ratio/min": 0.37790024280548096,
"sampling/sampling_logp_difference/max": 1.4781968593597412,
"sampling/sampling_logp_difference/mean": 0.011046608909964561,
"step": 281,
"step_time": 39.572295284000575
},
{
"clip_ratio/high_max": 0.04312193673104048,
"clip_ratio/high_mean": 0.01343571295728907,
"clip_ratio/low_mean": 0.023201337666250765,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03663705079816282,
"entropy": 0.18677309434860945,
"epoch": 0.00564,
"grad_norm": 0.5401102304458618,
"kl": 0.5960894413292408,
"learning_rate": 9.999887934191704e-05,
"loss": 0.0166,
"step": 282,
"step_time": 9.04144253800041
},
{
"clip_ratio/high_max": 0.007582720601931214,
"clip_ratio/high_mean": 0.0018956801504828036,
"clip_ratio/low_mean": 0.0009191176504828036,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002814797800965607,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1338.0,
"completions/max_terminated_length": 1338.0,
"completions/mean_length": 1215.859375,
"completions/mean_terminated_length": 1215.859375,
"completions/min_length": 1067.0,
"completions/min_terminated_length": 1067.0,
"entropy": 0.18976869899779558,
"epoch": 0.00566,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6778466105461121,
"kl": 0.8045855388045311,
"learning_rate": 9.99988702124032e-05,
"loss": 0.0435,
"num_tokens": 15310402.0,
"reward": 9.532302856445312,
"reward_std": 13.447786331176758,
"rewards/rollout_reward_func/mean": 9.532302856445312,
"rewards/rollout_reward_func/std": 14.893537521362305,
"sampling/importance_sampling_ratio/max": 1.5014206171035767,
"sampling/importance_sampling_ratio/mean": 0.984979510307312,
"sampling/importance_sampling_ratio/min": 0.5918754935264587,
"sampling/sampling_logp_difference/max": 0.49157631397247314,
"sampling/sampling_logp_difference/mean": 0.009644631296396255,
"step": 283,
"step_time": 37.75929147200077
},
{
"clip_ratio/high_max": 0.04337724717333913,
"clip_ratio/high_mean": 0.017022824671585113,
"clip_ratio/low_mean": 0.019767412508372217,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.036790237470995635,
"entropy": 0.19258240424096584,
"epoch": 0.00568,
"grad_norm": 0.23295848071575165,
"kl": 0.7846251800656319,
"learning_rate": 9.999886104585351e-05,
"loss": 0.0377,
"step": 284,
"step_time": 9.718582901003174
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0008680555620230734,
"clip_ratio/low_mean": 0.0036764706601388752,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004544526163954288,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1352.0,
"completions/max_terminated_length": 1352.0,
"completions/mean_length": 1182.8125,
"completions/mean_terminated_length": 1182.8125,
"completions/min_length": 644.0,
"completions/min_terminated_length": 644.0,
"entropy": 0.21535112708806992,
"epoch": 0.0057,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8615608215332031,
"kl": 0.8677664548158646,
"learning_rate": 9.999885184226802e-05,
"loss": -0.0133,
"num_tokens": 15437277.0,
"reward": 4.226072788238525,
"reward_std": 8.478089332580566,
"rewards/rollout_reward_func/mean": 4.226072788238525,
"rewards/rollout_reward_func/std": 9.509638786315918,
"sampling/importance_sampling_ratio/max": 1.921204924583435,
"sampling/importance_sampling_ratio/mean": 0.9768272638320923,
"sampling/importance_sampling_ratio/min": 0.7244350910186768,
"sampling/sampling_logp_difference/max": 0.4400520324707031,
"sampling/sampling_logp_difference/mean": 0.010012689046561718,
"step": 285,
"step_time": 36.26275280400023
},
{
"clip_ratio/high_max": 0.04353043343871832,
"clip_ratio/high_mean": 0.01611264329403639,
"clip_ratio/low_mean": 0.029692606767639518,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.045805250061675906,
"entropy": 0.21773250121623278,
"epoch": 0.00572,
"grad_norm": 0.48840415477752686,
"kl": 1.049767030403018,
"learning_rate": 9.999884260164671e-05,
"loss": -0.0254,
"step": 286,
"step_time": 10.79683871799898
},
{
"clip_ratio/high_max": 0.01797385630197823,
"clip_ratio/high_mean": 0.004493464075494558,
"clip_ratio/low_mean": 0.003523284336552024,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008016748412046582,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1349.0,
"completions/max_terminated_length": 1349.0,
"completions/mean_length": 1224.109375,
"completions/mean_terminated_length": 1224.109375,
"completions/min_length": 697.0,
"completions/min_terminated_length": 697.0,
"entropy": 0.22404625453054905,
"epoch": 0.00574,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1797651052474976,
"kl": 0.8139622360467911,
"learning_rate": 9.999883332398962e-05,
"loss": -0.0606,
"num_tokens": 15566944.0,
"reward": 5.630161762237549,
"reward_std": 12.35897445678711,
"rewards/rollout_reward_func/mean": 5.630161762237549,
"rewards/rollout_reward_func/std": 13.792024612426758,
"sampling/importance_sampling_ratio/max": 2.566322088241577,
"sampling/importance_sampling_ratio/mean": 0.9811519384384155,
"sampling/importance_sampling_ratio/min": 0.388390451669693,
"sampling/sampling_logp_difference/max": 1.9641337394714355,
"sampling/sampling_logp_difference/mean": 0.015188181772828102,
"step": 287,
"step_time": 36.44446017899918
},
{
"clip_ratio/high_max": 0.053717320784926414,
"clip_ratio/high_mean": 0.01695261470740661,
"clip_ratio/low_mean": 0.04400914063444361,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.060961755632888526,
"entropy": 0.2100704526528716,
"epoch": 0.00576,
"grad_norm": 1.2667500972747803,
"kl": 2.074540827423334,
"learning_rate": 9.999882400929674e-05,
"loss": -0.057,
"step": 288,
"step_time": 8.99862431100064
},
{
"clip_ratio/high_max": 0.010850694496184587,
"clip_ratio/high_mean": 0.0035807291860692203,
"clip_ratio/low_mean": 0.001787173212505877,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005367902398575097,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1377.0,
"completions/max_terminated_length": 1377.0,
"completions/mean_length": 1206.5,
"completions/mean_terminated_length": 1206.5,
"completions/min_length": 202.0,
"completions/min_terminated_length": 202.0,
"entropy": 0.21379029098898172,
"epoch": 0.00578,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9330686926841736,
"kl": 0.6734739989042282,
"learning_rate": 9.999881465756809e-05,
"loss": -0.0075,
"num_tokens": 15695392.0,
"reward": 5.131735801696777,
"reward_std": 13.59388256072998,
"rewards/rollout_reward_func/mean": 5.1317362785339355,
"rewards/rollout_reward_func/std": 15.563157081604004,
"sampling/importance_sampling_ratio/max": 1.5120335817337036,
"sampling/importance_sampling_ratio/mean": 0.9915132522583008,
"sampling/importance_sampling_ratio/min": 0.7389032244682312,
"sampling/sampling_logp_difference/max": 0.3448265790939331,
"sampling/sampling_logp_difference/mean": 0.010639440268278122,
"step": 289,
"step_time": 38.13999567300107
},
{
"clip_ratio/high_max": 0.052309082355350256,
"clip_ratio/high_mean": 0.015732499363366514,
"clip_ratio/low_mean": 0.04032860859297216,
"clip_ratio/low_min": 0.0034722222480922937,
"clip_ratio/region_mean": 0.056061108596622944,
"entropy": 0.20592329651117325,
"epoch": 0.0058,
"grad_norm": 1.7719804048538208,
"kl": 2.317722400650382,
"learning_rate": 9.999880526880367e-05,
"loss": 0.0124,
"step": 290,
"step_time": 9.157000397000957
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0008680555620230734,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0026041666860692203,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1351.0,
"completions/max_terminated_length": 1351.0,
"completions/mean_length": 1227.28125,
"completions/mean_terminated_length": 1227.2381591796875,
"completions/min_length": 1085.0,
"completions/min_terminated_length": 1085.0,
"entropy": 0.2259034337475896,
"epoch": 0.00582,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5913375616073608,
"kl": 0.5888024400919676,
"learning_rate": 9.999879584300349e-05,
"loss": -0.0201,
"num_tokens": 15825170.0,
"reward": 4.390281677246094,
"reward_std": 13.705522537231445,
"rewards/rollout_reward_func/mean": 4.390281677246094,
"rewards/rollout_reward_func/std": 13.848193168640137,
"sampling/importance_sampling_ratio/max": 1.4006119966506958,
"sampling/importance_sampling_ratio/mean": 0.9845026731491089,
"sampling/importance_sampling_ratio/min": 0.57123863697052,
"sampling/sampling_logp_difference/max": 0.4590674638748169,
"sampling/sampling_logp_difference/mean": 0.010318214073777199,
"step": 291,
"step_time": 38.378031336002095
},
{
"clip_ratio/high_max": 0.05575980432331562,
"clip_ratio/high_mean": 0.018331290979404002,
"clip_ratio/low_mean": 0.025366254791151732,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0436975461198017,
"entropy": 0.25106900557875633,
"epoch": 0.00584,
"grad_norm": 0.3503086268901825,
"kl": 0.6040437389165163,
"learning_rate": 9.999878638016755e-05,
"loss": -0.0261,
"step": 292,
"step_time": 10.2765881680034
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1355.0,
"completions/max_terminated_length": 1355.0,
"completions/mean_length": 1203.28125,
"completions/mean_terminated_length": 1203.28125,
"completions/min_length": 196.0,
"completions/min_terminated_length": 196.0,
"entropy": 0.255804393440485,
"epoch": 0.00586,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.733392059803009,
"kl": 0.7084890268743038,
"learning_rate": 9.99987768802959e-05,
"loss": -0.0295,
"num_tokens": 15953483.0,
"reward": 4.652621269226074,
"reward_std": 13.876627922058105,
"rewards/rollout_reward_func/mean": 4.652621269226074,
"rewards/rollout_reward_func/std": 14.444734573364258,
"sampling/importance_sampling_ratio/max": 1.5388283729553223,
"sampling/importance_sampling_ratio/mean": 0.9943192005157471,
"sampling/importance_sampling_ratio/min": 0.66633540391922,
"sampling/sampling_logp_difference/max": 0.3228440284729004,
"sampling/sampling_logp_difference/mean": 0.009999222122132778,
"step": 293,
"step_time": 36.11094247699839
},
{
"clip_ratio/high_max": 0.07255117082968354,
"clip_ratio/high_mean": 0.020741959451697767,
"clip_ratio/low_mean": 0.026092729007359594,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0468346884008497,
"entropy": 0.2808321360498667,
"epoch": 0.00588,
"grad_norm": 0.3306209444999695,
"kl": 0.6640463471412659,
"learning_rate": 9.99987673433885e-05,
"loss": -0.0368,
"step": 294,
"step_time": 9.500305491999825
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0008680555620230734,
"clip_ratio/low_mean": 0.0026041666860692203,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1353.0,
"completions/max_terminated_length": 1353.0,
"completions/mean_length": 1203.546875,
"completions/mean_terminated_length": 1203.546875,
"completions/min_length": 246.0,
"completions/min_terminated_length": 246.0,
"entropy": 0.29141946602612734,
"epoch": 0.0059,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7943304777145386,
"kl": 0.6770852543413639,
"learning_rate": 9.999875776944538e-05,
"loss": -0.0049,
"num_tokens": 16081715.0,
"reward": 2.4234745502471924,
"reward_std": 10.446115493774414,
"rewards/rollout_reward_func/mean": 2.4234743118286133,
"rewards/rollout_reward_func/std": 11.454586029052734,
"sampling/importance_sampling_ratio/max": 1.456477165222168,
"sampling/importance_sampling_ratio/mean": 0.9954730272293091,
"sampling/importance_sampling_ratio/min": 0.6373972296714783,
"sampling/sampling_logp_difference/max": 0.3991684913635254,
"sampling/sampling_logp_difference/mean": 0.010583357885479927,
"step": 295,
"step_time": 37.65815602800012
},
{
"clip_ratio/high_max": 0.049019608180969954,
"clip_ratio/high_mean": 0.022671569080557674,
"clip_ratio/low_mean": 0.029513889458030462,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.05218545877141878,
"entropy": 0.2855409812182188,
"epoch": 0.00592,
"grad_norm": 0.3455994129180908,
"kl": 0.6785521320998669,
"learning_rate": 9.999874815846655e-05,
"loss": -0.0152,
"step": 296,
"step_time": 8.978216180002164
},
{
"clip_ratio/high_max": 0.010620915098115802,
"clip_ratio/high_mean": 0.003523284336552024,
"clip_ratio/low_mean": 0.0008680555620230734,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004391339898575097,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1353.0,
"completions/max_terminated_length": 1353.0,
"completions/mean_length": 1205.796875,
"completions/mean_terminated_length": 1205.796875,
"completions/min_length": 210.0,
"completions/min_terminated_length": 210.0,
"entropy": 0.3199264472350478,
"epoch": 0.00594,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5850762128829956,
"kl": 0.8113113101571798,
"learning_rate": 9.999873851045201e-05,
"loss": -0.0035,
"num_tokens": 16210207.0,
"reward": 2.8288328647613525,
"reward_std": 14.778526306152344,
"rewards/rollout_reward_func/mean": 2.8288326263427734,
"rewards/rollout_reward_func/std": 16.02610969543457,
"sampling/importance_sampling_ratio/max": 1.4778478145599365,
"sampling/importance_sampling_ratio/mean": 1.0175740718841553,
"sampling/importance_sampling_ratio/min": 0.6004241108894348,
"sampling/sampling_logp_difference/max": 0.35140562057495117,
"sampling/sampling_logp_difference/mean": 0.012288028374314308,
"step": 297,
"step_time": 36.8206370079979
},
{
"clip_ratio/high_max": 0.0490196084138006,
"clip_ratio/high_mean": 0.015590063121635467,
"clip_ratio/low_mean": 0.026909722771961242,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04249978606821969,
"entropy": 0.3119704835116863,
"epoch": 0.00596,
"grad_norm": 0.6210339069366455,
"kl": 0.8435764815658331,
"learning_rate": 9.99987288254018e-05,
"loss": -0.0125,
"step": 298,
"step_time": 9.949690105999252
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1320.0,
"completions/max_terminated_length": 1320.0,
"completions/mean_length": 1166.171875,
"completions/mean_terminated_length": 1166.171875,
"completions/min_length": 865.0,
"completions/min_terminated_length": 865.0,
"entropy": 0.3107016496360302,
"epoch": 0.00598,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7741795778274536,
"kl": 0.8151105176657438,
"learning_rate": 9.99987191033159e-05,
"loss": 0.013,
"num_tokens": 16335952.0,
"reward": 0.605268120765686,
"reward_std": 9.44769287109375,
"rewards/rollout_reward_func/mean": 0.6052679419517517,
"rewards/rollout_reward_func/std": 10.618112564086914,
"sampling/importance_sampling_ratio/max": 1.4987179040908813,
"sampling/importance_sampling_ratio/mean": 1.0136826038360596,
"sampling/importance_sampling_ratio/min": 0.7334418892860413,
"sampling/sampling_logp_difference/max": 0.23920416831970215,
"sampling/sampling_logp_difference/mean": 0.011820180341601372,
"step": 299,
"step_time": 35.96857580300002
},
{
"clip_ratio/high_max": 0.07089971494860947,
"clip_ratio/high_mean": 0.022029462968930602,
"clip_ratio/low_mean": 0.03730450588045642,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.05933396948967129,
"entropy": 0.28032723255455494,
"epoch": 0.006,
"grad_norm": 0.37243181467056274,
"kl": 0.8571038488298655,
"learning_rate": 9.999870934419433e-05,
"loss": -0.0014,
"step": 300,
"step_time": 9.851978641999267
},
{
"clip_ratio/high_max": 0.014093137346208096,
"clip_ratio/high_mean": 0.004391339898575097,
"clip_ratio/low_mean": 0.0008680555620230734,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005259395460598171,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1347.0,
"completions/max_terminated_length": 1347.0,
"completions/mean_length": 1216.4375,
"completions/mean_terminated_length": 1216.4375,
"completions/min_length": 825.0,
"completions/min_terminated_length": 825.0,
"entropy": 0.2521855002269149,
"epoch": 0.00602,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6381392478942871,
"kl": 0.8058282844722271,
"learning_rate": 9.999869954803708e-05,
"loss": 0.0246,
"num_tokens": 16465057.0,
"reward": 4.353306293487549,
"reward_std": 11.903841018676758,
"rewards/rollout_reward_func/mean": 4.353306293487549,
"rewards/rollout_reward_func/std": 13.071228981018066,
"sampling/importance_sampling_ratio/max": 1.870285153388977,
"sampling/importance_sampling_ratio/mean": 1.014232873916626,
"sampling/importance_sampling_ratio/min": 0.6221296191215515,
"sampling/sampling_logp_difference/max": 0.5893880128860474,
"sampling/sampling_logp_difference/mean": 0.010516786947846413,
"step": 301,
"step_time": 37.21239350799988
},
{
"clip_ratio/high_max": 0.0890522887930274,
"clip_ratio/high_mean": 0.02920751681085676,
"clip_ratio/low_mean": 0.026308735250495374,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.05551625177031383,
"entropy": 0.26322738360613585,
"epoch": 0.00604,
"grad_norm": 0.3497966527938843,
"kl": 0.8271188456565142,
"learning_rate": 9.999868971484418e-05,
"loss": 0.0178,
"step": 302,
"step_time": 9.60695320800096
},
{
"clip_ratio/high_max": 0.007504480192437768,
"clip_ratio/high_mean": 0.0027441755519248545,
"clip_ratio/low_mean": 0.004391339898575097,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007135515450499952,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1357.0,
"completions/max_terminated_length": 1357.0,
"completions/mean_length": 1233.578125,
"completions/mean_terminated_length": 1232.1270751953125,
"completions/min_length": 963.0,
"completions/min_terminated_length": 963.0,
"entropy": 0.3282460719347,
"epoch": 0.00606,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8028053641319275,
"kl": 1.0723739713430405,
"learning_rate": 9.999867984461563e-05,
"loss": -0.009,
"num_tokens": 16595305.0,
"reward": 4.948282718658447,
"reward_std": 13.311994552612305,
"rewards/rollout_reward_func/mean": 4.9482831954956055,
"rewards/rollout_reward_func/std": 13.96406078338623,
"sampling/importance_sampling_ratio/max": 1.4959264993667603,
"sampling/importance_sampling_ratio/mean": 1.0020052194595337,
"sampling/importance_sampling_ratio/min": 0.664537250995636,
"sampling/sampling_logp_difference/max": 0.4060518741607666,
"sampling/sampling_logp_difference/mean": 0.01389513909816742,
"step": 303,
"step_time": 37.319652419000704
},
{
"clip_ratio/high_max": 0.10214776475913823,
"clip_ratio/high_mean": 0.0419843090348877,
"clip_ratio/low_mean": 0.032362769707106054,
"clip_ratio/low_min": 0.0029761905316263437,
"clip_ratio/region_mean": 0.07434707973152399,
"entropy": 0.35452230647206306,
"epoch": 0.00608,
"grad_norm": 0.5315479040145874,
"kl": 0.8896235972642899,
"learning_rate": 9.999866993735147e-05,
"loss": -0.0191,
"step": 304,
"step_time": 9.31032760600101
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.0026041666860692203,
"clip_ratio/low_mean": 0.0013720877468585968,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003976254432927817,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1349.0,
"completions/max_terminated_length": 1349.0,
"completions/mean_length": 1178.75,
"completions/mean_terminated_length": 1178.75,
"completions/min_length": 276.0,
"completions/min_terminated_length": 276.0,
"entropy": 0.3360240999609232,
"epoch": 0.0061,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8160866498947144,
"kl": 0.7584020271897316,
"learning_rate": 9.999865999305169e-05,
"loss": 0.0343,
"num_tokens": 16721992.0,
"reward": 5.459122657775879,
"reward_std": 12.891645431518555,
"rewards/rollout_reward_func/mean": 5.459122657775879,
"rewards/rollout_reward_func/std": 13.743046760559082,
"sampling/importance_sampling_ratio/max": 1.5625214576721191,
"sampling/importance_sampling_ratio/mean": 0.9862264394760132,
"sampling/importance_sampling_ratio/min": 0.7355522513389587,
"sampling/sampling_logp_difference/max": 0.3090386390686035,
"sampling/sampling_logp_difference/mean": 0.012195384129881859,
"step": 305,
"step_time": 36.28136819499923
},
{
"clip_ratio/high_max": 0.10116884484887123,
"clip_ratio/high_mean": 0.03489725984400138,
"clip_ratio/low_mean": 0.055271854158490896,
"clip_ratio/low_min": 0.0069444444961845875,
"clip_ratio/region_mean": 0.09016911429353058,
"entropy": 0.3460291214287281,
"epoch": 0.00612,
"grad_norm": 0.4136711359024048,
"kl": 0.7873252909630537,
"learning_rate": 9.999865001171627e-05,
"loss": 0.0177,
"step": 306,
"step_time": 10.277970246998848
},
{
"clip_ratio/high_max": 0.017422385746613145,
"clip_ratio/high_mean": 0.004355596436653286,
"clip_ratio/low_mean": 0.0008680555620230734,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00522365199867636,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1335.0,
"completions/max_terminated_length": 1335.0,
"completions/mean_length": 1148.03125,
"completions/mean_terminated_length": 1148.03125,
"completions/min_length": 197.0,
"completions/min_terminated_length": 197.0,
"entropy": 0.4305746052414179,
"epoch": 0.00614,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9119190573692322,
"kl": 0.862309418618679,
"learning_rate": 9.999863999334527e-05,
"loss": -0.039,
"num_tokens": 16846641.0,
"reward": 4.154011249542236,
"reward_std": 13.017316818237305,
"rewards/rollout_reward_func/mean": 4.1540117263793945,
"rewards/rollout_reward_func/std": 12.931968688964844,
"sampling/importance_sampling_ratio/max": 1.4475128650665283,
"sampling/importance_sampling_ratio/mean": 0.9774882793426514,
"sampling/importance_sampling_ratio/min": 9.214395739476355e-13,
"sampling/sampling_logp_difference/max": 23.965322494506836,
"sampling/sampling_logp_difference/mean": 0.03728090599179268,
"step": 307,
"step_time": 33.57406117500068
},
{
"clip_ratio/high_max": 0.08014640025794506,
"clip_ratio/high_mean": 0.026986419688910246,
"clip_ratio/low_mean": 0.03807902126573026,
"clip_ratio/low_min": 0.0034722222480922937,
"clip_ratio/region_mean": 0.06506544025614858,
"entropy": 0.4615292586386204,
"epoch": 0.00616,
"grad_norm": 0.544769287109375,
"kl": 0.8701771721243858,
"learning_rate": 9.999862993793865e-05,
"loss": -0.0498,
"step": 308,
"step_time": 9.750469571001304
},
{
"clip_ratio/high_max": 0.0021551724057644606,
"clip_ratio/high_mean": 0.0005387931014411151,
"clip_ratio/low_mean": 0.004073183808941394,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004611976910382509,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1337.0,
"completions/max_terminated_length": 1337.0,
"completions/mean_length": 1176.34375,
"completions/mean_terminated_length": 1176.34375,
"completions/min_length": 862.0,
"completions/min_terminated_length": 862.0,
"entropy": 0.4520879667252302,
"epoch": 0.00618,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6592026352882385,
"kl": 1.228204183280468,
"learning_rate": 9.999861984549645e-05,
"loss": 0.0146,
"num_tokens": 16973130.0,
"reward": 5.186724662780762,
"reward_std": 12.892146110534668,
"rewards/rollout_reward_func/mean": 5.186724662780762,
"rewards/rollout_reward_func/std": 12.396245002746582,
"sampling/importance_sampling_ratio/max": 1.4907543659210205,
"sampling/importance_sampling_ratio/mean": 0.992376446723938,
"sampling/importance_sampling_ratio/min": 0.6941927671432495,
"sampling/sampling_logp_difference/max": 0.338625431060791,
"sampling/sampling_logp_difference/mean": 0.014916637912392616,
"step": 309,
"step_time": 35.957562657998096
},
{
"clip_ratio/high_max": 0.07991837477311492,
"clip_ratio/high_mean": 0.02518792706541717,
"clip_ratio/low_mean": 0.04024840978672728,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.06543633691035211,
"entropy": 0.44149017706513405,
"epoch": 0.0062,
"grad_norm": 0.5282915234565735,
"kl": 1.2467477656900883,
"learning_rate": 9.999860971601868e-05,
"loss": -0.002,
"step": 310,
"step_time": 8.980034561999673
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0008680555620230734,
"clip_ratio/low_mean": 0.0008223684271797538,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0016904239892028272,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1348.0,
"completions/max_terminated_length": 1348.0,
"completions/mean_length": 1184.96875,
"completions/mean_terminated_length": 1184.96875,
"completions/min_length": 917.0,
"completions/min_terminated_length": 917.0,
"entropy": 0.5129956435412169,
"epoch": 0.00622,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7465701699256897,
"kl": 1.0012187995016575,
"learning_rate": 9.999859954950535e-05,
"loss": 0.018,
"num_tokens": 17100245.0,
"reward": 4.89565372467041,
"reward_std": 13.874456405639648,
"rewards/rollout_reward_func/mean": 4.89565372467041,
"rewards/rollout_reward_func/std": 14.702526092529297,
"sampling/importance_sampling_ratio/max": 1.5052223205566406,
"sampling/importance_sampling_ratio/mean": 1.027785062789917,
"sampling/importance_sampling_ratio/min": 0.5468899607658386,
"sampling/sampling_logp_difference/max": 0.4049875736236572,
"sampling/sampling_logp_difference/mean": 0.017239127308130264,
"step": 311,
"step_time": 34.84647880300054
},
{
"clip_ratio/high_max": 0.08282635360956192,
"clip_ratio/high_mean": 0.031164190906565636,
"clip_ratio/low_mean": 0.05087516509229317,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.08203935588244349,
"entropy": 0.5392583776265383,
"epoch": 0.00624,
"grad_norm": 0.5834032297134399,
"kl": 1.0580051615834236,
"learning_rate": 9.999858934595648e-05,
"loss": 0.0006,
"step": 312,
"step_time": 9.797768173001714
},
{
"clip_ratio/high_max": 0.006076388992369175,
"clip_ratio/high_mean": 0.0015190972480922937,
"clip_ratio/low_mean": 0.0006793478387407959,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0021984450868330896,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1312.0,
"completions/max_terminated_length": 1312.0,
"completions/mean_length": 1134.828125,
"completions/mean_terminated_length": 1133.3968505859375,
"completions/min_length": 492.0,
"completions/min_terminated_length": 492.0,
"entropy": 0.5534908715635538,
"epoch": 0.00626,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7070833444595337,
"kl": 0.9324860982596874,
"learning_rate": 9.999857910537204e-05,
"loss": 0.0171,
"num_tokens": 17224021.0,
"reward": 2.2121267318725586,
"reward_std": 12.934229850769043,
"rewards/rollout_reward_func/mean": 2.2121264934539795,
"rewards/rollout_reward_func/std": 13.348692893981934,
"sampling/importance_sampling_ratio/max": 1.3357430696487427,
"sampling/importance_sampling_ratio/mean": 0.9801706075668335,
"sampling/importance_sampling_ratio/min": 0.6364750862121582,
"sampling/sampling_logp_difference/max": 0.28098082542419434,
"sampling/sampling_logp_difference/mean": 0.01595621556043625,
"step": 313,
"step_time": 34.49526453500039
},
{
"clip_ratio/high_max": 0.09578519035130739,
"clip_ratio/high_mean": 0.028240888088475913,
"clip_ratio/low_mean": 0.043463885551318526,
"clip_ratio/low_min": 0.003289473708719015,
"clip_ratio/region_mean": 0.07170477387262508,
"entropy": 0.5145694836974144,
"epoch": 0.00628,
"grad_norm": 8.991610527038574,
"kl": 2.500880379229784,
"learning_rate": 9.999856882775207e-05,
"loss": 0.0362,
"step": 314,
"step_time": 9.686568430998705
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0008680555620230734,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0008680555620230734,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1341.0,
"completions/max_terminated_length": 1341.0,
"completions/mean_length": 1152.8125,
"completions/mean_terminated_length": 1152.8125,
"completions/min_length": 203.0,
"completions/min_terminated_length": 203.0,
"entropy": 0.47399672865867615,
"epoch": 0.0063,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9034998416900635,
"kl": 0.8184376284480095,
"learning_rate": 9.999855851309658e-05,
"loss": 0.0293,
"num_tokens": 17349042.0,
"reward": 2.9872653484344482,
"reward_std": 10.313895225524902,
"rewards/rollout_reward_func/mean": 2.9872655868530273,
"rewards/rollout_reward_func/std": 11.123116493225098,
"sampling/importance_sampling_ratio/max": 1.5023268461227417,
"sampling/importance_sampling_ratio/mean": 0.9912445545196533,
"sampling/importance_sampling_ratio/min": 0.5293837189674377,
"sampling/sampling_logp_difference/max": 0.49621057510375977,
"sampling/sampling_logp_difference/mean": 0.01648723892867565,
"step": 315,
"step_time": 36.106211563002034
},
{
"clip_ratio/high_max": 0.07373366155661643,
"clip_ratio/high_mean": 0.02871302078710869,
"clip_ratio/low_mean": 0.052897133806254715,
"clip_ratio/low_min": 0.0034722222480922937,
"clip_ratio/region_mean": 0.08161015470977873,
"entropy": 0.439556997269392,
"epoch": 0.00632,
"grad_norm": 1.1542975902557373,
"kl": 0.8081017658114433,
"learning_rate": 9.999854816140556e-05,
"loss": 0.0112,
"step": 316,
"step_time": 9.529359940000177
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0008680555620230734,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0026041666860692203,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1345.0,
"completions/max_terminated_length": 1345.0,
"completions/mean_length": 1153.015625,
"completions/mean_terminated_length": 1152.2857666015625,
"completions/min_length": 391.0,
"completions/min_terminated_length": 391.0,
"entropy": 0.4202824104577303,
"epoch": 0.00634,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8387591242790222,
"kl": 0.7999376337975264,
"learning_rate": 9.999853777267906e-05,
"loss": -0.0113,
"num_tokens": 17474080.0,
"reward": 3.8928003311157227,
"reward_std": 13.945871353149414,
"rewards/rollout_reward_func/mean": 3.8928003311157227,
"rewards/rollout_reward_func/std": 14.018685340881348,
"sampling/importance_sampling_ratio/max": 1.3972409963607788,
"sampling/importance_sampling_ratio/mean": 0.9933174252510071,
"sampling/importance_sampling_ratio/min": 0.66861891746521,
"sampling/sampling_logp_difference/max": 0.3364081382751465,
"sampling/sampling_logp_difference/mean": 0.013292517513036728,
"step": 317,
"step_time": 35.11344056699909
},
{
"clip_ratio/high_max": 0.062046968610957265,
"clip_ratio/high_mean": 0.02252296026563272,
"clip_ratio/low_mean": 0.06663749110884964,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.08916045201476663,
"entropy": 0.3698996100574732,
"epoch": 0.00636,
"grad_norm": 0.5773271918296814,
"kl": 0.9309169836342335,
"learning_rate": 9.999852734691706e-05,
"loss": -0.0303,
"step": 318,
"step_time": 9.084739140999773
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0034635705524124205,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034635705524124205,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1360.0,
"completions/max_terminated_length": 1360.0,
"completions/mean_length": 1189.828125,
"completions/mean_terminated_length": 1189.828125,
"completions/min_length": 1056.0,
"completions/min_terminated_length": 1056.0,
"entropy": 0.3289623577147722,
"epoch": 0.00638,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7116008400917053,
"kl": 0.9500053711235523,
"learning_rate": 9.999851688411959e-05,
"loss": 0.0123,
"num_tokens": 17601410.0,
"reward": 4.444620609283447,
"reward_std": 12.232638359069824,
"rewards/rollout_reward_func/mean": 4.444620609283447,
"rewards/rollout_reward_func/std": 12.037857055664062,
"sampling/importance_sampling_ratio/max": 1.8450855016708374,
"sampling/importance_sampling_ratio/mean": 0.9873309135437012,
"sampling/importance_sampling_ratio/min": 2.6370022485067146e-11,
"sampling/sampling_logp_difference/max": 11.255170822143555,
"sampling/sampling_logp_difference/mean": 0.033629726618528366,
"step": 319,
"step_time": 38.23992628900032
},
{
"clip_ratio/high_max": 0.06827694294042885,
"clip_ratio/high_mean": 0.024013680347707123,
"clip_ratio/low_mean": 0.04197527136420831,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.06598895112983882,
"entropy": 0.3273693434894085,
"epoch": 0.0064,
"grad_norm": 0.5724111795425415,
"kl": 1.0788306891918182,
"learning_rate": 9.999850638428662e-05,
"loss": 0.0049,
"step": 320,
"step_time": 10.348325264999403
},
{
"clip_ratio/high_max": 0.009027777938172221,
"clip_ratio/high_mean": 0.0022569444845430553,
"clip_ratio/low_mean": 0.002170138934161514,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004427083418704569,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1336.0,
"completions/max_terminated_length": 1336.0,
"completions/mean_length": 1203.65625,
"completions/mean_terminated_length": 1203.65625,
"completions/min_length": 898.0,
"completions/min_terminated_length": 898.0,
"entropy": 0.3083435148000717,
"epoch": 0.00642,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8611142039299011,
"kl": 0.9252029061317444,
"learning_rate": 9.99984958474182e-05,
"loss": 0.024,
"num_tokens": 17729704.0,
"reward": 2.3030171394348145,
"reward_std": 10.394119262695312,
"rewards/rollout_reward_func/mean": 2.3030171394348145,
"rewards/rollout_reward_func/std": 11.775047302246094,
"sampling/importance_sampling_ratio/max": 1.6587164402008057,
"sampling/importance_sampling_ratio/mean": 1.0117642879486084,
"sampling/importance_sampling_ratio/min": 0.4190000295639038,
"sampling/sampling_logp_difference/max": 0.3578883409500122,
"sampling/sampling_logp_difference/mean": 0.015003521926701069,
"step": 321,
"step_time": 35.224505068000326
},
{
"clip_ratio/high_max": 0.07542938669212162,
"clip_ratio/high_mean": 0.03128034179098904,
"clip_ratio/low_mean": 0.03897239360958338,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0702527352841571,
"entropy": 0.2796294568106532,
"epoch": 0.00644,
"grad_norm": 0.5799975395202637,
"kl": 0.8807330075651407,
"learning_rate": 9.999848527351433e-05,
"loss": 0.0091,
"step": 322,
"step_time": 9.679342022999663
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1294.0,
"completions/max_terminated_length": 1294.0,
"completions/mean_length": 1194.96875,
"completions/mean_terminated_length": 1194.96875,
"completions/min_length": 300.0,
"completions/min_terminated_length": 300.0,
"entropy": 0.24392448458820581,
"epoch": 0.00646,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7959789633750916,
"kl": 0.779301343485713,
"learning_rate": 9.9998474662575e-05,
"loss": -0.0199,
"num_tokens": 17857450.0,
"reward": 4.581869125366211,
"reward_std": 11.262429237365723,
"rewards/rollout_reward_func/mean": 4.581869602203369,
"rewards/rollout_reward_func/std": 12.287596702575684,
"sampling/importance_sampling_ratio/max": 2.240818977355957,
"sampling/importance_sampling_ratio/mean": 1.018520712852478,
"sampling/importance_sampling_ratio/min": 0.3999040722846985,
"sampling/sampling_logp_difference/max": 0.5928263664245605,
"sampling/sampling_logp_difference/mean": 0.012124484404921532,
"step": 323,
"step_time": 37.114893339001355
},
{
"clip_ratio/high_max": 0.0683479537256062,
"clip_ratio/high_mean": 0.022346383950207382,
"clip_ratio/low_mean": 0.03898888279218227,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.06133526662597433,
"entropy": 0.2431696206331253,
"epoch": 0.00648,
"grad_norm": 0.35803645849227905,
"kl": 0.767679963260889,
"learning_rate": 9.999846401460026e-05,
"loss": -0.0339,
"step": 324,
"step_time": 8.890772448001371
},
{
"clip_ratio/high_max": 0.015318243764340878,
"clip_ratio/high_mean": 0.0038295609410852194,
"clip_ratio/low_mean": 0.0023561508278362453,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006185711768921465,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1359.0,
"completions/max_terminated_length": 1359.0,
"completions/mean_length": 1233.796875,
"completions/mean_terminated_length": 1233.796875,
"completions/min_length": 663.0,
"completions/min_terminated_length": 663.0,
"entropy": 0.24552472867071629,
"epoch": 0.0065,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7995591163635254,
"kl": 0.8211700264364481,
"learning_rate": 9.99984533295901e-05,
"loss": -0.0057,
"num_tokens": 17987636.0,
"reward": 2.361278533935547,
"reward_std": 11.01347541809082,
"rewards/rollout_reward_func/mean": 2.361278533935547,
"rewards/rollout_reward_func/std": 11.316116333007812,
"sampling/importance_sampling_ratio/max": 1.4373282194137573,
"sampling/importance_sampling_ratio/mean": 0.9916459321975708,
"sampling/importance_sampling_ratio/min": 0.7290171384811401,
"sampling/sampling_logp_difference/max": 0.3705787658691406,
"sampling/sampling_logp_difference/mean": 0.01046331413090229,
"step": 325,
"step_time": 38.97714790000191
},
{
"clip_ratio/high_max": 0.05977182672359049,
"clip_ratio/high_mean": 0.02233774628257379,
"clip_ratio/low_mean": 0.027810412109829485,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.05014815804315731,
"entropy": 0.24062953237444162,
"epoch": 0.00652,
"grad_norm": 0.6732361316680908,
"kl": 0.9134266618639231,
"learning_rate": 9.999844260754451e-05,
"loss": -0.011,
"step": 326,
"step_time": 9.702275648000068
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1343.0,
"completions/max_terminated_length": 1343.0,
"completions/mean_length": 1266.625,
"completions/mean_terminated_length": 1266.625,
"completions/min_length": 1005.0,
"completions/min_terminated_length": 1005.0,
"entropy": 0.19475865550339222,
"epoch": 0.00654,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7081814408302307,
"kl": 0.7645703088492155,
"learning_rate": 9.999843184846354e-05,
"loss": 0.0194,
"num_tokens": 18120014.0,
"reward": 6.7441205978393555,
"reward_std": 12.950173377990723,
"rewards/rollout_reward_func/mean": 6.7441205978393555,
"rewards/rollout_reward_func/std": 13.17819881439209,
"sampling/importance_sampling_ratio/max": 2.733876943588257,
"sampling/importance_sampling_ratio/mean": 1.017435908317566,
"sampling/importance_sampling_ratio/min": 0.8077232837677002,
"sampling/sampling_logp_difference/max": 1.0649070739746094,
"sampling/sampling_logp_difference/mean": 0.008961044251918793,
"step": 327,
"step_time": 38.45977644300001
},
{
"clip_ratio/high_max": 0.05813231039792299,
"clip_ratio/high_mean": 0.0188276685657911,
"clip_ratio/low_mean": 0.017785656382329762,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03661332529736683,
"entropy": 0.18874722812324762,
"epoch": 0.00656,
"grad_norm": 0.29629969596862793,
"kl": 0.7521160487085581,
"learning_rate": 9.999842105234716e-05,
"loss": 0.0089,
"step": 328,
"step_time": 9.240072366999811
},
{
"clip_ratio/high_max": 0.01736111124046147,
"clip_ratio/high_mean": 0.004340277810115367,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004340277810115367,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1335.0,
"completions/max_terminated_length": 1335.0,
"completions/mean_length": 1223.046875,
"completions/mean_terminated_length": 1223.046875,
"completions/min_length": 1069.0,
"completions/min_terminated_length": 1069.0,
"entropy": 0.17775962874293327,
"epoch": 0.00658,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5002116560935974,
"kl": 0.5365802068263292,
"learning_rate": 9.999841021919543e-05,
"loss": -0.0003,
"num_tokens": 18249422.0,
"reward": 5.926024436950684,
"reward_std": 10.913434028625488,
"rewards/rollout_reward_func/mean": 5.926024436950684,
"rewards/rollout_reward_func/std": 11.495051383972168,
"sampling/importance_sampling_ratio/max": 1.340820550918579,
"sampling/importance_sampling_ratio/mean": 0.9783110618591309,
"sampling/importance_sampling_ratio/min": 0.5937914848327637,
"sampling/sampling_logp_difference/max": 0.4624512195587158,
"sampling/sampling_logp_difference/mean": 0.009367045015096664,
"step": 329,
"step_time": 40.06546166299813
},
{
"clip_ratio/high_max": 0.049223856534808874,
"clip_ratio/high_mean": 0.015778186498209834,
"clip_ratio/low_mean": 0.02711397095117718,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04289215768221766,
"entropy": 0.16470052115619183,
"epoch": 0.0066,
"grad_norm": 0.272549033164978,
"kl": 0.5706925727427006,
"learning_rate": 9.999839934900832e-05,
"loss": -0.0098,
"step": 330,
"step_time": 9.584335595999619
},
{
"clip_ratio/high_max": 0.006761695956811309,
"clip_ratio/high_mean": 0.0016904239892028272,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0016904239892028272,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1346.0,
"completions/max_terminated_length": 1346.0,
"completions/mean_length": 1230.453125,
"completions/mean_terminated_length": 1230.453125,
"completions/min_length": 195.0,
"completions/min_terminated_length": 195.0,
"entropy": 0.15301176952198148,
"epoch": 0.00662,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5951566696166992,
"kl": 0.6225019320845604,
"learning_rate": 9.999838844178584e-05,
"loss": -0.0415,
"num_tokens": 18379457.0,
"reward": 5.441021919250488,
"reward_std": 11.596078872680664,
"rewards/rollout_reward_func/mean": 5.441021919250488,
"rewards/rollout_reward_func/std": 13.130385398864746,
"sampling/importance_sampling_ratio/max": 1.2981815338134766,
"sampling/importance_sampling_ratio/mean": 0.9712120294570923,
"sampling/importance_sampling_ratio/min": 0.5313878655433655,
"sampling/sampling_logp_difference/max": 0.4391303062438965,
"sampling/sampling_logp_difference/mean": 0.008532309904694557,
"step": 331,
"step_time": 38.03544032799982
},
{
"clip_ratio/high_max": 0.040491855004802346,
"clip_ratio/high_mean": 0.015382359270006418,
"clip_ratio/low_mean": 0.02635878958972171,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04174114967463538,
"entropy": 0.13200736604630947,
"epoch": 0.00664,
"grad_norm": 0.4791216552257538,
"kl": 0.6594886407256126,
"learning_rate": 9.999837749752803e-05,
"loss": -0.0494,
"step": 332,
"step_time": 9.623436052000216
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0008680555620230734,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004340277810115367,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1344.0,
"completions/max_terminated_length": 1344.0,
"completions/mean_length": 1219.734375,
"completions/mean_terminated_length": 1219.734375,
"completions/min_length": 196.0,
"completions/min_terminated_length": 196.0,
"entropy": 0.11573670757934451,
"epoch": 0.00666,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6563036441802979,
"kl": 0.851641334593296,
"learning_rate": 9.999836651623487e-05,
"loss": -0.0048,
"num_tokens": 18508741.0,
"reward": 5.738734245300293,
"reward_std": 12.408971786499023,
"rewards/rollout_reward_func/mean": 5.738734245300293,
"rewards/rollout_reward_func/std": 12.671599388122559,
"sampling/importance_sampling_ratio/max": 1.3496527671813965,
"sampling/importance_sampling_ratio/mean": 1.0077811479568481,
"sampling/importance_sampling_ratio/min": 0.6974970102310181,
"sampling/sampling_logp_difference/max": 0.3328993320465088,
"sampling/sampling_logp_difference/mean": 0.00713011808693409,
"step": 333,
"step_time": 39.30893147200186
},
{
"clip_ratio/high_max": 0.03472222248092294,
"clip_ratio/high_mean": 0.011284722364507616,
"clip_ratio/low_mean": 0.024913194763939828,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03619791695382446,
"entropy": 0.11344034224748611,
"epoch": 0.00668,
"grad_norm": 0.4923565983772278,
"kl": 0.6980615984648466,
"learning_rate": 9.999835549790641e-05,
"loss": -0.0079,
"step": 334,
"step_time": 10.117216201999327
},
{
"clip_ratio/high_max": 0.0036764706019312143,
"clip_ratio/high_mean": 0.0009191176504828036,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004391339898575097,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1359.0,
"completions/max_terminated_length": 1359.0,
"completions/mean_length": 1238.984375,
"completions/mean_terminated_length": 1238.984375,
"completions/min_length": 635.0,
"completions/min_terminated_length": 635.0,
"entropy": 0.12856985442340374,
"epoch": 0.0067,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8339588642120361,
"kl": 0.6355916745960712,
"learning_rate": 9.999834444254262e-05,
"loss": -0.0042,
"num_tokens": 18639311.0,
"reward": 6.144355297088623,
"reward_std": 13.870889663696289,
"rewards/rollout_reward_func/mean": 6.144355297088623,
"rewards/rollout_reward_func/std": 14.220029830932617,
"sampling/importance_sampling_ratio/max": 1.3300856351852417,
"sampling/importance_sampling_ratio/mean": 0.9924861788749695,
"sampling/importance_sampling_ratio/min": 0.6479190587997437,
"sampling/sampling_logp_difference/max": 0.2639361619949341,
"sampling/sampling_logp_difference/mean": 0.006817285902798176,
"step": 335,
"step_time": 37.15376971599926
},
{
"clip_ratio/high_max": 0.03817401989363134,
"clip_ratio/high_mean": 0.012147671717684716,
"clip_ratio/low_mean": 0.020067402394488454,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03221507422858849,
"entropy": 0.12425063038244843,
"epoch": 0.00672,
"grad_norm": 0.32718658447265625,
"kl": 0.7219895403832197,
"learning_rate": 9.999833335014352e-05,
"loss": -0.011,
"step": 336,
"step_time": 9.780628326000624
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0008680555620230734,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0026041666860692203,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1356.0,
"completions/max_terminated_length": 1356.0,
"completions/mean_length": 1266.78125,
"completions/mean_terminated_length": 1266.78125,
"completions/min_length": 794.0,
"completions/min_terminated_length": 794.0,
"entropy": 0.11600295826792717,
"epoch": 0.00674,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7365610003471375,
"kl": 0.5392901804298162,
"learning_rate": 9.999832222070914e-05,
"loss": 0.0023,
"num_tokens": 18771742.0,
"reward": 5.880302429199219,
"reward_std": 12.320051193237305,
"rewards/rollout_reward_func/mean": 5.880302429199219,
"rewards/rollout_reward_func/std": 12.716879844665527,
"sampling/importance_sampling_ratio/max": 1.3457348346710205,
"sampling/importance_sampling_ratio/mean": 0.9991644620895386,
"sampling/importance_sampling_ratio/min": 0.6999140381813049,
"sampling/sampling_logp_difference/max": 0.3562436103820801,
"sampling/sampling_logp_difference/mean": 0.005911126732826233,
"step": 337,
"step_time": 38.84034024799803
},
{
"clip_ratio/high_max": 0.03513071918860078,
"clip_ratio/high_mean": 0.011386846599634737,
"clip_ratio/low_mean": 0.024994894862174988,
"clip_ratio/low_min": 0.0034722222480922937,
"clip_ratio/region_mean": 0.036381741403602064,
"entropy": 0.1102461889386177,
"epoch": 0.00676,
"grad_norm": 0.2775817811489105,
"kl": 0.6620934028178453,
"learning_rate": 9.999831105423947e-05,
"loss": -0.006,
"step": 338,
"step_time": 9.023721004000436
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1350.0,
"completions/max_terminated_length": 1350.0,
"completions/mean_length": 1238.953125,
"completions/mean_terminated_length": 1238.953125,
"completions/min_length": 1062.0,
"completions/min_terminated_length": 1062.0,
"entropy": 0.11055759433656931,
"epoch": 0.00678,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5181728601455688,
"kl": 0.462925398722291,
"learning_rate": 9.999829985073453e-05,
"loss": 0.0105,
"num_tokens": 18902239.0,
"reward": 7.53302526473999,
"reward_std": 12.4171142578125,
"rewards/rollout_reward_func/mean": 7.533025741577148,
"rewards/rollout_reward_func/std": 13.036537170410156,
"sampling/importance_sampling_ratio/max": 1.3853559494018555,
"sampling/importance_sampling_ratio/mean": 1.000986933708191,
"sampling/importance_sampling_ratio/min": 0.702711284160614,
"sampling/sampling_logp_difference/max": 0.4794572591781616,
"sampling/sampling_logp_difference/mean": 0.00590522913262248,
"step": 339,
"step_time": 39.08791527499943
},
{
"clip_ratio/high_max": 0.039215686498209834,
"clip_ratio/high_mean": 0.013276143989060074,
"clip_ratio/low_mean": 0.02185995056061074,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03513609484070912,
"entropy": 0.11709691304713488,
"epoch": 0.0068,
"grad_norm": 0.30310577154159546,
"kl": 0.5310502368956804,
"learning_rate": 9.999828861019435e-05,
"loss": 0.006,
"step": 340,
"step_time": 9.792470953999327
},
{
"clip_ratio/high_max": 0.013888888992369175,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0009191176504828036,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004391339898575097,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1347.0,
"completions/max_terminated_length": 1347.0,
"completions/mean_length": 1241.765625,
"completions/mean_terminated_length": 1240.635009765625,
"completions/min_length": 1101.0,
"completions/min_terminated_length": 1101.0,
"entropy": 0.12758585345000029,
"epoch": 0.00682,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6102613210678101,
"kl": 0.6113391723483801,
"learning_rate": 9.99982773326189e-05,
"loss": 0.0158,
"num_tokens": 19032925.0,
"reward": 3.9826180934906006,
"reward_std": 12.427906036376953,
"rewards/rollout_reward_func/mean": 3.9826183319091797,
"rewards/rollout_reward_func/std": 13.354879379272461,
"sampling/importance_sampling_ratio/max": 1.1800951957702637,
"sampling/importance_sampling_ratio/mean": 0.997043251991272,
"sampling/importance_sampling_ratio/min": 0.7389504313468933,
"sampling/sampling_logp_difference/max": 0.2936210632324219,
"sampling/sampling_logp_difference/mean": 0.005317248869687319,
"step": 341,
"step_time": 39.12008871799935
},
{
"clip_ratio/high_max": 0.021037581842392683,
"clip_ratio/high_mean": 0.006995506584644318,
"clip_ratio/low_mean": 0.016595179855357856,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.023590686498209834,
"entropy": 0.12726877955719829,
"epoch": 0.00684,
"grad_norm": 0.49857455492019653,
"kl": 0.6323374789208174,
"learning_rate": 9.999826601800824e-05,
"loss": 0.0106,
"step": 342,
"step_time": 9.27907708300063
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0008680555620230734,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0008680555620230734,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1341.0,
"completions/max_terminated_length": 1341.0,
"completions/mean_length": 1219.203125,
"completions/mean_terminated_length": 1219.203125,
"completions/min_length": 735.0,
"completions/min_terminated_length": 735.0,
"entropy": 0.11009268835186958,
"epoch": 0.00686,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6059837341308594,
"kl": 0.7338532544672489,
"learning_rate": 9.999825466636233e-05,
"loss": -0.0167,
"num_tokens": 19162127.0,
"reward": 4.720416069030762,
"reward_std": 10.753931999206543,
"rewards/rollout_reward_func/mean": 4.720416069030762,
"rewards/rollout_reward_func/std": 12.976871490478516,
"sampling/importance_sampling_ratio/max": 1.5183767080307007,
"sampling/importance_sampling_ratio/mean": 1.0038487911224365,
"sampling/importance_sampling_ratio/min": 0.6935895681381226,
"sampling/sampling_logp_difference/max": 0.4249706268310547,
"sampling/sampling_logp_difference/mean": 0.004785279743373394,
"step": 343,
"step_time": 39.04033868700208
},
{
"clip_ratio/high_max": 0.02798202633857727,
"clip_ratio/high_mean": 0.012203840189613402,
"clip_ratio/low_mean": 0.013071895577013493,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.025275735824834555,
"entropy": 0.11364737106487155,
"epoch": 0.00688,
"grad_norm": 0.2909540832042694,
"kl": 0.7444342169910669,
"learning_rate": 9.999824327768122e-05,
"loss": -0.0205,
"step": 344,
"step_time": 9.620514073000777
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0008680555620230734,
"clip_ratio/low_mean": 0.0008680555620230734,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1359.0,
"completions/max_terminated_length": 1359.0,
"completions/mean_length": 1240.3125,
"completions/mean_terminated_length": 1240.3125,
"completions/min_length": 1088.0,
"completions/min_terminated_length": 1088.0,
"entropy": 0.13259067060425878,
"epoch": 0.0069,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8596341609954834,
"kl": 0.7895576078444719,
"learning_rate": 9.99982318519649e-05,
"loss": -0.0057,
"num_tokens": 19292776.0,
"reward": 2.67919659614563,
"reward_std": 14.777613639831543,
"rewards/rollout_reward_func/mean": 2.679196357727051,
"rewards/rollout_reward_func/std": 15.276268005371094,
"sampling/importance_sampling_ratio/max": 1.4407294988632202,
"sampling/importance_sampling_ratio/mean": 0.9704160690307617,
"sampling/importance_sampling_ratio/min": 0.6675639152526855,
"sampling/sampling_logp_difference/max": 0.4319186210632324,
"sampling/sampling_logp_difference/mean": 0.007419218309223652,
"step": 345,
"step_time": 38.83812410200153
},
{
"clip_ratio/high_max": 0.03472222248092294,
"clip_ratio/high_mean": 0.009548611182253808,
"clip_ratio/low_mean": 0.03599877539090812,
"clip_ratio/low_min": 0.0034722222480922937,
"clip_ratio/region_mean": 0.045547386282123625,
"entropy": 0.1278433846309781,
"epoch": 0.00692,
"grad_norm": 0.6303772330284119,
"kl": 1.1542848944664001,
"learning_rate": 9.999822038921338e-05,
"loss": -0.0049,
"step": 346,
"step_time": 9.405012558002454
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0025584796094335616,
"clip_ratio/low_mean": 0.0026041666860692203,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005162646295502782,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1353.0,
"completions/max_terminated_length": 1353.0,
"completions/mean_length": 1241.171875,
"completions/mean_terminated_length": 1241.171875,
"completions/min_length": 719.0,
"completions/min_terminated_length": 719.0,
"entropy": 0.1106796741951257,
"epoch": 0.00694,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4797411561012268,
"kl": 0.6278085261583328,
"learning_rate": 9.99982088894267e-05,
"loss": 0.0106,
"num_tokens": 19423491.0,
"reward": 5.54637336730957,
"reward_std": 12.041938781738281,
"rewards/rollout_reward_func/mean": 5.54637336730957,
"rewards/rollout_reward_func/std": 13.066041946411133,
"sampling/importance_sampling_ratio/max": 1.482460618019104,
"sampling/importance_sampling_ratio/mean": 0.9943655133247375,
"sampling/importance_sampling_ratio/min": 0.6257169246673584,
"sampling/sampling_logp_difference/max": 0.510839581489563,
"sampling/sampling_logp_difference/mean": 0.006749512627720833,
"step": 347,
"step_time": 39.46550670700071
},
{
"clip_ratio/high_max": 0.05993883335031569,
"clip_ratio/high_mean": 0.015852763841394335,
"clip_ratio/low_mean": 0.02315665892092511,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0390094225294888,
"entropy": 0.11026378069072962,
"epoch": 0.00696,
"grad_norm": 0.3181508183479309,
"kl": 0.6370288580656052,
"learning_rate": 9.999819735260483e-05,
"loss": 0.0068,
"step": 348,
"step_time": 10.100482684999406
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.002517361135687679,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002517361135687679,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1332.0,
"completions/max_terminated_length": 1332.0,
"completions/mean_length": 1208.171875,
"completions/mean_terminated_length": 1208.171875,
"completions/min_length": 189.0,
"completions/min_terminated_length": 189.0,
"entropy": 0.13927970174700022,
"epoch": 0.00698,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5229349732398987,
"kl": 0.5507344976067543,
"learning_rate": 9.999818577874781e-05,
"loss": 0.0234,
"num_tokens": 19552011.0,
"reward": 5.104192733764648,
"reward_std": 11.615788459777832,
"rewards/rollout_reward_func/mean": 5.104192733764648,
"rewards/rollout_reward_func/std": 12.1382474899292,
"sampling/importance_sampling_ratio/max": 1.415814995765686,
"sampling/importance_sampling_ratio/mean": 1.0048539638519287,
"sampling/importance_sampling_ratio/min": 1.834242700438695e-16,
"sampling/sampling_logp_difference/max": 27.188508987426758,
"sampling/sampling_logp_difference/mean": 0.039306361228227615,
"step": 349,
"step_time": 37.862728561997756
},
{
"clip_ratio/high_max": 0.05868378118611872,
"clip_ratio/high_mean": 0.017275112157221884,
"clip_ratio/low_mean": 0.013766340038273484,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03104145231191069,
"entropy": 0.14939681394025683,
"epoch": 0.007,
"grad_norm": 0.3000262379646301,
"kl": 0.5239376667886972,
"learning_rate": 9.999817416785565e-05,
"loss": 0.0173,
"step": 350,
"step_time": 9.859687822999149
},
{
"clip_ratio/high_max": 0.013706140452995896,
"clip_ratio/high_mean": 0.003426535113248974,
"clip_ratio/low_mean": 0.0026041666860692203,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006030701799318194,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1342.0,
"completions/max_terminated_length": 1342.0,
"completions/mean_length": 1236.90625,
"completions/mean_terminated_length": 1236.90625,
"completions/min_length": 193.0,
"completions/min_terminated_length": 193.0,
"entropy": 0.14154944382607937,
"epoch": 0.00702,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.47570475935935974,
"kl": 0.5030098669230938,
"learning_rate": 9.999816251992836e-05,
"loss": -0.0158,
"num_tokens": 19682494.0,
"reward": 4.190635681152344,
"reward_std": 14.216930389404297,
"rewards/rollout_reward_func/mean": 4.190635681152344,
"rewards/rollout_reward_func/std": 14.30445671081543,
"sampling/importance_sampling_ratio/max": 1.5223360061645508,
"sampling/importance_sampling_ratio/mean": 1.0101966857910156,
"sampling/importance_sampling_ratio/min": 0.7218723297119141,
"sampling/sampling_logp_difference/max": 0.302712082862854,
"sampling/sampling_logp_difference/mean": 0.007096399553120136,
"step": 351,
"step_time": 39.30754639200131
},
{
"clip_ratio/high_max": 0.03492647083476186,
"clip_ratio/high_mean": 0.013026208442170173,
"clip_ratio/low_mean": 0.018280228949151933,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.031306437274906784,
"entropy": 0.1433765795081854,
"epoch": 0.00704,
"grad_norm": 0.2731061577796936,
"kl": 0.5208645444363356,
"learning_rate": 9.999815083496594e-05,
"loss": -0.0214,
"step": 352,
"step_time": 9.398018900999887
},
{
"clip_ratio/high_max": 0.013888888992369175,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0069444444961845875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1348.0,
"completions/max_terminated_length": 1348.0,
"completions/mean_length": 1237.3125,
"completions/mean_terminated_length": 1237.3125,
"completions/min_length": 181.0,
"completions/min_terminated_length": 181.0,
"entropy": 0.1480951178818941,
"epoch": 0.00706,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5071465373039246,
"kl": 0.5567373130470514,
"learning_rate": 9.99981391129684e-05,
"loss": -0.008,
"num_tokens": 19812942.0,
"reward": 4.355041980743408,
"reward_std": 13.132366180419922,
"rewards/rollout_reward_func/mean": 4.355041980743408,
"rewards/rollout_reward_func/std": 13.851308822631836,
"sampling/importance_sampling_ratio/max": 1.4629567861557007,
"sampling/importance_sampling_ratio/mean": 1.0300343036651611,
"sampling/importance_sampling_ratio/min": 0.6640676856040955,
"sampling/sampling_logp_difference/max": 0.5005507469177246,
"sampling/sampling_logp_difference/mean": 0.007855242118239403,
"step": 353,
"step_time": 38.34223270599978
},
{
"clip_ratio/high_max": 0.024305555736646056,
"clip_ratio/high_mean": 0.007812500116415322,
"clip_ratio/low_mean": 0.024994894512929022,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.032807394745759666,
"entropy": 0.13850665464997292,
"epoch": 0.00708,
"grad_norm": 0.27875232696533203,
"kl": 0.5900795683264732,
"learning_rate": 9.999812735393576e-05,
"loss": -0.0167,
"step": 354,
"step_time": 9.843489302002126
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.0026041666860692203,
"clip_ratio/low_mean": 0.0008680555620230734,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1358.0,
"completions/max_terminated_length": 1358.0,
"completions/mean_length": 1228.9375,
"completions/mean_terminated_length": 1228.9375,
"completions/min_length": 699.0,
"completions/min_terminated_length": 699.0,
"entropy": 0.13509350316599011,
"epoch": 0.0071,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5068008899688721,
"kl": 0.4818702656775713,
"learning_rate": 9.999811555786804e-05,
"loss": 0.0278,
"num_tokens": 19942820.0,
"reward": 5.804719924926758,
"reward_std": 13.167655944824219,
"rewards/rollout_reward_func/mean": 5.804719924926758,
"rewards/rollout_reward_func/std": 13.18018913269043,
"sampling/importance_sampling_ratio/max": 1.306014895439148,
"sampling/importance_sampling_ratio/mean": 1.0091025829315186,
"sampling/importance_sampling_ratio/min": 0.625792384147644,
"sampling/sampling_logp_difference/max": 0.3519221544265747,
"sampling/sampling_logp_difference/mean": 0.006864185445010662,
"step": 355,
"step_time": 39.10083819900228
},
{
"clip_ratio/high_max": 0.031250000232830644,
"clip_ratio/high_mean": 0.013888889225199819,
"clip_ratio/low_mean": 0.026143791212234646,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.040032680495642126,
"entropy": 0.1260006483644247,
"epoch": 0.00712,
"grad_norm": 0.2814493179321289,
"kl": 0.5503856968134642,
"learning_rate": 9.999810372476525e-05,
"loss": 0.0244,
"step": 356,
"step_time": 9.433313735999036
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1365.0,
"completions/max_terminated_length": 1365.0,
"completions/mean_length": 1269.96875,
"completions/mean_terminated_length": 1269.96875,
"completions/min_length": 1127.0,
"completions/min_terminated_length": 1127.0,
"entropy": 0.1161547633819282,
"epoch": 0.00714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4718828499317169,
"kl": 0.9211016893386841,
"learning_rate": 9.999809185462739e-05,
"loss": 0.039,
"num_tokens": 20075371.0,
"reward": 3.961371898651123,
"reward_std": 11.789936065673828,
"rewards/rollout_reward_func/mean": 3.961371898651123,
"rewards/rollout_reward_func/std": 12.59416675567627,
"sampling/importance_sampling_ratio/max": 1.223926067352295,
"sampling/importance_sampling_ratio/mean": 0.9972316026687622,
"sampling/importance_sampling_ratio/min": 0.7068163156509399,
"sampling/sampling_logp_difference/max": 0.21517443656921387,
"sampling/sampling_logp_difference/mean": 0.0053621698170900345,
"step": 357,
"step_time": 38.67136614899937
},
{
"clip_ratio/high_max": 0.049019608180969954,
"clip_ratio/high_mean": 0.013991013227496296,
"clip_ratio/low_mean": 0.01996527804294601,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03395629138685763,
"entropy": 0.11733251390978694,
"epoch": 0.00716,
"grad_norm": 0.16502372920513153,
"kl": 0.7801671754568815,
"learning_rate": 9.999807994745449e-05,
"loss": 0.0324,
"step": 358,
"step_time": 9.794801944999563
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1343.0,
"completions/max_terminated_length": 1343.0,
"completions/mean_length": 1256.96875,
"completions/mean_terminated_length": 1256.96875,
"completions/min_length": 1011.0,
"completions/min_terminated_length": 1011.0,
"entropy": 0.13261962542310357,
"epoch": 0.00718,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4223484992980957,
"kl": 0.6454224642366171,
"learning_rate": 9.999806800324652e-05,
"loss": -0.0021,
"num_tokens": 20207093.0,
"reward": 4.6267523765563965,
"reward_std": 13.086963653564453,
"rewards/rollout_reward_func/mean": 4.626751899719238,
"rewards/rollout_reward_func/std": 14.676898956298828,
"sampling/importance_sampling_ratio/max": 1.336045265197754,
"sampling/importance_sampling_ratio/mean": 0.9978616237640381,
"sampling/importance_sampling_ratio/min": 0.6580431461334229,
"sampling/sampling_logp_difference/max": 0.287054181098938,
"sampling/sampling_logp_difference/mean": 0.0057748714461922646,
"step": 359,
"step_time": 38.74908411499746
},
{
"clip_ratio/high_max": 0.058662281604483724,
"clip_ratio/high_mean": 0.018137792707420886,
"clip_ratio/low_mean": 0.018183479725848883,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03632127266610041,
"entropy": 0.1370791387744248,
"epoch": 0.0072,
"grad_norm": 0.26030489802360535,
"kl": 0.6394520290195942,
"learning_rate": 9.999805602200354e-05,
"loss": -0.0085,
"step": 360,
"step_time": 9.32019592799952
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0008680555620230734,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0008680555620230734,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1344.0,
"completions/max_terminated_length": 1344.0,
"completions/mean_length": 1241.734375,
"completions/mean_terminated_length": 1241.734375,
"completions/min_length": 1025.0,
"completions/min_terminated_length": 1025.0,
"entropy": 0.1345509896054864,
"epoch": 0.00722,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5100898742675781,
"kl": 0.8387140035629272,
"learning_rate": 9.999804400372554e-05,
"loss": 0.007,
"num_tokens": 20337789.0,
"reward": 9.449151039123535,
"reward_std": 12.286431312561035,
"rewards/rollout_reward_func/mean": 9.449151039123535,
"rewards/rollout_reward_func/std": 13.57576847076416,
"sampling/importance_sampling_ratio/max": 1.4115562438964844,
"sampling/importance_sampling_ratio/mean": 0.990313708782196,
"sampling/importance_sampling_ratio/min": 0.6950281858444214,
"sampling/sampling_logp_difference/max": 0.3388124704360962,
"sampling/sampling_logp_difference/mean": 0.00530852098017931,
"step": 361,
"step_time": 39.235169910002696
},
{
"clip_ratio/high_max": 0.0349264710675925,
"clip_ratio/high_mean": 0.010467728832736611,
"clip_ratio/low_mean": 0.01741217344533652,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.027879902394488454,
"entropy": 0.15170079609379172,
"epoch": 0.00724,
"grad_norm": 0.33363601565361023,
"kl": 0.6247174255549908,
"learning_rate": 9.999803194841253e-05,
"loss": 0.0003,
"step": 362,
"step_time": 9.35076707999906
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0008680555620230734,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0026041666860692203,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1359.0,
"completions/max_terminated_length": 1359.0,
"completions/mean_length": 1251.59375,
"completions/mean_terminated_length": 1251.59375,
"completions/min_length": 700.0,
"completions/min_terminated_length": 700.0,
"entropy": 0.1958311009220779,
"epoch": 0.00726,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5317199230194092,
"kl": 0.5815525501966476,
"learning_rate": 9.999801985606452e-05,
"loss": 0.0042,
"num_tokens": 20469218.0,
"reward": 3.8903188705444336,
"reward_std": 13.076482772827148,
"rewards/rollout_reward_func/mean": 3.8903186321258545,
"rewards/rollout_reward_func/std": 13.372103691101074,
"sampling/importance_sampling_ratio/max": 1.3623380661010742,
"sampling/importance_sampling_ratio/mean": 1.0135592222213745,
"sampling/importance_sampling_ratio/min": 0.7123748064041138,
"sampling/sampling_logp_difference/max": 0.29522740840911865,
"sampling/sampling_logp_difference/mean": 0.006953438278287649,
"step": 363,
"step_time": 39.74288477299888
},
{
"clip_ratio/high_max": 0.06527777831070125,
"clip_ratio/high_mean": 0.01979166700039059,
"clip_ratio/low_mean": 0.021701389166992158,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04149305640021339,
"entropy": 0.2018888248130679,
"epoch": 0.00728,
"grad_norm": 0.2644880712032318,
"kl": 0.5717838387936354,
"learning_rate": 9.999800772668153e-05,
"loss": -0.0029,
"step": 364,
"step_time": 9.321073625997087
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0008680555620230734,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0026041666860692203,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1354.0,
"completions/max_terminated_length": 1354.0,
"completions/mean_length": 1249.453125,
"completions/mean_terminated_length": 1249.453125,
"completions/min_length": 467.0,
"completions/min_terminated_length": 467.0,
"entropy": 0.18510928004980087,
"epoch": 0.0073,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.442364364862442,
"kl": 0.4795332048088312,
"learning_rate": 9.999799556026358e-05,
"loss": -0.0238,
"num_tokens": 20600462.0,
"reward": 6.273903846740723,
"reward_std": 12.39173698425293,
"rewards/rollout_reward_func/mean": 6.273903846740723,
"rewards/rollout_reward_func/std": 13.681985855102539,
"sampling/importance_sampling_ratio/max": 1.3438879251480103,
"sampling/importance_sampling_ratio/mean": 0.9609812498092651,
"sampling/importance_sampling_ratio/min": 0.6316797733306885,
"sampling/sampling_logp_difference/max": 0.33423590660095215,
"sampling/sampling_logp_difference/mean": 0.007498072925955057,
"step": 365,
"step_time": 38.43022035099784
},
{
"clip_ratio/high_max": 0.05171783687546849,
"clip_ratio/high_mean": 0.013797514839097857,
"clip_ratio/low_mean": 0.007766812981572002,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0215643277624622,
"entropy": 0.18268039543181658,
"epoch": 0.00732,
"grad_norm": 0.2666545808315277,
"kl": 0.47542588133364916,
"learning_rate": 9.999798335681066e-05,
"loss": -0.0309,
"step": 366,
"step_time": 9.454387761999897
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0016904239892028272,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0016904239892028272,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1344.0,
"completions/max_terminated_length": 1344.0,
"completions/mean_length": 1222.125,
"completions/mean_terminated_length": 1222.125,
"completions/min_length": 999.0,
"completions/min_terminated_length": 999.0,
"entropy": 0.21282331459224224,
"epoch": 0.00734,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8037834763526917,
"kl": 0.6722489278763533,
"learning_rate": 9.99979711163228e-05,
"loss": 0.0148,
"num_tokens": 20729886.0,
"reward": 5.174856662750244,
"reward_std": 11.355770111083984,
"rewards/rollout_reward_func/mean": 5.174857139587402,
"rewards/rollout_reward_func/std": 11.9678955078125,
"sampling/importance_sampling_ratio/max": 1.8758124113082886,
"sampling/importance_sampling_ratio/mean": 1.0103557109832764,
"sampling/importance_sampling_ratio/min": 0.7285647392272949,
"sampling/sampling_logp_difference/max": 0.3263084888458252,
"sampling/sampling_logp_difference/mean": 0.008695240132510662,
"step": 367,
"step_time": 37.67410640100388
},
{
"clip_ratio/high_max": 0.036011905409395695,
"clip_ratio/high_mean": 0.01073908741818741,
"clip_ratio/low_mean": 0.02561856439569965,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0363576520467177,
"entropy": 0.21391737554222345,
"epoch": 0.00736,
"grad_norm": 0.4400097727775574,
"kl": 0.749302851036191,
"learning_rate": 9.999795883880001e-05,
"loss": 0.005,
"step": 368,
"step_time": 9.906740201999128
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.003426535113248974,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003426535113248974,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1348.0,
"completions/max_terminated_length": 1348.0,
"completions/mean_length": 1231.171875,
"completions/mean_terminated_length": 1231.171875,
"completions/min_length": 994.0,
"completions/min_terminated_length": 994.0,
"entropy": 0.2234082594513893,
"epoch": 0.00738,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0522061586380005,
"kl": 0.9143509455025196,
"learning_rate": 9.999794652424228e-05,
"loss": 0.0039,
"num_tokens": 20859908.0,
"reward": 8.738959312438965,
"reward_std": 11.845466613769531,
"rewards/rollout_reward_func/mean": 8.738959312438965,
"rewards/rollout_reward_func/std": 12.123114585876465,
"sampling/importance_sampling_ratio/max": 1.3250545263290405,
"sampling/importance_sampling_ratio/mean": 1.020609974861145,
"sampling/importance_sampling_ratio/min": 0.5885343551635742,
"sampling/sampling_logp_difference/max": 0.4251088500022888,
"sampling/sampling_logp_difference/mean": 0.008847212418913841,
"step": 369,
"step_time": 38.576368669004296
},
{
"clip_ratio/high_max": 0.05455874605104327,
"clip_ratio/high_mean": 0.016198166005779058,
"clip_ratio/low_mean": 0.03593064745655283,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.05212881352053955,
"entropy": 0.22893889155238867,
"epoch": 0.0074,
"grad_norm": 0.39881831407546997,
"kl": 1.0176227018237114,
"learning_rate": 9.999793417264966e-05,
"loss": -0.0017,
"step": 370,
"step_time": 8.888960126005259
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0008680555620230734,
"clip_ratio/low_mean": 0.0008680555620230734,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1348.0,
"completions/max_terminated_length": 1348.0,
"completions/mean_length": 1242.28125,
"completions/mean_terminated_length": 1242.28125,
"completions/min_length": 913.0,
"completions/min_terminated_length": 913.0,
"entropy": 0.2308051260188222,
"epoch": 0.00742,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5715855956077576,
"kl": 0.87254199385643,
"learning_rate": 9.999792178402214e-05,
"loss": -0.0234,
"num_tokens": 20990697.0,
"reward": 6.123772621154785,
"reward_std": 10.485084533691406,
"rewards/rollout_reward_func/mean": 6.123772144317627,
"rewards/rollout_reward_func/std": 11.30632209777832,
"sampling/importance_sampling_ratio/max": 1.4539350271224976,
"sampling/importance_sampling_ratio/mean": 1.0005998611450195,
"sampling/importance_sampling_ratio/min": 0.5505736470222473,
"sampling/sampling_logp_difference/max": 0.3510777950286865,
"sampling/sampling_logp_difference/mean": 0.009203520603477955,
"step": 371,
"step_time": 38.65458783400027
},
{
"clip_ratio/high_max": 0.045200163731351495,
"clip_ratio/high_mean": 0.013036152173299342,
"clip_ratio/low_mean": 0.023381332110147923,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03641748463269323,
"entropy": 0.2447242382913828,
"epoch": 0.00744,
"grad_norm": 0.298229455947876,
"kl": 0.8313354179263115,
"learning_rate": 9.999790935835973e-05,
"loss": -0.0303,
"step": 372,
"step_time": 9.79756171700501
},
{
"clip_ratio/high_max": 0.010051169665530324,
"clip_ratio/high_mean": 0.002512792416382581,
"clip_ratio/low_mean": 0.0008680555620230734,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0033808479784056544,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1358.0,
"completions/max_terminated_length": 1358.0,
"completions/mean_length": 1223.75,
"completions/mean_terminated_length": 1223.75,
"completions/min_length": 908.0,
"completions/min_terminated_length": 908.0,
"entropy": 0.25220474135130644,
"epoch": 0.00746,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5470872521400452,
"kl": 0.7501334678381681,
"learning_rate": 9.999789689566245e-05,
"loss": 0.0016,
"num_tokens": 21120250.0,
"reward": 4.980414867401123,
"reward_std": 13.811859130859375,
"rewards/rollout_reward_func/mean": 4.980414867401123,
"rewards/rollout_reward_func/std": 15.705443382263184,
"sampling/importance_sampling_ratio/max": 1.4358227252960205,
"sampling/importance_sampling_ratio/mean": 0.9660643339157104,
"sampling/importance_sampling_ratio/min": 0.4660157859325409,
"sampling/sampling_logp_difference/max": 0.5715584754943848,
"sampling/sampling_logp_difference/mean": 0.011051887646317482,
"step": 373,
"step_time": 37.840678287995615
},
{
"clip_ratio/high_max": 0.07236842135898769,
"clip_ratio/high_mean": 0.021564327646046877,
"clip_ratio/low_mean": 0.025087612215429544,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04665194044355303,
"entropy": 0.2590667102485895,
"epoch": 0.00748,
"grad_norm": 0.33827197551727295,
"kl": 0.7311984747648239,
"learning_rate": 9.999788439593031e-05,
"loss": -0.0111,
"step": 374,
"step_time": 8.941922394003996
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0008680555620230734,
"clip_ratio/low_mean": 0.0008680555620230734,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1351.0,
"completions/max_terminated_length": 1351.0,
"completions/mean_length": 1216.34375,
"completions/mean_terminated_length": 1216.34375,
"completions/min_length": 993.0,
"completions/min_terminated_length": 993.0,
"entropy": 0.2550716269761324,
"epoch": 0.0075,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8335476517677307,
"kl": 0.9307033438235521,
"learning_rate": 9.999787185916331e-05,
"loss": 0.0294,
"num_tokens": 21249311.0,
"reward": 5.41689920425415,
"reward_std": 12.388166427612305,
"rewards/rollout_reward_func/mean": 5.416898727416992,
"rewards/rollout_reward_func/std": 13.267603874206543,
"sampling/importance_sampling_ratio/max": 1.4684741497039795,
"sampling/importance_sampling_ratio/mean": 1.0054916143417358,
"sampling/importance_sampling_ratio/min": 0.6331810355186462,
"sampling/sampling_logp_difference/max": 0.2799299955368042,
"sampling/sampling_logp_difference/mean": 0.01017037034034729,
"step": 375,
"step_time": 38.13824768100312
},
{
"clip_ratio/high_max": 0.06950894417241216,
"clip_ratio/high_mean": 0.021580452797934413,
"clip_ratio/low_mean": 0.02794391370844096,
"clip_ratio/low_min": 0.0034722222480922937,
"clip_ratio/region_mean": 0.049524366680998355,
"entropy": 0.2559032328426838,
"epoch": 0.00752,
"grad_norm": 0.2704547047615051,
"kl": 0.9575543515384197,
"learning_rate": 9.999785928536148e-05,
"loss": 0.0164,
"step": 376,
"step_time": 9.173922988000413
},
{
"clip_ratio/high_max": 0.00657894741743803,
"clip_ratio/high_mean": 0.0016447368543595076,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0016447368543595076,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1330.0,
"completions/max_terminated_length": 1330.0,
"completions/mean_length": 1201.6875,
"completions/mean_terminated_length": 1201.6875,
"completions/min_length": 1002.0,
"completions/min_terminated_length": 1002.0,
"entropy": 0.2529036393389106,
"epoch": 0.00754,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5121884346008301,
"kl": 0.7823121659457684,
"learning_rate": 9.999784667452484e-05,
"loss": -0.0058,
"num_tokens": 21377388.0,
"reward": 5.937844753265381,
"reward_std": 10.75206184387207,
"rewards/rollout_reward_func/mean": 5.937845230102539,
"rewards/rollout_reward_func/std": 10.630824089050293,
"sampling/importance_sampling_ratio/max": 1.2855048179626465,
"sampling/importance_sampling_ratio/mean": 0.9824115037918091,
"sampling/importance_sampling_ratio/min": 0.7048435807228088,
"sampling/sampling_logp_difference/max": 0.3823585510253906,
"sampling/sampling_logp_difference/mean": 0.010149901732802391,
"step": 377,
"step_time": 37.091166489997704
},
{
"clip_ratio/high_max": 0.030701754614710808,
"clip_ratio/high_mean": 0.011025112122297287,
"clip_ratio/low_mean": 0.024379125621635467,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03540423803497106,
"entropy": 0.24686269089579582,
"epoch": 0.00756,
"grad_norm": 0.308444082736969,
"kl": 0.7720872350037098,
"learning_rate": 9.999783402665338e-05,
"loss": -0.0141,
"step": 378,
"step_time": 8.784612373994605
},
{
"clip_ratio/high_max": 0.01686507952399552,
"clip_ratio/high_mean": 0.00421626988099888,
"clip_ratio/low_mean": 0.0035807291860692203,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0077969990670681,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1343.0,
"completions/max_terminated_length": 1343.0,
"completions/mean_length": 1210.0625,
"completions/mean_terminated_length": 1210.0625,
"completions/min_length": 720.0,
"completions/min_terminated_length": 720.0,
"entropy": 0.2716317633166909,
"epoch": 0.00758,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6121698021888733,
"kl": 0.925221860408783,
"learning_rate": 9.999782134174711e-05,
"loss": -0.0013,
"num_tokens": 21506045.0,
"reward": 2.2015371322631836,
"reward_std": 15.299311637878418,
"rewards/rollout_reward_func/mean": 2.2015371322631836,
"rewards/rollout_reward_func/std": 15.50017261505127,
"sampling/importance_sampling_ratio/max": 1.400822639465332,
"sampling/importance_sampling_ratio/mean": 0.9883875846862793,
"sampling/importance_sampling_ratio/min": 0.625456690788269,
"sampling/sampling_logp_difference/max": 0.3319031000137329,
"sampling/sampling_logp_difference/mean": 0.011153844185173512,
"step": 379,
"step_time": 36.844649089001905
},
{
"clip_ratio/high_max": 0.05834899842739105,
"clip_ratio/high_mean": 0.017242478381376714,
"clip_ratio/low_mean": 0.03576550219440833,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.05300798005191609,
"entropy": 0.2637836243957281,
"epoch": 0.0076,
"grad_norm": 0.4454600512981415,
"kl": 0.9274842478334904,
"learning_rate": 9.999780861980607e-05,
"loss": -0.0126,
"step": 380,
"step_time": 9.811575109000842
},
{
"clip_ratio/high_max": 0.010620915098115802,
"clip_ratio/high_mean": 0.0026552287745289505,
"clip_ratio/low_mean": 0.0009191176504828036,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003574346425011754,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1341.0,
"completions/max_terminated_length": 1341.0,
"completions/mean_length": 1219.8125,
"completions/mean_terminated_length": 1219.8125,
"completions/min_length": 991.0,
"completions/min_terminated_length": 991.0,
"entropy": 0.2292822152376175,
"epoch": 0.00762,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6078642010688782,
"kl": 0.803357319906354,
"learning_rate": 9.999779586083025e-05,
"loss": 0.004,
"num_tokens": 21635298.0,
"reward": 5.408005714416504,
"reward_std": 9.926593780517578,
"rewards/rollout_reward_func/mean": 5.4080047607421875,
"rewards/rollout_reward_func/std": 11.208430290222168,
"sampling/importance_sampling_ratio/max": 1.2364863157272339,
"sampling/importance_sampling_ratio/mean": 0.997908890247345,
"sampling/importance_sampling_ratio/min": 0.6723216772079468,
"sampling/sampling_logp_difference/max": 0.38260674476623535,
"sampling/sampling_logp_difference/mean": 0.007029087748378515,
"step": 381,
"step_time": 37.91188854600114
},
{
"clip_ratio/high_max": 0.05433114105835557,
"clip_ratio/high_mean": 0.014450840826611966,
"clip_ratio/low_mean": 0.02246758935507387,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03691843029810116,
"entropy": 0.21105156652629375,
"epoch": 0.00764,
"grad_norm": 0.3075491189956665,
"kl": 0.9027222413569689,
"learning_rate": 9.999778306481968e-05,
"loss": -0.0043,
"step": 382,
"step_time": 9.498284126000726
},
{
"clip_ratio/high_max": 0.0036764706019312143,
"clip_ratio/high_mean": 0.0009191176504828036,
"clip_ratio/low_mean": 0.0008680555620230734,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001787173212505877,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1357.0,
"completions/max_terminated_length": 1357.0,
"completions/mean_length": 1213.765625,
"completions/mean_terminated_length": 1213.765625,
"completions/min_length": 957.0,
"completions/min_terminated_length": 957.0,
"entropy": 0.20513668935745955,
"epoch": 0.00766,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6294713020324707,
"kl": 0.7374063562601805,
"learning_rate": 9.999777023177434e-05,
"loss": 0.0252,
"num_tokens": 21764144.0,
"reward": 8.72990894317627,
"reward_std": 11.312125205993652,
"rewards/rollout_reward_func/mean": 8.729909896850586,
"rewards/rollout_reward_func/std": 11.270212173461914,
"sampling/importance_sampling_ratio/max": 1.667926549911499,
"sampling/importance_sampling_ratio/mean": 1.0118814706802368,
"sampling/importance_sampling_ratio/min": 0.7219305038452148,
"sampling/sampling_logp_difference/max": 0.31063222885131836,
"sampling/sampling_logp_difference/mean": 0.007431398145854473,
"step": 383,
"step_time": 37.184195774003456
},
{
"clip_ratio/high_max": 0.024509804090484977,
"clip_ratio/high_mean": 0.006995506584644318,
"clip_ratio/low_mean": 0.034743722644634545,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.041739229462109506,
"entropy": 0.19153737649321556,
"epoch": 0.00768,
"grad_norm": 0.37729939818382263,
"kl": 1.0056524686515331,
"learning_rate": 9.999775736169427e-05,
"loss": 0.0245,
"step": 384,
"step_time": 8.749921898001048
},
{
"clip_ratio/high_max": 0.0059523810632526875,
"clip_ratio/high_mean": 0.0014880952658131719,
"clip_ratio/low_mean": 0.004579809028655291,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0060679042944684625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1551.0,
"completions/max_terminated_length": 1551.0,
"completions/mean_length": 1410.3125,
"completions/mean_terminated_length": 1410.3125,
"completions/min_length": 765.0,
"completions/min_terminated_length": 765.0,
"entropy": 0.20708153676241636,
"epoch": 0.0077,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.738293468952179,
"kl": 0.887890812009573,
"learning_rate": 9.99977444545795e-05,
"loss": -0.0628,
"num_tokens": 21905662.0,
"reward": 9.202791213989258,
"reward_std": 15.181166648864746,
"rewards/rollout_reward_func/mean": 9.202792167663574,
"rewards/rollout_reward_func/std": 15.67770767211914,
"sampling/importance_sampling_ratio/max": 1.5441806316375732,
"sampling/importance_sampling_ratio/mean": 0.9917982816696167,
"sampling/importance_sampling_ratio/min": 5.679499839178481e-13,
"sampling/sampling_logp_difference/max": 22.66815948486328,
"sampling/sampling_logp_difference/mean": 0.030492324382066727,
"step": 385,
"step_time": 39.20689014000345
},
{
"clip_ratio/high_max": 0.023971861926838756,
"clip_ratio/high_mean": 0.007481060747522861,
"clip_ratio/low_mean": 0.03716492815874517,
"clip_ratio/low_min": 0.0029761905316263437,
"clip_ratio/region_mean": 0.04464598890626803,
"entropy": 0.1879758802242577,
"epoch": 0.00772,
"grad_norm": 0.41567400097846985,
"kl": 0.8819043859839439,
"learning_rate": 9.999773151042999e-05,
"loss": -0.0737,
"step": 386,
"step_time": 10.475744582005063
},
{
"clip_ratio/high_max": 0.009424603311344981,
"clip_ratio/high_mean": 0.0023561508278362453,
"clip_ratio/low_mean": 0.0007440476329065859,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0031001984607428312,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1556.0,
"completions/max_terminated_length": 1556.0,
"completions/mean_length": 1433.34375,
"completions/mean_terminated_length": 1433.34375,
"completions/min_length": 1211.0,
"completions/min_terminated_length": 1211.0,
"entropy": 0.15365674067288637,
"epoch": 0.00774,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5962560176849365,
"kl": 0.6632435545325279,
"learning_rate": 9.99977185292458e-05,
"loss": 0.0216,
"num_tokens": 22048591.0,
"reward": 13.268250465393066,
"reward_std": 13.775822639465332,
"rewards/rollout_reward_func/mean": 13.268250465393066,
"rewards/rollout_reward_func/std": 14.63206958770752,
"sampling/importance_sampling_ratio/max": 1.2218079566955566,
"sampling/importance_sampling_ratio/mean": 0.9793609380722046,
"sampling/importance_sampling_ratio/min": 0.6325286626815796,
"sampling/sampling_logp_difference/max": 0.38329482078552246,
"sampling/sampling_logp_difference/mean": 0.0064071910455822945,
"step": 387,
"step_time": 41.232673029999205
},
{
"clip_ratio/high_max": 0.05530754057690501,
"clip_ratio/high_mean": 0.01680307579226792,
"clip_ratio/low_mean": 0.014248512219637632,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.031051588244736195,
"entropy": 0.14256418915465474,
"epoch": 0.00776,
"grad_norm": 0.527172863483429,
"kl": 0.646535612642765,
"learning_rate": 9.999770551102692e-05,
"loss": 0.0167,
"step": 388,
"step_time": 10.636822301992652
},
{
"clip_ratio/high_max": 0.0031250000465661287,
"clip_ratio/high_mean": 0.0007812500116415322,
"clip_ratio/low_mean": 0.0007440476329065859,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0015252976445481181,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1545.0,
"completions/max_terminated_length": 1545.0,
"completions/mean_length": 1429.21875,
"completions/mean_terminated_length": 1429.21875,
"completions/min_length": 1226.0,
"completions/min_terminated_length": 1226.0,
"entropy": 0.14011064730584621,
"epoch": 0.00778,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5465406775474548,
"kl": 0.6449617743492126,
"learning_rate": 9.999769245577337e-05,
"loss": -0.0416,
"num_tokens": 22191273.0,
"reward": 10.615909576416016,
"reward_std": 10.947202682495117,
"rewards/rollout_reward_func/mean": 10.615909576416016,
"rewards/rollout_reward_func/std": 12.735282897949219,
"sampling/importance_sampling_ratio/max": 2.317744016647339,
"sampling/importance_sampling_ratio/mean": 1.0246381759643555,
"sampling/importance_sampling_ratio/min": 0.2836526930332184,
"sampling/sampling_logp_difference/max": 1.3213729858398438,
"sampling/sampling_logp_difference/mean": 0.008526146411895752,
"step": 389,
"step_time": 41.60411787599878
},
{
"clip_ratio/high_max": 0.03645833395421505,
"clip_ratio/high_mean": 0.01361607201397419,
"clip_ratio/low_mean": 0.013582785322796553,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02719885722035542,
"entropy": 0.15200490225106478,
"epoch": 0.0078,
"grad_norm": 0.4452749192714691,
"kl": 0.5898908544331789,
"learning_rate": 9.999767936348516e-05,
"loss": -0.05,
"step": 390,
"step_time": 10.0632308130007
},
{
"clip_ratio/high_max": 0.0029761905316263437,
"clip_ratio/high_mean": 0.0007440476329065859,
"clip_ratio/low_mean": 0.0014880952658131719,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002232142898719758,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1550.0,
"completions/max_terminated_length": 1550.0,
"completions/mean_length": 1416.0625,
"completions/mean_terminated_length": 1416.0625,
"completions/min_length": 193.0,
"completions/min_terminated_length": 193.0,
"entropy": 0.16107679810374975,
"epoch": 0.00782,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5122284889221191,
"kl": 0.5392248686403036,
"learning_rate": 9.999766623416232e-05,
"loss": -0.0577,
"num_tokens": 22333164.0,
"reward": 14.949935913085938,
"reward_std": 16.67510414123535,
"rewards/rollout_reward_func/mean": 14.949935913085938,
"rewards/rollout_reward_func/std": 18.703474044799805,
"sampling/importance_sampling_ratio/max": 1.4272053241729736,
"sampling/importance_sampling_ratio/mean": 0.9347177743911743,
"sampling/importance_sampling_ratio/min": 0.16998553276062012,
"sampling/sampling_logp_difference/max": 1.3626210689544678,
"sampling/sampling_logp_difference/mean": 0.009718427434563637,
"step": 391,
"step_time": 39.99297312899398
},
{
"clip_ratio/high_max": 0.02976190554909408,
"clip_ratio/high_mean": 0.008928571594879031,
"clip_ratio/low_mean": 0.015298011188860983,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.024226582725532353,
"entropy": 0.14390948927029967,
"epoch": 0.00784,
"grad_norm": 0.4105764627456665,
"kl": 0.5607901010662317,
"learning_rate": 9.999765306780482e-05,
"loss": -0.0626,
"step": 392,
"step_time": 10.043341166003302
},
{
"clip_ratio/high_max": 0.009077381109818816,
"clip_ratio/high_mean": 0.002269345277454704,
"clip_ratio/low_mean": 0.0007440476329065859,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00301339291036129,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1554.0,
"completions/max_terminated_length": 1554.0,
"completions/mean_length": 1443.640625,
"completions/mean_terminated_length": 1443.640625,
"completions/min_length": 1069.0,
"completions/min_terminated_length": 1069.0,
"entropy": 0.12537508364766836,
"epoch": 0.00786,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7089640498161316,
"kl": 0.9093821812421083,
"learning_rate": 9.99976398644127e-05,
"loss": 0.0186,
"num_tokens": 22476782.0,
"reward": 11.492524147033691,
"reward_std": 15.943157196044922,
"rewards/rollout_reward_func/mean": 11.492524147033691,
"rewards/rollout_reward_func/std": 16.71925163269043,
"sampling/importance_sampling_ratio/max": 1.7644160985946655,
"sampling/importance_sampling_ratio/mean": 0.9914994239807129,
"sampling/importance_sampling_ratio/min": 0.7484045028686523,
"sampling/sampling_logp_difference/max": 0.4207209348678589,
"sampling/sampling_logp_difference/mean": 0.006036281120032072,
"step": 393,
"step_time": 40.15463091899983
},
{
"clip_ratio/high_max": 0.01800595293752849,
"clip_ratio/high_mean": 0.005245535809081048,
"clip_ratio/low_mean": 0.018960813991725445,
"clip_ratio/low_min": 0.0029761905316263437,
"clip_ratio/region_mean": 0.024206349917221814,
"entropy": 0.11524984752759337,
"epoch": 0.00788,
"grad_norm": 0.7076042890548706,
"kl": 0.7414491530507803,
"learning_rate": 9.9997626623986e-05,
"loss": 0.0115,
"step": 394,
"step_time": 10.618017185999634
},
{
"clip_ratio/high_max": 0.0029761905316263437,
"clip_ratio/high_mean": 0.0007440476329065859,
"clip_ratio/low_mean": 0.0007440476329065859,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0014880952658131719,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1563.0,
"completions/max_terminated_length": 1563.0,
"completions/mean_length": 1443.5,
"completions/mean_terminated_length": 1443.5,
"completions/min_length": 427.0,
"completions/min_terminated_length": 427.0,
"entropy": 0.12252500653266907,
"epoch": 0.0079,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7846469283103943,
"kl": 0.7317866403609514,
"learning_rate": 9.999761334652469e-05,
"loss": 0.0075,
"num_tokens": 22620477.0,
"reward": 11.849661827087402,
"reward_std": 16.187042236328125,
"rewards/rollout_reward_func/mean": 11.849662780761719,
"rewards/rollout_reward_func/std": 17.399803161621094,
"sampling/importance_sampling_ratio/max": 1.4447773694992065,
"sampling/importance_sampling_ratio/mean": 1.0075819492340088,
"sampling/importance_sampling_ratio/min": 0.663360595703125,
"sampling/sampling_logp_difference/max": 0.43144845962524414,
"sampling/sampling_logp_difference/mean": 0.007304108701646328,
"step": 395,
"step_time": 40.574595107005734
},
{
"clip_ratio/high_max": 0.033482143422588706,
"clip_ratio/high_mean": 0.011425047181546688,
"clip_ratio/low_mean": 0.01829117111628875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02971621841425076,
"entropy": 0.12393791414797306,
"epoch": 0.00792,
"grad_norm": 0.38290056586265564,
"kl": 0.7317893952131271,
"learning_rate": 9.999760003202881e-05,
"loss": 0.0033,
"step": 396,
"step_time": 10.742739806995814
},
{
"clip_ratio/high_max": 0.01205357164144516,
"clip_ratio/high_mean": 0.00301339291036129,
"clip_ratio/low_mean": 0.0007440476329065859,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003757440543267876,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1543.0,
"completions/max_terminated_length": 1543.0,
"completions/mean_length": 1444.578125,
"completions/mean_terminated_length": 1444.578125,
"completions/min_length": 1287.0,
"completions/min_terminated_length": 1287.0,
"entropy": 0.13034009747207165,
"epoch": 0.00794,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7314363121986389,
"kl": 0.6178351659327745,
"learning_rate": 9.999758668049833e-05,
"loss": -0.0157,
"num_tokens": 22764146.0,
"reward": 11.904011726379395,
"reward_std": 15.453010559082031,
"rewards/rollout_reward_func/mean": 11.904010772705078,
"rewards/rollout_reward_func/std": 16.291580200195312,
"sampling/importance_sampling_ratio/max": 1.2977502346038818,
"sampling/importance_sampling_ratio/mean": 0.9704984426498413,
"sampling/importance_sampling_ratio/min": 0.6586284637451172,
"sampling/sampling_logp_difference/max": 0.34184467792510986,
"sampling/sampling_logp_difference/mean": 0.006397986318916082,
"step": 397,
"step_time": 40.79752054799974
},
{
"clip_ratio/high_max": 0.043154762824997306,
"clip_ratio/high_mean": 0.014508928987197578,
"clip_ratio/low_mean": 0.025279997498728335,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.039788926660548896,
"entropy": 0.11147738387808204,
"epoch": 0.00796,
"grad_norm": 0.28320473432540894,
"kl": 0.7334012817591429,
"learning_rate": 9.999757329193333e-05,
"loss": -0.021,
"step": 398,
"step_time": 9.331709539997973
},
{
"clip_ratio/high_max": 0.0028409091755747795,
"clip_ratio/high_mean": 0.0007102272938936949,
"clip_ratio/low_mean": 0.0007440476329065859,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0014542749268002808,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1546.0,
"completions/max_terminated_length": 1546.0,
"completions/mean_length": 1455.65625,
"completions/mean_terminated_length": 1455.65625,
"completions/min_length": 1290.0,
"completions/min_terminated_length": 1290.0,
"entropy": 0.12526550004258752,
"epoch": 0.00798,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7262594103813171,
"kl": 0.6103415302932262,
"learning_rate": 9.999755986633378e-05,
"loss": -0.0318,
"num_tokens": 22908577.0,
"reward": 9.555159568786621,
"reward_std": 12.746781349182129,
"rewards/rollout_reward_func/mean": 9.555160522460938,
"rewards/rollout_reward_func/std": 14.475045204162598,
"sampling/importance_sampling_ratio/max": 1.3095201253890991,
"sampling/importance_sampling_ratio/mean": 0.9779645204544067,
"sampling/importance_sampling_ratio/min": 7.978658610397404e-16,
"sampling/sampling_logp_difference/max": 27.056884765625,
"sampling/sampling_logp_difference/mean": 0.034079719334840775,
"step": 399,
"step_time": 41.13124246700136
},
{
"clip_ratio/high_max": 0.033107627648860216,
"clip_ratio/high_mean": 0.010509049869142473,
"clip_ratio/low_mean": 0.02362351247575134,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03413256263593212,
"entropy": 0.1209962465800345,
"epoch": 0.008,
"grad_norm": 0.35216766595840454,
"kl": 0.6524146590381861,
"learning_rate": 9.99975464036997e-05,
"loss": -0.044,
"step": 400,
"step_time": 10.55650918399806
},
{
"clip_ratio/high_max": 0.0059523810632526875,
"clip_ratio/high_mean": 0.0014880952658131719,
"clip_ratio/low_mean": 0.0015252976445481181,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00301339291036129,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1555.0,
"completions/max_terminated_length": 1555.0,
"completions/mean_length": 1432.03125,
"completions/mean_terminated_length": 1432.03125,
"completions/min_length": 1248.0,
"completions/min_terminated_length": 1248.0,
"entropy": 0.11135548166930676,
"epoch": 0.00802,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.133269190788269,
"kl": 0.7685734387487173,
"learning_rate": 9.99975329040311e-05,
"loss": 0.0237,
"num_tokens": 23051446.0,
"reward": 10.270038604736328,
"reward_std": 15.682093620300293,
"rewards/rollout_reward_func/mean": 10.270038604736328,
"rewards/rollout_reward_func/std": 16.255008697509766,
"sampling/importance_sampling_ratio/max": 1.4514762163162231,
"sampling/importance_sampling_ratio/mean": 1.0226449966430664,
"sampling/importance_sampling_ratio/min": 0.7271938920021057,
"sampling/sampling_logp_difference/max": 0.45901012420654297,
"sampling/sampling_logp_difference/mean": 0.005241828970611095,
"step": 401,
"step_time": 41.26497857599861
},
{
"clip_ratio/high_max": 0.03290043352171779,
"clip_ratio/high_mean": 0.008225108380429447,
"clip_ratio/low_mean": 0.015327381319366395,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.023552489699795842,
"entropy": 0.11777450842782855,
"epoch": 0.00804,
"grad_norm": 0.928424060344696,
"kl": 1.0143736563622952,
"learning_rate": 9.999751936732799e-05,
"loss": 0.0269,
"step": 402,
"step_time": 10.720703897995918
},
{
"clip_ratio/high_max": 0.008928571594879031,
"clip_ratio/high_mean": 0.0029761905316263437,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0029761905316263437,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1558.0,
"completions/max_terminated_length": 1558.0,
"completions/mean_length": 1402.484375,
"completions/mean_terminated_length": 1402.484375,
"completions/min_length": 274.0,
"completions/min_terminated_length": 274.0,
"entropy": 0.12977053970098495,
"epoch": 0.00806,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5543701648712158,
"kl": 0.6270178612321615,
"learning_rate": 9.999750579359041e-05,
"loss": 0.0183,
"num_tokens": 23192365.0,
"reward": 10.964115142822266,
"reward_std": 14.9024658203125,
"rewards/rollout_reward_func/mean": 10.964115142822266,
"rewards/rollout_reward_func/std": 15.60954475402832,
"sampling/importance_sampling_ratio/max": 1.254056453704834,
"sampling/importance_sampling_ratio/mean": 0.9866700768470764,
"sampling/importance_sampling_ratio/min": 0.661080539226532,
"sampling/sampling_logp_difference/max": 0.2775760889053345,
"sampling/sampling_logp_difference/mean": 0.005836261436343193,
"step": 403,
"step_time": 40.087825246997454
},
{
"clip_ratio/high_max": 0.029910714831203222,
"clip_ratio/high_mean": 0.009676001209300011,
"clip_ratio/low_mean": 0.016021825780626386,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.025697826989926398,
"entropy": 0.13238740153610706,
"epoch": 0.00808,
"grad_norm": 0.44548580050468445,
"kl": 0.718162702396512,
"learning_rate": 9.999749218281836e-05,
"loss": 0.0147,
"step": 404,
"step_time": 9.659750243004964
},
{
"clip_ratio/high_max": 0.0029761905316263437,
"clip_ratio/high_mean": 0.0007440476329065859,
"clip_ratio/low_mean": 0.0007440476329065859,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0014880952658131719,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1552.0,
"completions/max_terminated_length": 1552.0,
"completions/mean_length": 1440.421875,
"completions/mean_terminated_length": 1440.421875,
"completions/min_length": 1154.0,
"completions/min_terminated_length": 1154.0,
"entropy": 0.12935744831338525,
"epoch": 0.0081,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5480543375015259,
"kl": 0.6046669036149979,
"learning_rate": 9.999747853501184e-05,
"loss": 0.0137,
"num_tokens": 23335798.0,
"reward": 12.202452659606934,
"reward_std": 18.661951065063477,
"rewards/rollout_reward_func/mean": 12.20245361328125,
"rewards/rollout_reward_func/std": 20.890966415405273,
"sampling/importance_sampling_ratio/max": 1.5541430711746216,
"sampling/importance_sampling_ratio/mean": 1.0242815017700195,
"sampling/importance_sampling_ratio/min": 0.6801992058753967,
"sampling/sampling_logp_difference/max": 0.38781797885894775,
"sampling/sampling_logp_difference/mean": 0.006136234849691391,
"step": 405,
"step_time": 40.755317154002114
},
{
"clip_ratio/high_max": 0.02380952425301075,
"clip_ratio/high_mean": 0.00744047638727352,
"clip_ratio/low_mean": 0.018129960633814335,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.025570437079295516,
"entropy": 0.12149734795093536,
"epoch": 0.00812,
"grad_norm": 0.26514580845832825,
"kl": 0.64109767973423,
"learning_rate": 9.999746485017087e-05,
"loss": 0.0087,
"step": 406,
"step_time": 10.136832774996947
},
{
"clip_ratio/high_max": 0.0059523810632526875,
"clip_ratio/high_mean": 0.0014880952658131719,
"clip_ratio/low_mean": 0.002232142898719758,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0037202381645329297,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1554.0,
"completions/max_terminated_length": 1554.0,
"completions/mean_length": 1405.703125,
"completions/mean_terminated_length": 1405.703125,
"completions/min_length": 666.0,
"completions/min_terminated_length": 666.0,
"entropy": 0.12104977620765567,
"epoch": 0.00814,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.35428720712661743,
"kl": 0.6081782300025225,
"learning_rate": 9.999745112829547e-05,
"loss": 0.0047,
"num_tokens": 23476941.0,
"reward": 10.940488815307617,
"reward_std": 14.940820693969727,
"rewards/rollout_reward_func/mean": 10.940488815307617,
"rewards/rollout_reward_func/std": 15.13664436340332,
"sampling/importance_sampling_ratio/max": 1.254475712776184,
"sampling/importance_sampling_ratio/mean": 0.9845165014266968,
"sampling/importance_sampling_ratio/min": 0.6197980642318726,
"sampling/sampling_logp_difference/max": 0.40376973152160645,
"sampling/sampling_logp_difference/mean": 0.00637152511626482,
"step": 407,
"step_time": 40.52080072600438
},
{
"clip_ratio/high_max": 0.02380952425301075,
"clip_ratio/high_mean": 0.007440476329065859,
"clip_ratio/low_mean": 0.026450893783476204,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.033891370287165046,
"entropy": 0.11284881783649325,
"epoch": 0.00816,
"grad_norm": 0.26076704263687134,
"kl": 0.6312750466167927,
"learning_rate": 9.999743736938565e-05,
"loss": -0.0013,
"step": 408,
"step_time": 10.76028649699765
},
{
"clip_ratio/high_max": 0.0028409091755747795,
"clip_ratio/high_mean": 0.0007102272938936949,
"clip_ratio/low_mean": 0.002232142898719758,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0029423701926134527,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1546.0,
"completions/max_terminated_length": 1546.0,
"completions/mean_length": 1414.90625,
"completions/mean_terminated_length": 1414.90625,
"completions/min_length": 419.0,
"completions/min_terminated_length": 419.0,
"entropy": 0.11457140510901809,
"epoch": 0.00818,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.668945848941803,
"kl": 0.5923841055482626,
"learning_rate": 9.999742357344142e-05,
"loss": 0.0624,
"num_tokens": 23618723.0,
"reward": 10.537452697753906,
"reward_std": 15.241682052612305,
"rewards/rollout_reward_func/mean": 10.537453651428223,
"rewards/rollout_reward_func/std": 16.505765914916992,
"sampling/importance_sampling_ratio/max": 1.2935158014297485,
"sampling/importance_sampling_ratio/mean": 0.9813590049743652,
"sampling/importance_sampling_ratio/min": 2.974116992179171e-14,
"sampling/sampling_logp_difference/max": 25.953086853027344,
"sampling/sampling_logp_difference/mean": 0.028037957847118378,
"step": 409,
"step_time": 40.691261925003346
},
{
"clip_ratio/high_max": 0.04437229549512267,
"clip_ratio/high_mean": 0.011837121448479593,
"clip_ratio/low_mean": 0.016443452972453088,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.028280574886593968,
"entropy": 0.1118474374525249,
"epoch": 0.0082,
"grad_norm": 0.2630373537540436,
"kl": 0.6832827776670456,
"learning_rate": 9.999740974046282e-05,
"loss": 0.0566,
"step": 410,
"step_time": 9.717429660999187
},
{
"clip_ratio/high_max": 0.011904762126505375,
"clip_ratio/high_mean": 0.004464285797439516,
"clip_ratio/low_mean": 0.0007440476329065859,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0052083334303461015,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1543.0,
"completions/max_terminated_length": 1543.0,
"completions/mean_length": 1393.609375,
"completions/mean_terminated_length": 1393.609375,
"completions/min_length": 298.0,
"completions/min_terminated_length": 298.0,
"entropy": 0.1146247279830277,
"epoch": 0.00822,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.47182416915893555,
"kl": 0.5627781376242638,
"learning_rate": 9.999739587044981e-05,
"loss": -0.0341,
"num_tokens": 23759122.0,
"reward": 8.971721649169922,
"reward_std": 14.443693161010742,
"rewards/rollout_reward_func/mean": 8.971721649169922,
"rewards/rollout_reward_func/std": 14.68343448638916,
"sampling/importance_sampling_ratio/max": 1.243363857269287,
"sampling/importance_sampling_ratio/mean": 0.9929588437080383,
"sampling/importance_sampling_ratio/min": 0.7046716809272766,
"sampling/sampling_logp_difference/max": 0.35747838020324707,
"sampling/sampling_logp_difference/mean": 0.005684119649231434,
"step": 411,
"step_time": 39.962890398002855
},
{
"clip_ratio/high_max": 0.035714286379516125,
"clip_ratio/high_mean": 0.009672619227785617,
"clip_ratio/low_mean": 0.01767113123787567,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02734375116415322,
"entropy": 0.11477407393977046,
"epoch": 0.00824,
"grad_norm": 0.24663475155830383,
"kl": 0.6022106558084488,
"learning_rate": 9.999738196340245e-05,
"loss": -0.0386,
"step": 412,
"step_time": 9.870993509000982
},
{
"clip_ratio/high_max": 0.0029761905316263437,
"clip_ratio/high_mean": 0.0007440476329065859,
"clip_ratio/low_mean": 0.002232142898719758,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0029761905316263437,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1563.0,
"completions/max_terminated_length": 1563.0,
"completions/mean_length": 1434.671875,
"completions/mean_terminated_length": 1434.671875,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.11288065044209361,
"epoch": 0.00826,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.38144180178642273,
"kl": 0.7383872698992491,
"learning_rate": 9.999736801932072e-05,
"loss": 0.0133,
"num_tokens": 23902181.0,
"reward": 13.304646492004395,
"reward_std": 20.157991409301758,
"rewards/rollout_reward_func/mean": 13.304647445678711,
"rewards/rollout_reward_func/std": 21.064607620239258,
"sampling/importance_sampling_ratio/max": 1.3603137731552124,
"sampling/importance_sampling_ratio/mean": 1.0158387422561646,
"sampling/importance_sampling_ratio/min": 0.7469893097877502,
"sampling/sampling_logp_difference/max": 0.2501299977302551,
"sampling/sampling_logp_difference/mean": 0.004556077066808939,
"step": 413,
"step_time": 41.25988831600807
},
{
"clip_ratio/high_max": 0.017857143422588706,
"clip_ratio/high_mean": 0.0044642858556471765,
"clip_ratio/low_mean": 0.012648809934034944,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01711309573147446,
"entropy": 0.10936349909752607,
"epoch": 0.00828,
"grad_norm": 0.2556546628475189,
"kl": 0.7252329587936401,
"learning_rate": 9.999735403820466e-05,
"loss": 0.0102,
"step": 414,
"step_time": 10.573283408997668
},
{
"clip_ratio/high_max": 0.0059523810632526875,
"clip_ratio/high_mean": 0.0014880952658131719,
"clip_ratio/low_mean": 0.0007440476329065859,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002232142898719758,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1562.0,
"completions/max_terminated_length": 1562.0,
"completions/mean_length": 1493.640625,
"completions/mean_terminated_length": 1493.640625,
"completions/min_length": 1359.0,
"completions/min_terminated_length": 1359.0,
"entropy": 0.11034470843151212,
"epoch": 0.0083,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7242380380630493,
"kl": 0.6212767362594604,
"learning_rate": 9.999734002005428e-05,
"loss": -0.0155,
"num_tokens": 24049141.0,
"reward": 9.928826332092285,
"reward_std": 15.976888656616211,
"rewards/rollout_reward_func/mean": 9.928826332092285,
"rewards/rollout_reward_func/std": 16.414718627929688,
"sampling/importance_sampling_ratio/max": 1.3260316848754883,
"sampling/importance_sampling_ratio/mean": 1.0085797309875488,
"sampling/importance_sampling_ratio/min": 0.5519727468490601,
"sampling/sampling_logp_difference/max": 0.5959200859069824,
"sampling/sampling_logp_difference/mean": 0.006388316862285137,
"step": 415,
"step_time": 40.88382640199961
},
{
"clip_ratio/high_max": 0.014880952658131719,
"clip_ratio/high_mean": 0.005952381121460348,
"clip_ratio/low_mean": 0.019494048377964646,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.025446429615840316,
"entropy": 0.09857920417562127,
"epoch": 0.00832,
"grad_norm": 0.39599546790122986,
"kl": 0.7278024889528751,
"learning_rate": 9.99973259648696e-05,
"loss": -0.013,
"step": 416,
"step_time": 10.850284384998304
},
{
"clip_ratio/high_max": 0.0059523810632526875,
"clip_ratio/high_mean": 0.0014880952658131719,
"clip_ratio/low_mean": 0.0007440476329065859,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002232142898719758,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1541.0,
"completions/max_terminated_length": 1541.0,
"completions/mean_length": 1388.96875,
"completions/mean_terminated_length": 1388.96875,
"completions/min_length": 188.0,
"completions/min_terminated_length": 188.0,
"entropy": 0.1026167522650212,
"epoch": 0.00834,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4579165577888489,
"kl": 0.8239834625273943,
"learning_rate": 9.99973118726506e-05,
"loss": -0.0484,
"num_tokens": 24189178.0,
"reward": 12.621437072753906,
"reward_std": 16.67880630493164,
"rewards/rollout_reward_func/mean": 12.621437072753906,
"rewards/rollout_reward_func/std": 17.352924346923828,
"sampling/importance_sampling_ratio/max": 1.2838397026062012,
"sampling/importance_sampling_ratio/mean": 1.0156192779541016,
"sampling/importance_sampling_ratio/min": 0.6750461459159851,
"sampling/sampling_logp_difference/max": 0.2394113540649414,
"sampling/sampling_logp_difference/mean": 0.004534607753157616,
"step": 417,
"step_time": 39.856104094997136
},
{
"clip_ratio/high_max": 0.038690477376803756,
"clip_ratio/high_mean": 0.011904762184713036,
"clip_ratio/low_mean": 0.01116071455180645,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02306547696935013,
"entropy": 0.11136638512834907,
"epoch": 0.00836,
"grad_norm": 0.2160414755344391,
"kl": 0.6315647587180138,
"learning_rate": 9.999729774339733e-05,
"loss": -0.0554,
"step": 418,
"step_time": 9.950184918994637
},
{
"clip_ratio/high_max": 0.0028409091755747795,
"clip_ratio/high_mean": 0.0007102272938936949,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0007102272938936949,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1545.0,
"completions/max_terminated_length": 1545.0,
"completions/mean_length": 1415.34375,
"completions/mean_terminated_length": 1415.34375,
"completions/min_length": 741.0,
"completions/min_terminated_length": 741.0,
"entropy": 0.12300179339945316,
"epoch": 0.00838,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.35927513241767883,
"kl": 0.563910448923707,
"learning_rate": 9.999728357710979e-05,
"loss": -0.0024,
"num_tokens": 24330939.0,
"reward": 10.211483001708984,
"reward_std": 12.243392944335938,
"rewards/rollout_reward_func/mean": 10.211483001708984,
"rewards/rollout_reward_func/std": 12.923269271850586,
"sampling/importance_sampling_ratio/max": 1.561508297920227,
"sampling/importance_sampling_ratio/mean": 0.9852752089500427,
"sampling/importance_sampling_ratio/min": 0.6525661945343018,
"sampling/sampling_logp_difference/max": 0.421316921710968,
"sampling/sampling_logp_difference/mean": 0.005881062708795071,
"step": 419,
"step_time": 40.82010957399871
},
{
"clip_ratio/high_max": 0.023403680184856057,
"clip_ratio/high_mean": 0.00801836303435266,
"clip_ratio/low_mean": 0.005332341359462589,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01335070439381525,
"entropy": 0.12413196917623281,
"epoch": 0.0084,
"grad_norm": 0.22808168828487396,
"kl": 0.5641085561364889,
"learning_rate": 9.999726937378799e-05,
"loss": -0.0082,
"step": 420,
"step_time": 9.7226802879959
},
{
"clip_ratio/high_max": 0.008928571594879031,
"clip_ratio/high_mean": 0.002232142898719758,
"clip_ratio/low_mean": 0.002232142898719758,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004464285797439516,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1535.0,
"completions/max_terminated_length": 1535.0,
"completions/mean_length": 1441.640625,
"completions/mean_terminated_length": 1441.640625,
"completions/min_length": 864.0,
"completions/min_terminated_length": 864.0,
"entropy": 0.1284960494376719,
"epoch": 0.00842,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.532646656036377,
"kl": 0.7414810676127672,
"learning_rate": 9.999725513343196e-05,
"loss": 0.0034,
"num_tokens": 24474440.0,
"reward": 15.56411361694336,
"reward_std": 16.717456817626953,
"rewards/rollout_reward_func/mean": 15.56411361694336,
"rewards/rollout_reward_func/std": 16.81290626525879,
"sampling/importance_sampling_ratio/max": 1.2900909185409546,
"sampling/importance_sampling_ratio/mean": 1.0089163780212402,
"sampling/importance_sampling_ratio/min": 0.6302499175071716,
"sampling/sampling_logp_difference/max": 0.41839098930358887,
"sampling/sampling_logp_difference/mean": 0.006366787478327751,
"step": 421,
"step_time": 41.64962637100143
},
{
"clip_ratio/high_max": 0.020833333721384406,
"clip_ratio/high_mean": 0.0052083334303461015,
"clip_ratio/low_mean": 0.014136905199848115,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.019345238688401878,
"entropy": 0.12450070818886161,
"epoch": 0.00844,
"grad_norm": 0.3132474422454834,
"kl": 0.7047660015523434,
"learning_rate": 9.999724085604169e-05,
"loss": -0.0014,
"step": 422,
"step_time": 10.727001868001025
},
{
"clip_ratio/high_max": 0.0029761905316263437,
"clip_ratio/high_mean": 0.0007440476329065859,
"clip_ratio/low_mean": 0.002232142898719758,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0029761905316263437,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1557.0,
"completions/max_terminated_length": 1557.0,
"completions/mean_length": 1452.453125,
"completions/mean_terminated_length": 1452.453125,
"completions/min_length": 194.0,
"completions/min_terminated_length": 194.0,
"entropy": 0.11882536578923464,
"epoch": 0.00846,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7803420424461365,
"kl": 0.8879449907690287,
"learning_rate": 9.999722654161722e-05,
"loss": -0.0437,
"num_tokens": 24618707.0,
"reward": 11.537307739257812,
"reward_std": 16.87006187438965,
"rewards/rollout_reward_func/mean": 11.537307739257812,
"rewards/rollout_reward_func/std": 18.111291885375977,
"sampling/importance_sampling_ratio/max": 2.1790900230407715,
"sampling/importance_sampling_ratio/mean": 1.0079734325408936,
"sampling/importance_sampling_ratio/min": 0.6660839319229126,
"sampling/sampling_logp_difference/max": 1.0955865383148193,
"sampling/sampling_logp_difference/mean": 0.0059229484759271145,
"step": 423,
"step_time": 39.62021958500554
},
{
"clip_ratio/high_max": 0.02380952425301075,
"clip_ratio/high_mean": 0.006696428696159273,
"clip_ratio/low_mean": 0.015560741710942239,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02225717029068619,
"entropy": 0.12485062563791871,
"epoch": 0.00848,
"grad_norm": 0.31361132860183716,
"kl": 0.7454855944961309,
"learning_rate": 9.999721219015854e-05,
"loss": -0.0541,
"step": 424,
"step_time": 10.1194757400026
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0014880952658131719,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0014880952658131719,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1537.0,
"completions/max_terminated_length": 1537.0,
"completions/mean_length": 1414.84375,
"completions/mean_terminated_length": 1414.84375,
"completions/min_length": 690.0,
"completions/min_terminated_length": 690.0,
"entropy": 0.1313102599233389,
"epoch": 0.0085,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5788205862045288,
"kl": 0.6796710211783648,
"learning_rate": 9.999719780166567e-05,
"loss": -0.0346,
"num_tokens": 24760444.0,
"reward": 10.583850860595703,
"reward_std": 15.813437461853027,
"rewards/rollout_reward_func/mean": 10.583850860595703,
"rewards/rollout_reward_func/std": 15.782630920410156,
"sampling/importance_sampling_ratio/max": 1.3160440921783447,
"sampling/importance_sampling_ratio/mean": 0.9774030447006226,
"sampling/importance_sampling_ratio/min": 0.7505905628204346,
"sampling/sampling_logp_difference/max": 0.2754938304424286,
"sampling/sampling_logp_difference/mean": 0.00678935507312417,
"step": 425,
"step_time": 41.97174792000442
},
{
"clip_ratio/high_max": 0.020833333721384406,
"clip_ratio/high_mean": 0.0052083334303461015,
"clip_ratio/low_mean": 0.017931548063643277,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.023139881726820022,
"entropy": 0.13454774813726544,
"epoch": 0.00852,
"grad_norm": 0.24161121249198914,
"kl": 0.6602058243006468,
"learning_rate": 9.999718337613865e-05,
"loss": -0.0446,
"step": 426,
"step_time": 9.663861974999236
},
{
"clip_ratio/high_max": 0.008928571594879031,
"clip_ratio/high_mean": 0.002232142898719758,
"clip_ratio/low_mean": 0.0014880952658131719,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0037202381645329297,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1540.0,
"completions/max_terminated_length": 1540.0,
"completions/mean_length": 1432.515625,
"completions/mean_terminated_length": 1432.515625,
"completions/min_length": 408.0,
"completions/min_terminated_length": 408.0,
"entropy": 0.1410405244678259,
"epoch": 0.00854,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5101809501647949,
"kl": 0.6386174689978361,
"learning_rate": 9.999716891357746e-05,
"loss": 0.0369,
"num_tokens": 24903364.0,
"reward": 11.803701400756836,
"reward_std": 16.973173141479492,
"rewards/rollout_reward_func/mean": 11.803701400756836,
"rewards/rollout_reward_func/std": 17.966468811035156,
"sampling/importance_sampling_ratio/max": 1.7738006114959717,
"sampling/importance_sampling_ratio/mean": 0.995194137096405,
"sampling/importance_sampling_ratio/min": 0.6213434338569641,
"sampling/sampling_logp_difference/max": 0.5084433555603027,
"sampling/sampling_logp_difference/mean": 0.007640195079147816,
"step": 427,
"step_time": 42.486011768000026
},
{
"clip_ratio/high_max": 0.02976190554909408,
"clip_ratio/high_mean": 0.01116071455180645,
"clip_ratio/low_mean": 0.012369791802484542,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.023530506470706314,
"entropy": 0.14208506979048252,
"epoch": 0.00856,
"grad_norm": 0.2106575071811676,
"kl": 0.6240573097020388,
"learning_rate": 9.999715441398214e-05,
"loss": 0.0308,
"step": 428,
"step_time": 10.646923483993305
},
{
"clip_ratio/high_max": 0.0029761905316263437,
"clip_ratio/high_mean": 0.0007440476329065859,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0007440476329065859,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1573.0,
"completions/max_terminated_length": 1573.0,
"completions/mean_length": 1438.828125,
"completions/mean_terminated_length": 1438.828125,
"completions/min_length": 1255.0,
"completions/min_terminated_length": 1255.0,
"entropy": 0.14296143036335707,
"epoch": 0.00858,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.42578843235969543,
"kl": 0.5468210577964783,
"learning_rate": 9.999713987735269e-05,
"loss": 0.0008,
"num_tokens": 25046668.0,
"reward": 12.157367706298828,
"reward_std": 19.82905387878418,
"rewards/rollout_reward_func/mean": 12.157367706298828,
"rewards/rollout_reward_func/std": 20.11625862121582,
"sampling/importance_sampling_ratio/max": 1.1917697191238403,
"sampling/importance_sampling_ratio/mean": 0.988789439201355,
"sampling/importance_sampling_ratio/min": 0.6782960295677185,
"sampling/sampling_logp_difference/max": 0.32637321949005127,
"sampling/sampling_logp_difference/mean": 0.006113000214099884,
"step": 429,
"step_time": 40.68629942800362
},
{
"clip_ratio/high_max": 0.023809524485841393,
"clip_ratio/high_mean": 0.007440476503688842,
"clip_ratio/low_mean": 0.01045386923942715,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.017894345801323652,
"entropy": 0.14418638544157147,
"epoch": 0.0086,
"grad_norm": 0.268960177898407,
"kl": 0.5387851055711508,
"learning_rate": 9.999712530368912e-05,
"loss": -0.0055,
"step": 430,
"step_time": 11.072718907002127
},
{
"clip_ratio/high_max": 0.0029761905316263437,
"clip_ratio/high_mean": 0.0007440476329065859,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0007440476329065859,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1550.0,
"completions/max_terminated_length": 1550.0,
"completions/mean_length": 1429.546875,
"completions/mean_terminated_length": 1429.546875,
"completions/min_length": 195.0,
"completions/min_terminated_length": 195.0,
"entropy": 0.15179488621652126,
"epoch": 0.00862,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7485275864601135,
"kl": 0.5430763624608517,
"learning_rate": 9.999711069299146e-05,
"loss": -0.0808,
"num_tokens": 25189448.0,
"reward": 11.358131408691406,
"reward_std": 17.856586456298828,
"rewards/rollout_reward_func/mean": 11.358131408691406,
"rewards/rollout_reward_func/std": 18.32318878173828,
"sampling/importance_sampling_ratio/max": 1.3345392942428589,
"sampling/importance_sampling_ratio/mean": 1.0265988111495972,
"sampling/importance_sampling_ratio/min": 0.48013654351234436,
"sampling/sampling_logp_difference/max": 0.7489854097366333,
"sampling/sampling_logp_difference/mean": 0.008107547648251057,
"step": 431,
"step_time": 40.926112169998305
},
{
"clip_ratio/high_max": 0.0654761919286102,
"clip_ratio/high_mean": 0.02008928614668548,
"clip_ratio/low_mean": 0.017782738606911153,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.037872024811804295,
"entropy": 0.15683973440900445,
"epoch": 0.00864,
"grad_norm": 0.2219485342502594,
"kl": 0.5109246261417866,
"learning_rate": 9.99970960452597e-05,
"loss": -0.0914,
"step": 432,
"step_time": 10.182908312996005
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1552.0,
"completions/max_terminated_length": 1552.0,
"completions/mean_length": 1470.21875,
"completions/mean_terminated_length": 1470.21875,
"completions/min_length": 1344.0,
"completions/min_terminated_length": 1344.0,
"entropy": 0.14491091342642903,
"epoch": 0.00866,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.46287354826927185,
"kl": 0.5186197776347399,
"learning_rate": 9.999708136049389e-05,
"loss": -0.0113,
"num_tokens": 25334849.0,
"reward": 10.764678955078125,
"reward_std": 13.417325973510742,
"rewards/rollout_reward_func/mean": 10.764678955078125,
"rewards/rollout_reward_func/std": 14.159459114074707,
"sampling/importance_sampling_ratio/max": 1.4547772407531738,
"sampling/importance_sampling_ratio/mean": 1.0081079006195068,
"sampling/importance_sampling_ratio/min": 0.7049920558929443,
"sampling/sampling_logp_difference/max": 0.4709939956665039,
"sampling/sampling_logp_difference/mean": 0.005667536519467831,
"step": 433,
"step_time": 42.1604533589998
},
{
"clip_ratio/high_max": 0.0476190485060215,
"clip_ratio/high_mean": 0.014136905199848115,
"clip_ratio/low_mean": 0.01785714365541935,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03199404844781384,
"entropy": 0.15482168877497315,
"epoch": 0.00868,
"grad_norm": 0.2332436740398407,
"kl": 0.5066223796457052,
"learning_rate": 9.9997066638694e-05,
"loss": -0.0183,
"step": 434,
"step_time": 10.119833896998898
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1575.0,
"completions/max_terminated_length": 1575.0,
"completions/mean_length": 1436.375,
"completions/mean_terminated_length": 1436.375,
"completions/min_length": 795.0,
"completions/min_terminated_length": 795.0,
"entropy": 0.16640883032232523,
"epoch": 0.0087,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5225183367729187,
"kl": 0.4759600590914488,
"learning_rate": 9.999705187986009e-05,
"loss": 0.0044,
"num_tokens": 25478062.0,
"reward": 11.862446784973145,
"reward_std": 14.980566024780273,
"rewards/rollout_reward_func/mean": 11.862445831298828,
"rewards/rollout_reward_func/std": 15.403722763061523,
"sampling/importance_sampling_ratio/max": 1.3265814781188965,
"sampling/importance_sampling_ratio/mean": 1.0067017078399658,
"sampling/importance_sampling_ratio/min": 0.6984032988548279,
"sampling/sampling_logp_difference/max": 0.3158724308013916,
"sampling/sampling_logp_difference/mean": 0.007522557862102985,
"step": 435,
"step_time": 40.53673548099687
},
{
"clip_ratio/high_max": 0.08556547830812633,
"clip_ratio/high_mean": 0.028087798331398517,
"clip_ratio/low_mean": 0.025297619868069887,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.05338541802484542,
"entropy": 0.166918208822608,
"epoch": 0.00872,
"grad_norm": 0.5592331886291504,
"kl": 0.46205065958201885,
"learning_rate": 9.999703708399215e-05,
"loss": -0.0001,
"step": 436,
"step_time": 10.790453629004332
},
{
"clip_ratio/high_max": 0.017857143189758062,
"clip_ratio/high_mean": 0.004464285797439516,
"clip_ratio/low_mean": 0.0007440476329065859,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0052083334303461015,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1542.0,
"completions/max_terminated_length": 1542.0,
"completions/mean_length": 1430.21875,
"completions/mean_terminated_length": 1430.21875,
"completions/min_length": 1031.0,
"completions/min_terminated_length": 1031.0,
"entropy": 0.1512767318636179,
"epoch": 0.00874,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5887247920036316,
"kl": 0.47883218713104725,
"learning_rate": 9.99970222510902e-05,
"loss": 0.023,
"num_tokens": 25620798.0,
"reward": 10.20716667175293,
"reward_std": 16.14691734313965,
"rewards/rollout_reward_func/mean": 10.20716667175293,
"rewards/rollout_reward_func/std": 17.900371551513672,
"sampling/importance_sampling_ratio/max": 1.2416183948516846,
"sampling/importance_sampling_ratio/mean": 0.9807419776916504,
"sampling/importance_sampling_ratio/min": 0.542736291885376,
"sampling/sampling_logp_difference/max": 0.36902284622192383,
"sampling/sampling_logp_difference/mean": 0.0073195262812078,
"step": 437,
"step_time": 40.27251563699974
},
{
"clip_ratio/high_max": 0.059523811331018806,
"clip_ratio/high_mean": 0.02306547691114247,
"clip_ratio/low_mean": 0.03698593232547864,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.06005140976049006,
"entropy": 0.11152059538289905,
"epoch": 0.00876,
"grad_norm": 0.34218233823776245,
"kl": 0.599434606730938,
"learning_rate": 9.999700738115424e-05,
"loss": 0.0208,
"step": 438,
"step_time": 10.141322578992913
},
{
"clip_ratio/high_max": 0.008928571594879031,
"clip_ratio/high_mean": 0.002232142898719758,
"clip_ratio/low_mean": 0.0007440476329065859,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0029761905316263437,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1551.0,
"completions/max_terminated_length": 1551.0,
"completions/mean_length": 1448.5625,
"completions/mean_terminated_length": 1448.5625,
"completions/min_length": 1357.0,
"completions/min_terminated_length": 1357.0,
"entropy": 0.09335534879937768,
"epoch": 0.00878,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5081444382667542,
"kl": 0.5331121180206537,
"learning_rate": 9.999699247418432e-05,
"loss": -0.0063,
"num_tokens": 25764758.0,
"reward": 9.246360778808594,
"reward_std": 12.59730339050293,
"rewards/rollout_reward_func/mean": 9.246360778808594,
"rewards/rollout_reward_func/std": 14.430070877075195,
"sampling/importance_sampling_ratio/max": 1.47153902053833,
"sampling/importance_sampling_ratio/mean": 0.9984990358352661,
"sampling/importance_sampling_ratio/min": 0.582763671875,
"sampling/sampling_logp_difference/max": 0.4040945768356323,
"sampling/sampling_logp_difference/mean": 0.0051497891545295715,
"step": 439,
"step_time": 41.78158714499841
},
{
"clip_ratio/high_max": 0.020833333721384406,
"clip_ratio/high_mean": 0.0052083334303461015,
"clip_ratio/low_mean": 0.02083333401242271,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.026041667442768812,
"entropy": 0.07112342561595142,
"epoch": 0.0088,
"grad_norm": 0.41764187812805176,
"kl": 0.8426203690469265,
"learning_rate": 9.999697753018041e-05,
"loss": -0.0085,
"step": 440,
"step_time": 10.17990355800066
},
{
"clip_ratio/high_max": 0.008928571594879031,
"clip_ratio/high_mean": 0.0029761905316263437,
"clip_ratio/low_mean": 0.002232142898719758,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0052083334303461015,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1552.0,
"completions/max_terminated_length": 1552.0,
"completions/mean_length": 1425.234375,
"completions/mean_terminated_length": 1425.234375,
"completions/min_length": 690.0,
"completions/min_terminated_length": 690.0,
"entropy": 0.07502732030116022,
"epoch": 0.00882,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4730430841445923,
"kl": 0.598696194589138,
"learning_rate": 9.999696254914256e-05,
"loss": -0.0232,
"num_tokens": 25907211.0,
"reward": 12.0460205078125,
"reward_std": 12.864827156066895,
"rewards/rollout_reward_func/mean": 12.0460205078125,
"rewards/rollout_reward_func/std": 13.124265670776367,
"sampling/importance_sampling_ratio/max": 2.117748975753784,
"sampling/importance_sampling_ratio/mean": 0.979032039642334,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.2621982097625732,
"sampling/sampling_logp_difference/mean": 0.006923416629433632,
"step": 441,
"step_time": 40.17172135200235
},
{
"clip_ratio/high_max": 0.020833333721384406,
"clip_ratio/high_mean": 0.006696428754366934,
"clip_ratio/low_mean": 0.010491071618162096,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01718750043073669,
"entropy": 0.07272043719422072,
"epoch": 0.00884,
"grad_norm": 0.2536933422088623,
"kl": 0.6081040930002928,
"learning_rate": 9.999694753107076e-05,
"loss": -0.0288,
"step": 442,
"step_time": 10.609227344999454
},
{
"clip_ratio/high_max": 0.0029761905316263437,
"clip_ratio/high_mean": 0.0007440476329065859,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0007440476329065859,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1551.0,
"completions/max_terminated_length": 1551.0,
"completions/mean_length": 1433.484375,
"completions/mean_terminated_length": 1433.484375,
"completions/min_length": 637.0,
"completions/min_terminated_length": 637.0,
"entropy": 0.08062643301673234,
"epoch": 0.00886,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8294975757598877,
"kl": 0.5611858777701855,
"learning_rate": 9.999693247596505e-05,
"loss": 0.0316,
"num_tokens": 26050176.0,
"reward": 9.822164535522461,
"reward_std": 14.750000953674316,
"rewards/rollout_reward_func/mean": 9.822165489196777,
"rewards/rollout_reward_func/std": 14.6282377243042,
"sampling/importance_sampling_ratio/max": 1.5190024375915527,
"sampling/importance_sampling_ratio/mean": 1.0036146640777588,
"sampling/importance_sampling_ratio/min": 0.7604562640190125,
"sampling/sampling_logp_difference/max": 0.3031894564628601,
"sampling/sampling_logp_difference/mean": 0.004106580279767513,
"step": 443,
"step_time": 40.79559905699534
},
{
"clip_ratio/high_max": 0.030257937032729387,
"clip_ratio/high_mean": 0.007564484258182347,
"clip_ratio/low_mean": 0.015591179952025414,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02315566421020776,
"entropy": 0.0824263768736273,
"epoch": 0.00888,
"grad_norm": 0.9322162866592407,
"kl": 0.7285797223448753,
"learning_rate": 9.999691738382544e-05,
"loss": 0.034,
"step": 444,
"step_time": 10.72277228299754
},
{
"clip_ratio/high_max": 0.0029761905316263437,
"clip_ratio/high_mean": 0.0007440476329065859,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0007440476329065859,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1574.0,
"completions/max_terminated_length": 1574.0,
"completions/mean_length": 1428.421875,
"completions/mean_terminated_length": 1428.421875,
"completions/min_length": 1166.0,
"completions/min_terminated_length": 1166.0,
"entropy": 0.07391244731843472,
"epoch": 0.0089,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5232189893722534,
"kl": 0.6245864983648062,
"learning_rate": 9.999690225465193e-05,
"loss": -0.0215,
"num_tokens": 26192780.0,
"reward": 11.33067512512207,
"reward_std": 15.117729187011719,
"rewards/rollout_reward_func/mean": 11.33067512512207,
"rewards/rollout_reward_func/std": 16.229934692382812,
"sampling/importance_sampling_ratio/max": 1.4000767469406128,
"sampling/importance_sampling_ratio/mean": 1.0228557586669922,
"sampling/importance_sampling_ratio/min": 0.8148965239524841,
"sampling/sampling_logp_difference/max": 0.303769588470459,
"sampling/sampling_logp_difference/mean": 0.003096876898780465,
"step": 445,
"step_time": 39.69306251100352
},
{
"clip_ratio/high_max": 0.020833333721384406,
"clip_ratio/high_mean": 0.0059523810632526875,
"clip_ratio/low_mean": 0.011408730410039425,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.017361111531499773,
"entropy": 0.07312626042403281,
"epoch": 0.00892,
"grad_norm": 0.2677549719810486,
"kl": 0.6443136036396027,
"learning_rate": 9.999688708844453e-05,
"loss": -0.0254,
"step": 446,
"step_time": 9.859747254999093
},
{
"clip_ratio/high_max": 0.0064484127797186375,
"clip_ratio/high_mean": 0.0023561508278362453,
"clip_ratio/low_mean": 0.0014542749268002808,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003810425754636526,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1503.0,
"completions/max_terminated_length": 1503.0,
"completions/mean_length": 1392.1875,
"completions/mean_terminated_length": 1392.1875,
"completions/min_length": 1207.0,
"completions/min_terminated_length": 1207.0,
"entropy": 0.10388755868189037,
"epoch": 0.00894,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.44962024688720703,
"kl": 0.5700237862765789,
"learning_rate": 9.999687188520327e-05,
"loss": -0.0085,
"num_tokens": 26333000.0,
"reward": 10.396234512329102,
"reward_std": 12.773336410522461,
"rewards/rollout_reward_func/mean": 10.396234512329102,
"rewards/rollout_reward_func/std": 13.91511058807373,
"sampling/importance_sampling_ratio/max": 1.2538983821868896,
"sampling/importance_sampling_ratio/mean": 1.0106072425842285,
"sampling/importance_sampling_ratio/min": 0.8617662787437439,
"sampling/sampling_logp_difference/max": 0.21103119850158691,
"sampling/sampling_logp_difference/mean": 0.004161643795669079,
"step": 447,
"step_time": 40.40724292899722
},
{
"clip_ratio/high_max": 0.017215219675563276,
"clip_ratio/high_mean": 0.005047852551797405,
"clip_ratio/low_mean": 0.011870941845700145,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01691879451391287,
"entropy": 0.10565289529040456,
"epoch": 0.00896,
"grad_norm": 0.3113742470741272,
"kl": 0.5588793251663446,
"learning_rate": 9.999685664492817e-05,
"loss": -0.011,
"step": 448,
"step_time": 9.88399511400712
},
{
"clip_ratio/high_max": 0.0029761905316263437,
"clip_ratio/high_mean": 0.0014880952658131719,
"clip_ratio/low_mean": 0.0029761905316263437,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004464285797439516,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1539.0,
"completions/max_terminated_length": 1539.0,
"completions/mean_length": 1425.671875,
"completions/mean_terminated_length": 1425.671875,
"completions/min_length": 1252.0,
"completions/min_terminated_length": 1252.0,
"entropy": 0.08549337997101247,
"epoch": 0.00898,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.43871885538101196,
"kl": 0.5197541080415249,
"learning_rate": 9.999684136761923e-05,
"loss": 0.0424,
"num_tokens": 26475423.0,
"reward": 13.137186050415039,
"reward_std": 18.040781021118164,
"rewards/rollout_reward_func/mean": 13.137186050415039,
"rewards/rollout_reward_func/std": 18.348669052124023,
"sampling/importance_sampling_ratio/max": 2.0688071250915527,
"sampling/importance_sampling_ratio/mean": 1.0355302095413208,
"sampling/importance_sampling_ratio/min": 0.7141319513320923,
"sampling/sampling_logp_difference/max": 0.7877845764160156,
"sampling/sampling_logp_difference/mean": 0.004831024445593357,
"step": 449,
"step_time": 40.3097439919984
},
{
"clip_ratio/high_max": 0.02380952425301075,
"clip_ratio/high_mean": 0.00744047638727352,
"clip_ratio/low_mean": 0.010349026299081743,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01778950251173228,
"entropy": 0.09008124680258334,
"epoch": 0.009,
"grad_norm": 0.2819499969482422,
"kl": 0.48992327228188515,
"learning_rate": 9.999682605327648e-05,
"loss": 0.0377,
"step": 450,
"step_time": 11.019723427001736
},
{
"clip_ratio/high_max": 0.008928571594879031,
"clip_ratio/high_mean": 0.002232142898719758,
"clip_ratio/low_mean": 0.0007440476329065859,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0029761905316263437,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1561.0,
"completions/max_terminated_length": 1561.0,
"completions/mean_length": 1437.046875,
"completions/mean_terminated_length": 1437.046875,
"completions/min_length": 686.0,
"completions/min_terminated_length": 686.0,
"entropy": 0.09430601261556149,
"epoch": 0.00902,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.39097362756729126,
"kl": 0.5122922882437706,
"learning_rate": 9.99968107018999e-05,
"loss": -0.0447,
"num_tokens": 26618636.0,
"reward": 10.664965629577637,
"reward_std": 12.413619995117188,
"rewards/rollout_reward_func/mean": 10.664965629577637,
"rewards/rollout_reward_func/std": 12.955881118774414,
"sampling/importance_sampling_ratio/max": 1.1989917755126953,
"sampling/importance_sampling_ratio/mean": 0.9830008745193481,
"sampling/importance_sampling_ratio/min": 0.5060357451438904,
"sampling/sampling_logp_difference/max": 0.3329579830169678,
"sampling/sampling_logp_difference/mean": 0.004485957324504852,
"step": 451,
"step_time": 39.55284725899946
},
{
"clip_ratio/high_max": 0.017857143189758062,
"clip_ratio/high_mean": 0.0052083334303461015,
"clip_ratio/low_mean": 0.009709821664728224,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.014918155211489648,
"entropy": 0.09947029640898108,
"epoch": 0.00904,
"grad_norm": 0.2647772431373596,
"kl": 0.501507306471467,
"learning_rate": 9.999679531348955e-05,
"loss": -0.0474,
"step": 452,
"step_time": 9.83529582600022
},
{
"clip_ratio/high_max": 0.0029761905316263437,
"clip_ratio/high_mean": 0.0007440476329065859,
"clip_ratio/low_mean": 0.0007440476329065859,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0014880952658131719,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1576.0,
"completions/max_terminated_length": 1576.0,
"completions/mean_length": 1483.796875,
"completions/mean_terminated_length": 1483.796875,
"completions/min_length": 1354.0,
"completions/min_terminated_length": 1354.0,
"entropy": 0.08577556139789522,
"epoch": 0.00906,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4616855978965759,
"kl": 0.4984573759138584,
"learning_rate": 9.999677988804543e-05,
"loss": 0.0129,
"num_tokens": 26764995.0,
"reward": 12.713988304138184,
"reward_std": 16.157230377197266,
"rewards/rollout_reward_func/mean": 12.713988304138184,
"rewards/rollout_reward_func/std": 17.417678833007812,
"sampling/importance_sampling_ratio/max": 1.2561296224594116,
"sampling/importance_sampling_ratio/mean": 1.0040102005004883,
"sampling/importance_sampling_ratio/min": 0.5851351618766785,
"sampling/sampling_logp_difference/max": 0.335345983505249,
"sampling/sampling_logp_difference/mean": 0.004758521914482117,
"step": 453,
"step_time": 42.06172357400101
},
{
"clip_ratio/high_max": 0.014880952658131719,
"clip_ratio/high_mean": 0.004464285797439516,
"clip_ratio/low_mean": 0.015625000465661287,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.020089286321308464,
"entropy": 0.0774516521487385,
"epoch": 0.00908,
"grad_norm": 0.13414981961250305,
"kl": 0.5282110534608364,
"learning_rate": 9.999676442556757e-05,
"loss": 0.0065,
"step": 454,
"step_time": 10.263705699999264
},
{
"clip_ratio/high_max": 0.018005952704697847,
"clip_ratio/high_mean": 0.005245535809081048,
"clip_ratio/low_mean": 0.002232142898719758,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007477678707800806,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1557.0,
"completions/max_terminated_length": 1557.0,
"completions/mean_length": 1432.234375,
"completions/mean_terminated_length": 1432.234375,
"completions/min_length": 777.0,
"completions/min_terminated_length": 777.0,
"entropy": 0.0869816429913044,
"epoch": 0.0091,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5139105319976807,
"kl": 0.5019301455467939,
"learning_rate": 9.999674892605595e-05,
"loss": -0.0143,
"num_tokens": 26907877.0,
"reward": 14.470987319946289,
"reward_std": 12.551952362060547,
"rewards/rollout_reward_func/mean": 14.470987319946289,
"rewards/rollout_reward_func/std": 13.231359481811523,
"sampling/importance_sampling_ratio/max": 1.4351580142974854,
"sampling/importance_sampling_ratio/mean": 0.9842495918273926,
"sampling/importance_sampling_ratio/min": 0.7047746181488037,
"sampling/sampling_logp_difference/max": 0.36011219024658203,
"sampling/sampling_logp_difference/mean": 0.005461296532303095,
"step": 455,
"step_time": 41.37734428300246
},
{
"clip_ratio/high_max": 0.036011905409395695,
"clip_ratio/high_mean": 0.012723214633297175,
"clip_ratio/low_mean": 0.01116071455180645,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.023883929243311286,
"entropy": 0.07720525958575308,
"epoch": 0.00912,
"grad_norm": 0.3397330045700073,
"kl": 0.6114528980106115,
"learning_rate": 9.99967333895106e-05,
"loss": -0.0171,
"step": 456,
"step_time": 10.617095338997387
},
{
"clip_ratio/high_max": 0.009077381109818816,
"clip_ratio/high_mean": 0.002269345277454704,
"clip_ratio/low_mean": 0.0037202381645329297,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005989583441987634,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1556.0,
"completions/max_terminated_length": 1556.0,
"completions/mean_length": 1450.328125,
"completions/mean_terminated_length": 1450.328125,
"completions/min_length": 677.0,
"completions/min_terminated_length": 677.0,
"entropy": 0.08630289603024721,
"epoch": 0.00914,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.915941059589386,
"kl": 0.5304882265627384,
"learning_rate": 9.999671781593154e-05,
"loss": -0.0128,
"num_tokens": 27051977.0,
"reward": 12.1441650390625,
"reward_std": 13.508443832397461,
"rewards/rollout_reward_func/mean": 12.1441650390625,
"rewards/rollout_reward_func/std": 14.862476348876953,
"sampling/importance_sampling_ratio/max": 1.8943538665771484,
"sampling/importance_sampling_ratio/mean": 1.0383222103118896,
"sampling/importance_sampling_ratio/min": 0.6029430031776428,
"sampling/sampling_logp_difference/max": 0.5262751579284668,
"sampling/sampling_logp_difference/mean": 0.0063569676131010056,
"step": 457,
"step_time": 40.44359572299618
},
{
"clip_ratio/high_max": 0.026785714784637094,
"clip_ratio/high_mean": 0.00889475119765848,
"clip_ratio/low_mean": 0.015625000349245965,
"clip_ratio/low_min": 0.0029761905316263437,
"clip_ratio/region_mean": 0.024519751546904445,
"entropy": 0.07588907447643578,
"epoch": 0.00916,
"grad_norm": 0.3780209422111511,
"kl": 0.5850545484572649,
"learning_rate": 9.999670220531878e-05,
"loss": -0.0142,
"step": 458,
"step_time": 10.81988593099959
},
{
"clip_ratio/high_max": 0.0059523810632526875,
"clip_ratio/high_mean": 0.0014880952658131719,
"clip_ratio/low_mean": 0.0007812500116415322,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002269345277454704,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1548.0,
"completions/max_terminated_length": 1548.0,
"completions/mean_length": 1446.8125,
"completions/mean_terminated_length": 1446.8125,
"completions/min_length": 708.0,
"completions/min_terminated_length": 708.0,
"entropy": 0.06712023681029677,
"epoch": 0.00918,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5037431716918945,
"kl": 0.5365529656410217,
"learning_rate": 9.999668655767235e-05,
"loss": -0.0142,
"num_tokens": 27195924.0,
"reward": 12.623528480529785,
"reward_std": 16.375185012817383,
"rewards/rollout_reward_func/mean": 12.623528480529785,
"rewards/rollout_reward_func/std": 17.157840728759766,
"sampling/importance_sampling_ratio/max": 1.4218283891677856,
"sampling/importance_sampling_ratio/mean": 1.0120244026184082,
"sampling/importance_sampling_ratio/min": 0.7264562249183655,
"sampling/sampling_logp_difference/max": 0.36830270290374756,
"sampling/sampling_logp_difference/mean": 0.003537567099556327,
"step": 459,
"step_time": 39.789260978999664
},
{
"clip_ratio/high_max": 0.020833333721384406,
"clip_ratio/high_mean": 0.0052083334303461015,
"clip_ratio/low_mean": 0.010230655025225133,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015438988397363573,
"entropy": 0.061492747627198696,
"epoch": 0.0092,
"grad_norm": 0.27846819162368774,
"kl": 0.6263625603169203,
"learning_rate": 9.999667087299225e-05,
"loss": -0.0179,
"step": 460,
"step_time": 10.157873148000363
},
{
"clip_ratio/high_max": 0.011904762126505375,
"clip_ratio/high_mean": 0.0037202381645329297,
"clip_ratio/low_mean": 0.0007440476329065859,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004464285797439516,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1553.0,
"completions/max_terminated_length": 1553.0,
"completions/mean_length": 1393.921875,
"completions/mean_terminated_length": 1393.921875,
"completions/min_length": 286.0,
"completions/min_terminated_length": 286.0,
"entropy": 0.06535098806489259,
"epoch": 0.00922,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6925691366195679,
"kl": 0.5965993329882622,
"learning_rate": 9.99966551512785e-05,
"loss": -0.0133,
"num_tokens": 27336352.0,
"reward": 8.150674819946289,
"reward_std": 15.653514862060547,
"rewards/rollout_reward_func/mean": 8.150674819946289,
"rewards/rollout_reward_func/std": 16.096240997314453,
"sampling/importance_sampling_ratio/max": 1.3934406042099,
"sampling/importance_sampling_ratio/mean": 0.9711774587631226,
"sampling/importance_sampling_ratio/min": 0.3346167504787445,
"sampling/sampling_logp_difference/max": 1.0496406555175781,
"sampling/sampling_logp_difference/mean": 0.005856034811586142,
"step": 461,
"step_time": 41.696121679995485
},
{
"clip_ratio/high_max": 0.023958333767950535,
"clip_ratio/high_mean": 0.00673363107489422,
"clip_ratio/low_mean": 0.015252976503688842,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.021986607811413705,
"entropy": 0.0650356519035995,
"epoch": 0.00924,
"grad_norm": 1.2314876317977905,
"kl": 1.7299257963895798,
"learning_rate": 9.999663939253112e-05,
"loss": -0.0022,
"step": 462,
"step_time": 10.117755536000914
},
{
"clip_ratio/high_max": 0.009077381109818816,
"clip_ratio/high_mean": 0.002269345277454704,
"clip_ratio/low_mean": 0.0014880952658131719,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003757440543267876,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1543.0,
"completions/max_terminated_length": 1543.0,
"completions/mean_length": 1386.734375,
"completions/mean_terminated_length": 1386.734375,
"completions/min_length": 195.0,
"completions/min_terminated_length": 195.0,
"entropy": 0.08255739836022258,
"epoch": 0.00926,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4534408450126648,
"kl": 0.5308241080492735,
"learning_rate": 9.999662359675012e-05,
"loss": -0.0123,
"num_tokens": 27476234.0,
"reward": 11.247259140014648,
"reward_std": 14.853042602539062,
"rewards/rollout_reward_func/mean": 11.247259140014648,
"rewards/rollout_reward_func/std": 14.736608505249023,
"sampling/importance_sampling_ratio/max": 1.3197416067123413,
"sampling/importance_sampling_ratio/mean": 0.9946113228797913,
"sampling/importance_sampling_ratio/min": 0.7106093764305115,
"sampling/sampling_logp_difference/max": 0.3446381092071533,
"sampling/sampling_logp_difference/mean": 0.00609009200707078,
"step": 463,
"step_time": 40.418589189997874
},
{
"clip_ratio/high_max": 0.026934524532407522,
"clip_ratio/high_mean": 0.009709821664728224,
"clip_ratio/low_mean": 0.01640625053551048,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.026116072200238705,
"entropy": 0.09197275433689356,
"epoch": 0.00928,
"grad_norm": 0.46926549077033997,
"kl": 0.5417319964617491,
"learning_rate": 9.999660776393552e-05,
"loss": -0.0111,
"step": 464,
"step_time": 10.364104637999844
},
{
"clip_ratio/high_max": 0.0059523810632526875,
"clip_ratio/high_mean": 0.0014880952658131719,
"clip_ratio/low_mean": 0.0014880952658131719,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0029761905316263437,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1538.0,
"completions/max_terminated_length": 1538.0,
"completions/mean_length": 1408.109375,
"completions/mean_terminated_length": 1408.109375,
"completions/min_length": 788.0,
"completions/min_terminated_length": 788.0,
"entropy": 0.0891355937346816,
"epoch": 0.0093,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9044552445411682,
"kl": 0.6998987477272749,
"learning_rate": 9.999659189408731e-05,
"loss": -0.0085,
"num_tokens": 27617505.0,
"reward": 13.215154647827148,
"reward_std": 11.782221794128418,
"rewards/rollout_reward_func/mean": 13.215155601501465,
"rewards/rollout_reward_func/std": 12.105838775634766,
"sampling/importance_sampling_ratio/max": 1.657700777053833,
"sampling/importance_sampling_ratio/mean": 1.0080327987670898,
"sampling/importance_sampling_ratio/min": 0.5086445808410645,
"sampling/sampling_logp_difference/max": 0.6424019932746887,
"sampling/sampling_logp_difference/mean": 0.005240763537585735,
"step": 465,
"step_time": 39.89362095000433
}
],
"logging_steps": 1.0,
"max_steps": 100000,
"num_input_tokens_seen": 27617505,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}