{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0093, "eval_steps": 500, "global_step": 465, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 741.390625, "completions/mean_terminated_length": 741.390625, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "entropy": 0.19560225727036595, "epoch": 2e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.6278125643730164, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0026, "num_tokens": 102275.0, "reward": -0.2660611569881439, "reward_std": 9.006877899169922, "rewards/rollout_reward_func/mean": -0.26606130599975586, "rewards/rollout_reward_func/std": 10.133543014526367, "sampling/importance_sampling_ratio/max": 1.4521965980529785, "sampling/importance_sampling_ratio/mean": 1.0252978801727295, "sampling/importance_sampling_ratio/min": 0.6192880272865295, "sampling/sampling_logp_difference/max": 0.35935235023498535, "sampling/sampling_logp_difference/mean": 0.013161457143723965, "step": 1, "step_time": 18.950907858999926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.19560225727036595, "epoch": 4e-05, "grad_norm": 0.6270994544029236, "kl": 0.0, "learning_rate": 2.8571428571428573e-06, "loss": 0.0026, "step": 2, "step_time": 6.845600487000297 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 817.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 745.078125, "completions/mean_terminated_length": 745.078125, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "entropy": 0.1830942602828145, "epoch": 6e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.6748153567314148, "kl": 0.0004804102204616356, "learning_rate": 5.7142857142857145e-06, "loss": -0.0139, "num_tokens": 204643.0, "reward": 0.07987305521965027, "reward_std": 6.112407207489014, "rewards/rollout_reward_func/mean": 0.07987302541732788, "rewards/rollout_reward_func/std": 6.9746317863464355, "sampling/importance_sampling_ratio/max": 1.6137751340866089, "sampling/importance_sampling_ratio/mean": 1.0131056308746338, "sampling/importance_sampling_ratio/min": 0.5117371678352356, "sampling/sampling_logp_difference/max": 0.6347737312316895, "sampling/sampling_logp_difference/mean": 0.013132400810718536, "step": 3, "step_time": 20.457778603000065 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0015625000232830644, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002864583395421505, "entropy": 0.18449228629469872, "epoch": 8e-05, "grad_norm": 0.7855743169784546, "kl": 0.0004326992366259219, "learning_rate": 8.571428571428573e-06, "loss": -0.0127, "step": 4, "step_time": 7.153126219000001 }, { "clip_ratio/high_max": 0.015625000465661287, "clip_ratio/high_mean": 0.003906250116415322, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003906250116415322, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 773.3125, "completions/mean_terminated_length": 773.3125, "completions/min_length": 691.0, "completions/min_terminated_length": 691.0, "entropy": 0.19608404766768217, "epoch": 0.0001, "frac_reward_zero_std": 0.0, "grad_norm": 0.6154729723930359, "kl": 0.0007404440693790093, "learning_rate": 1.1428571428571429e-05, "loss": -0.0267, "num_tokens": 308926.0, "reward": -2.357975721359253, "reward_std": 5.998347282409668, "rewards/rollout_reward_func/mean": -2.357975721359253, "rewards/rollout_reward_func/std": 6.508192539215088, "sampling/importance_sampling_ratio/max": 1.5696072578430176, "sampling/importance_sampling_ratio/mean": 1.0018606185913086, "sampling/importance_sampling_ratio/min": 0.6378414630889893, "sampling/sampling_logp_difference/max": 0.4687232971191406, "sampling/sampling_logp_difference/mean": 0.014497373253107071, "step": 5, "step_time": 21.077881563999767 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0027225379599258304, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005326704704202712, "entropy": 0.20008834172040224, "epoch": 0.00012, "grad_norm": 0.613211989402771, "kl": 0.0017206422435265267, "learning_rate": 1.4285714285714285e-05, "loss": -0.0283, "step": 6, "step_time": 8.075609097999632 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 754.3125, "completions/mean_terminated_length": 754.3125, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "entropy": 0.21214309986680746, "epoch": 0.00014, "frac_reward_zero_std": 0.0, "grad_norm": 0.5866758823394775, "kl": 0.0038609652619925328, "learning_rate": 1.7142857142857145e-05, "loss": 0.0015, "num_tokens": 413194.0, "reward": -0.5192327499389648, "reward_std": 8.747434616088867, "rewards/rollout_reward_func/mean": -0.5192328095436096, "rewards/rollout_reward_func/std": 9.696125030517578, "sampling/importance_sampling_ratio/max": 1.3741450309753418, "sampling/importance_sampling_ratio/mean": 0.988805890083313, "sampling/importance_sampling_ratio/min": 0.6078794002532959, "sampling/sampling_logp_difference/max": 0.25654804706573486, "sampling/sampling_logp_difference/mean": 0.012450095266103745, "step": 7, "step_time": 21.18102895699974 }, { "clip_ratio/high_max": 0.042140152771025896, "clip_ratio/high_mean": 0.010535038192756474, "clip_ratio/low_mean": 0.011718750349245965, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02225378854200244, "entropy": 0.2212895406410098, "epoch": 0.00016, "grad_norm": 0.5727657675743103, "kl": 0.01148045047011692, "learning_rate": 2e-05, "loss": 0.0002, "step": 8, "step_time": 8.206865795999875 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.004142992664128542, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006747159408405423, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 742.765625, "completions/mean_terminated_length": 742.765625, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "entropy": 0.25475312024354935, "epoch": 0.00018, "frac_reward_zero_std": 0.0, "grad_norm": 0.7087565064430237, "kl": 0.03194100991822779, "learning_rate": 2.2857142857142858e-05, "loss": 0.0241, "num_tokens": 516181.0, "reward": -2.378840684890747, "reward_std": 6.36100959777832, "rewards/rollout_reward_func/mean": -2.378840446472168, "rewards/rollout_reward_func/std": 7.315836429595947, "sampling/importance_sampling_ratio/max": 1.6080894470214844, "sampling/importance_sampling_ratio/mean": 1.0152499675750732, "sampling/importance_sampling_ratio/min": 0.4359276592731476, "sampling/sampling_logp_difference/max": 0.4399428367614746, "sampling/sampling_logp_difference/mean": 0.028559193015098572, "step": 9, "step_time": 22.1539967839999 }, { "clip_ratio/high_max": 0.04734848625957966, "clip_ratio/high_mean": 0.013139204937033355, "clip_ratio/low_mean": 0.007930871448479593, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021070076385512948, "entropy": 0.26416848599910736, "epoch": 0.0002, "grad_norm": 0.6573855876922607, "kl": 0.03966027498245239, "learning_rate": 2.5714285714285714e-05, "loss": 0.021, "step": 10, "step_time": 7.0971175310000945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 750.75, "completions/mean_terminated_length": 750.75, "completions/min_length": 608.0, "completions/min_terminated_length": 608.0, "entropy": 0.2343001812696457, "epoch": 0.00022, "frac_reward_zero_std": 0.0, "grad_norm": 0.864137589931488, "kl": 0.03614223480690271, "learning_rate": 2.857142857142857e-05, "loss": -0.018, "num_tokens": 619774.0, "reward": -1.144383430480957, "reward_std": 9.403154373168945, "rewards/rollout_reward_func/mean": -1.144383192062378, "rewards/rollout_reward_func/std": 10.208455085754395, "sampling/importance_sampling_ratio/max": 1.6737509965896606, "sampling/importance_sampling_ratio/mean": 1.0005735158920288, "sampling/importance_sampling_ratio/min": 0.5264889001846313, "sampling/sampling_logp_difference/max": 0.7381381988525391, "sampling/sampling_logp_difference/mean": 0.03099803999066353, "step": 11, "step_time": 24.1759815060002 }, { "clip_ratio/high_max": 0.04261363763362169, "clip_ratio/high_mean": 0.011955492780543864, "clip_ratio/low_mean": 0.018229166977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.030184659990482032, "entropy": 0.23695118725299835, "epoch": 0.00024, "grad_norm": 0.5675711631774902, "kl": 0.05231437139445916, "learning_rate": 3.142857142857143e-05, "loss": -0.0247, "step": 12, "step_time": 7.241505567000331 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 766.03125, "completions/mean_terminated_length": 766.03125, "completions/min_length": 638.0, "completions/min_terminated_length": 638.0, "entropy": 0.23159058205783367, "epoch": 0.00026, "frac_reward_zero_std": 0.0, "grad_norm": 0.560762345790863, "kl": 0.10253941919654608, "learning_rate": 3.428571428571429e-05, "loss": -0.0132, "num_tokens": 725286.0, "reward": 0.9126645922660828, "reward_std": 8.317488670349121, "rewards/rollout_reward_func/mean": 0.9126646518707275, "rewards/rollout_reward_func/std": 9.508187294006348, "sampling/importance_sampling_ratio/max": 1.4912891387939453, "sampling/importance_sampling_ratio/mean": 0.9157562255859375, "sampling/importance_sampling_ratio/min": 0.15846048295497894, "sampling/sampling_logp_difference/max": 0.9116353988647461, "sampling/sampling_logp_difference/mean": 0.03342486917972565, "step": 13, "step_time": 24.797564555999315 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.04107481171377003, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046283145202323794, "entropy": 0.21706843469291925, "epoch": 0.00028, "grad_norm": 0.737306535243988, "kl": 0.20574123412370682, "learning_rate": 3.7142857142857143e-05, "loss": -0.0141, "step": 14, "step_time": 8.782559869000124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.002864583395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002864583395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 730.265625, "completions/mean_terminated_length": 730.265625, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "entropy": 0.20442467741668224, "epoch": 0.0003, "frac_reward_zero_std": 0.0, "grad_norm": 0.7728816866874695, "kl": 0.10564538510516286, "learning_rate": 4e-05, "loss": 0.0314, "num_tokens": 827749.0, "reward": -1.6128692626953125, "reward_std": 6.231240272521973, "rewards/rollout_reward_func/mean": -1.6128690242767334, "rewards/rollout_reward_func/std": 6.545647621154785, "sampling/importance_sampling_ratio/max": 1.7540509700775146, "sampling/importance_sampling_ratio/mean": 1.0142356157302856, "sampling/importance_sampling_ratio/min": 0.45990973711013794, "sampling/sampling_logp_difference/max": 0.7248215675354004, "sampling/sampling_logp_difference/mean": 0.030622530728578568, "step": 15, "step_time": 24.38517581200017 }, { "clip_ratio/high_max": 0.047821971122175455, "clip_ratio/high_mean": 0.013257576036266983, "clip_ratio/low_mean": 0.025236743036657572, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0384943193057552, "entropy": 0.19760818500071764, "epoch": 0.00032, "grad_norm": 0.6611685752868652, "kl": 0.11387888877652586, "learning_rate": 4.2857142857142856e-05, "loss": 0.0262, "step": 16, "step_time": 7.110903799999505 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 760.359375, "completions/mean_terminated_length": 760.359375, "completions/min_length": 659.0, "completions/min_terminated_length": 659.0, "entropy": 0.19120646081864834, "epoch": 0.00034, "frac_reward_zero_std": 0.0, "grad_norm": 1.0170923471450806, "kl": 0.08101693401113153, "learning_rate": 4.5714285714285716e-05, "loss": -0.015, "num_tokens": 931841.0, "reward": -1.6879972219467163, "reward_std": 9.023077011108398, "rewards/rollout_reward_func/mean": -1.6879971027374268, "rewards/rollout_reward_func/std": 10.298378944396973, "sampling/importance_sampling_ratio/max": 2.430154800415039, "sampling/importance_sampling_ratio/mean": 1.065093755722046, "sampling/importance_sampling_ratio/min": 0.6535128951072693, "sampling/sampling_logp_difference/max": 0.7661471366882324, "sampling/sampling_logp_difference/mean": 0.024486079812049866, "step": 17, "step_time": 27.987481355 }, { "clip_ratio/high_max": 0.043560607358813286, "clip_ratio/high_mean": 0.016335227992385626, "clip_ratio/low_mean": 0.01846590987406671, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03480113763362169, "entropy": 0.19553834106773138, "epoch": 0.00036, "grad_norm": 0.5111234784126282, "kl": 0.088710677344352, "learning_rate": 4.8571428571428576e-05, "loss": -0.0206, "step": 18, "step_time": 7.182192339999801 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003906250116415322, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 733.640625, "completions/mean_terminated_length": 733.640625, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "entropy": 0.1935133864171803, "epoch": 0.00038, "frac_reward_zero_std": 0.0, "grad_norm": 0.7022229433059692, "kl": 0.1404350029770285, "learning_rate": 5.142857142857143e-05, "loss": -0.0003, "num_tokens": 1033723.0, "reward": -1.2022110223770142, "reward_std": 10.956363677978516, "rewards/rollout_reward_func/mean": -1.2022109031677246, "rewards/rollout_reward_func/std": 12.292625427246094, "sampling/importance_sampling_ratio/max": 1.6157236099243164, "sampling/importance_sampling_ratio/mean": 0.9594892263412476, "sampling/importance_sampling_ratio/min": 0.3754613697528839, "sampling/sampling_logp_difference/max": 0.9176025390625, "sampling/sampling_logp_difference/mean": 0.028035998344421387, "step": 19, "step_time": 27.688288005999993 }, { "clip_ratio/high_max": 0.04876894084736705, "clip_ratio/high_mean": 0.012192235211841762, "clip_ratio/low_mean": 0.018584280740469694, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0307765161851421, "entropy": 0.20130611211061478, "epoch": 0.0004, "grad_norm": 0.4695027768611908, "kl": 0.18750765593722463, "learning_rate": 5.428571428571428e-05, "loss": -0.0054, "step": 20, "step_time": 7.739605327000618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0014204545877873898, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014204545877873898, "completions/clipped_ratio": 0.0, "completions/max_length": 823.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 737.9375, "completions/mean_terminated_length": 737.9375, "completions/min_length": 618.0, "completions/min_terminated_length": 618.0, "entropy": 0.18132759165018797, "epoch": 0.00042, "frac_reward_zero_std": 0.0, "grad_norm": 1.1652212142944336, "kl": 0.13582510640844703, "learning_rate": 5.714285714285714e-05, "loss": 0.0262, "num_tokens": 1135968.0, "reward": -0.28913062810897827, "reward_std": 7.3008809089660645, "rewards/rollout_reward_func/mean": -0.28913065791130066, "rewards/rollout_reward_func/std": 7.988962650299072, "sampling/importance_sampling_ratio/max": 2.336996555328369, "sampling/importance_sampling_ratio/mean": 1.0362560749053955, "sampling/importance_sampling_ratio/min": 0.6398296356201172, "sampling/sampling_logp_difference/max": 0.6417920589447021, "sampling/sampling_logp_difference/mean": 0.022837379947304726, "step": 21, "step_time": 28.57662482000046 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.006510416860692203, "clip_ratio/low_mean": 0.025386679684743285, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03189709666185081, "entropy": 0.1763849752023816, "epoch": 0.00044, "grad_norm": 0.3849461078643799, "kl": 0.16632835287600756, "learning_rate": 6e-05, "loss": 0.0212, "step": 22, "step_time": 8.287740409000207 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0014204545877873898, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004024621332064271, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 723.875, "completions/mean_terminated_length": 723.875, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "entropy": 0.1840990763157606, "epoch": 0.00046, "frac_reward_zero_std": 0.0, "grad_norm": 0.860249936580658, "kl": 0.25097968662157655, "learning_rate": 6.285714285714286e-05, "loss": 0.0286, "num_tokens": 1237057.0, "reward": 0.4839830696582794, "reward_std": 10.420938491821289, "rewards/rollout_reward_func/mean": 0.4839830994606018, "rewards/rollout_reward_func/std": 11.429144859313965, "sampling/importance_sampling_ratio/max": 2.106267213821411, "sampling/importance_sampling_ratio/mean": 1.0313048362731934, "sampling/importance_sampling_ratio/min": 0.574251651763916, "sampling/sampling_logp_difference/max": 0.8508915901184082, "sampling/sampling_logp_difference/mean": 0.02066868171095848, "step": 23, "step_time": 28.494462327999827 }, { "clip_ratio/high_max": 0.06818181974813342, "clip_ratio/high_mean": 0.021070076152682304, "clip_ratio/low_mean": 0.018347538076341152, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03941761387977749, "entropy": 0.19043638091534376, "epoch": 0.00048, "grad_norm": 0.6448091864585876, "kl": 0.35418248968198895, "learning_rate": 6.571428571428571e-05, "loss": 0.0215, "step": 24, "step_time": 7.416647947999536 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.004024621332064271, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005326704704202712, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 731.0625, "completions/mean_terminated_length": 731.0625, "completions/min_length": 615.0, "completions/min_terminated_length": 615.0, "entropy": 0.1908296812325716, "epoch": 0.0005, "frac_reward_zero_std": 0.0, "grad_norm": 0.66495680809021, "kl": 0.21043909061700106, "learning_rate": 6.857142857142858e-05, "loss": -0.0275, "num_tokens": 1337760.0, "reward": 0.9224299788475037, "reward_std": 10.655890464782715, "rewards/rollout_reward_func/mean": 0.9224300384521484, "rewards/rollout_reward_func/std": 12.821269989013672, "sampling/importance_sampling_ratio/max": 1.5019664764404297, "sampling/importance_sampling_ratio/mean": 1.0262192487716675, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9519531726837158, "sampling/sampling_logp_difference/mean": 0.018259627744555473, "step": 25, "step_time": 29.797745564000707 }, { "clip_ratio/high_max": 0.05823863809928298, "clip_ratio/high_mean": 0.017163826269097626, "clip_ratio/low_mean": 0.024147727992385626, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.041311554377898574, "entropy": 0.1913931304588914, "epoch": 0.00052, "grad_norm": 0.5575593709945679, "kl": 0.26408666698262095, "learning_rate": 7.142857142857143e-05, "loss": -0.0322, "step": 26, "step_time": 7.109563219000847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 731.640625, "completions/mean_terminated_length": 731.640625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.19422233663499355, "epoch": 0.00054, "frac_reward_zero_std": 0.0, "grad_norm": 0.6210283637046814, "kl": 0.21635392913594842, "learning_rate": 7.428571428571429e-05, "loss": -0.0185, "num_tokens": 1439214.0, "reward": 0.326141357421875, "reward_std": 13.388666152954102, "rewards/rollout_reward_func/mean": 0.32614123821258545, "rewards/rollout_reward_func/std": 14.97364616394043, "sampling/importance_sampling_ratio/max": 1.5914506912231445, "sampling/importance_sampling_ratio/mean": 1.0221253633499146, "sampling/importance_sampling_ratio/min": 0.7667937874794006, "sampling/sampling_logp_difference/max": 0.37548696994781494, "sampling/sampling_logp_difference/mean": 0.012905368581414223, "step": 27, "step_time": 28.513997486000562 }, { "clip_ratio/high_max": 0.05255681974813342, "clip_ratio/high_mean": 0.01574337179772556, "clip_ratio/low_mean": 0.01661931863054633, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.032362690777517855, "entropy": 0.1939925504848361, "epoch": 0.00056, "grad_norm": 0.2964678406715393, "kl": 0.22840850101783872, "learning_rate": 7.714285714285715e-05, "loss": -0.0252, "step": 28, "step_time": 8.46359607699992 }, { "clip_ratio/high_max": 0.010890151839703321, "clip_ratio/high_mean": 0.0027225379599258304, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004024621448479593, "completions/clipped_ratio": 0.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 714.359375, "completions/mean_terminated_length": 714.359375, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "entropy": 0.1717732958495617, "epoch": 0.00058, "frac_reward_zero_std": 0.0, "grad_norm": 0.3590966761112213, "kl": 0.24717363435775042, "learning_rate": 8e-05, "loss": 0.0036, "num_tokens": 1539055.0, "reward": 1.930895447731018, "reward_std": 8.148633003234863, "rewards/rollout_reward_func/mean": 1.930895447731018, "rewards/rollout_reward_func/std": 9.020356178283691, "sampling/importance_sampling_ratio/max": 1.6024476289749146, "sampling/importance_sampling_ratio/mean": 1.0161041021347046, "sampling/importance_sampling_ratio/min": 0.7807760238647461, "sampling/sampling_logp_difference/max": 0.35602256655693054, "sampling/sampling_logp_difference/mean": 0.011149970814585686, "step": 29, "step_time": 28.064759372000253 }, { "clip_ratio/high_max": 0.027083334047347307, "clip_ratio/high_mean": 0.006770833511836827, "clip_ratio/low_mean": 0.029711175127886236, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03648200852330774, "entropy": 0.16414203867316246, "epoch": 0.0006, "grad_norm": 0.38951048254966736, "kl": 0.28005583630874753, "learning_rate": 8.285714285714287e-05, "loss": 0.0013, "step": 30, "step_time": 7.401456857000312 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 708.046875, "completions/mean_terminated_length": 708.046875, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "entropy": 0.16439654119312763, "epoch": 0.00062, "frac_reward_zero_std": 0.0, "grad_norm": 0.5445168614387512, "kl": 0.2800124539062381, "learning_rate": 8.571428571428571e-05, "loss": 0.0097, "num_tokens": 1638113.0, "reward": 0.29781579971313477, "reward_std": 10.009416580200195, "rewards/rollout_reward_func/mean": 0.29781582951545715, "rewards/rollout_reward_func/std": 11.176705360412598, "sampling/importance_sampling_ratio/max": 1.755067229270935, "sampling/importance_sampling_ratio/mean": 1.0180511474609375, "sampling/importance_sampling_ratio/min": 0.580125629901886, "sampling/sampling_logp_difference/max": 0.5197739601135254, "sampling/sampling_logp_difference/mean": 0.013791397213935852, "step": 31, "step_time": 30.773730244999797 }, { "clip_ratio/high_max": 0.03645833395421505, "clip_ratio/high_mean": 0.013139204704202712, "clip_ratio/low_mean": 0.03042140242177993, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04356060677673668, "entropy": 0.15187342395074666, "epoch": 0.00064, "grad_norm": 0.30164626240730286, "kl": 0.32055927254259586, "learning_rate": 8.857142857142857e-05, "loss": 0.0037, "step": 32, "step_time": 7.328695028999618 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0027225379599258304, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004024621332064271, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 701.5, "completions/mean_terminated_length": 701.5, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "entropy": 0.1332990936934948, "epoch": 0.00066, "frac_reward_zero_std": 0.0, "grad_norm": 0.43104609847068787, "kl": 0.32164820563048124, "learning_rate": 9.142857142857143e-05, "loss": 0.0025, "num_tokens": 1738075.0, "reward": 3.1038765907287598, "reward_std": 11.951395988464355, "rewards/rollout_reward_func/mean": 3.1038765907287598, "rewards/rollout_reward_func/std": 12.847871780395508, "sampling/importance_sampling_ratio/max": 1.3508435487747192, "sampling/importance_sampling_ratio/mean": 0.9952214360237122, "sampling/importance_sampling_ratio/min": 0.6407750844955444, "sampling/sampling_logp_difference/max": 0.47523796558380127, "sampling/sampling_logp_difference/mean": 0.013571259565651417, "step": 33, "step_time": 27.829260915000077 }, { "clip_ratio/high_max": 0.03219697065651417, "clip_ratio/high_mean": 0.010653409408405423, "clip_ratio/low_mean": 0.029000947601161897, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03965435700956732, "entropy": 0.12380115175619721, "epoch": 0.00068, "grad_norm": 0.27367016673088074, "kl": 0.423783166334033, "learning_rate": 9.428571428571429e-05, "loss": -0.0, "step": 34, "step_time": 7.799126809999507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 687.4375, "completions/mean_terminated_length": 687.4375, "completions/min_length": 618.0, "completions/min_terminated_length": 618.0, "entropy": 0.10798696288838983, "epoch": 0.0007, "frac_reward_zero_std": 0.0, "grad_norm": 0.4482150673866272, "kl": 0.3214763030409813, "learning_rate": 9.714285714285715e-05, "loss": 0.028, "num_tokens": 1836043.0, "reward": 3.037400960922241, "reward_std": 12.985002517700195, "rewards/rollout_reward_func/mean": 3.037400960922241, "rewards/rollout_reward_func/std": 13.425616264343262, "sampling/importance_sampling_ratio/max": 1.4862518310546875, "sampling/importance_sampling_ratio/mean": 1.0146703720092773, "sampling/importance_sampling_ratio/min": 0.5140225291252136, "sampling/sampling_logp_difference/max": 0.8002816438674927, "sampling/sampling_logp_difference/mean": 0.01363956555724144, "step": 35, "step_time": 28.662696071000028 }, { "clip_ratio/high_max": 0.042140152771025896, "clip_ratio/high_mean": 0.010535038192756474, "clip_ratio/low_mean": 0.014441288309171796, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.024976326269097626, "entropy": 0.1118780323304236, "epoch": 0.00072, "grad_norm": 0.1983855962753296, "kl": 0.373223016038537, "learning_rate": 0.0001, "loss": 0.0232, "step": 36, "step_time": 8.269840026000338 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 679.3125, "completions/mean_terminated_length": 679.3125, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "entropy": 0.12342227855697274, "epoch": 0.00074, "frac_reward_zero_std": 0.0, "grad_norm": 0.6195780634880066, "kl": 0.45714515913277864, "learning_rate": 9.999999998148153e-05, "loss": -0.0249, "num_tokens": 1932947.0, "reward": 3.72019362449646, "reward_std": 11.354637145996094, "rewards/rollout_reward_func/mean": 3.720193862915039, "rewards/rollout_reward_func/std": 11.66490650177002, "sampling/importance_sampling_ratio/max": 2.1260557174682617, "sampling/importance_sampling_ratio/mean": 1.049971580505371, "sampling/importance_sampling_ratio/min": 0.6164436340332031, "sampling/sampling_logp_difference/max": 0.5450749397277832, "sampling/sampling_logp_difference/mean": 0.01501537300646305, "step": 37, "step_time": 27.480367904999866 }, { "clip_ratio/high_max": 0.05303030414506793, "clip_ratio/high_mean": 0.014678030624054372, "clip_ratio/low_mean": 0.014322917209938169, "clip_ratio/low_min": 0.0052083334885537624, "clip_ratio/region_mean": 0.029000947950407863, "entropy": 0.13006606698036194, "epoch": 0.00076, "grad_norm": 0.2681926488876343, "kl": 0.4847450293600559, "learning_rate": 9.999999992592612e-05, "loss": -0.0318, "step": 38, "step_time": 7.225284665000345 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 700.09375, "completions/mean_terminated_length": 700.09375, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "entropy": 0.15452369069680572, "epoch": 0.00078, "frac_reward_zero_std": 0.0, "grad_norm": 0.4834868311882019, "kl": 0.4672291334718466, "learning_rate": 9.999999983333379e-05, "loss": -0.0162, "num_tokens": 2032280.0, "reward": 5.62964391708374, "reward_std": 9.88559341430664, "rewards/rollout_reward_func/mean": 5.629644393920898, "rewards/rollout_reward_func/std": 12.693258285522461, "sampling/importance_sampling_ratio/max": 1.5066994428634644, "sampling/importance_sampling_ratio/mean": 1.0094711780548096, "sampling/importance_sampling_ratio/min": 0.6512829065322876, "sampling/sampling_logp_difference/max": 0.4918508529663086, "sampling/sampling_logp_difference/mean": 0.01460680365562439, "step": 39, "step_time": 30.803230847000123 }, { "clip_ratio/high_max": 0.05823863809928298, "clip_ratio/high_mean": 0.01976799312978983, "clip_ratio/low_mean": 0.02734375069849193, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04711174394469708, "entropy": 0.14933442790061235, "epoch": 0.0008, "grad_norm": 0.34873443841934204, "kl": 0.5781354140490294, "learning_rate": 9.99999997037045e-05, "loss": -0.0203, "step": 40, "step_time": 7.3111222899999575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 686.59375, "completions/mean_terminated_length": 686.59375, "completions/min_length": 615.0, "completions/min_terminated_length": 615.0, "entropy": 0.15176831698045135, "epoch": 0.00082, "frac_reward_zero_std": 0.0, "grad_norm": 0.4504550099372864, "kl": 0.6600655419752002, "learning_rate": 9.999999953703829e-05, "loss": -0.0185, "num_tokens": 2130497.0, "reward": 2.0073609352111816, "reward_std": 8.8825044631958, "rewards/rollout_reward_func/mean": 2.0073609352111816, "rewards/rollout_reward_func/std": 9.321340560913086, "sampling/importance_sampling_ratio/max": 1.5246989727020264, "sampling/importance_sampling_ratio/mean": 1.0359078645706177, "sampling/importance_sampling_ratio/min": 0.3844473361968994, "sampling/sampling_logp_difference/max": 0.955810546875, "sampling/sampling_logp_difference/mean": 0.012838078662753105, "step": 41, "step_time": 28.587795755999878 }, { "clip_ratio/high_max": 0.03172348579391837, "clip_ratio/high_mean": 0.009232954820618033, "clip_ratio/low_mean": 0.022608901956118643, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03184185642749071, "entropy": 0.14899979438632727, "epoch": 0.00084, "grad_norm": 2.304894208908081, "kl": 1.5326191950589418, "learning_rate": 9.999999933333512e-05, "loss": -0.0201, "step": 42, "step_time": 8.04831712999976 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0014204545877873898, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004024621332064271, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 685.40625, "completions/mean_terminated_length": 685.40625, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "entropy": 0.1393027831800282, "epoch": 0.00086, "frac_reward_zero_std": 0.0, "grad_norm": 0.5649179816246033, "kl": 0.7229090742766857, "learning_rate": 9.999999909259503e-05, "loss": -0.017, "num_tokens": 2228288.0, "reward": 1.6912901401519775, "reward_std": 10.596427917480469, "rewards/rollout_reward_func/mean": 1.691290020942688, "rewards/rollout_reward_func/std": 12.0145263671875, "sampling/importance_sampling_ratio/max": 1.3425889015197754, "sampling/importance_sampling_ratio/mean": 0.9553788304328918, "sampling/importance_sampling_ratio/min": 0.5974801778793335, "sampling/sampling_logp_difference/max": 0.34511590003967285, "sampling/sampling_logp_difference/mean": 0.01251951139420271, "step": 43, "step_time": 27.475775691999615 }, { "clip_ratio/high_max": 0.04829545598477125, "clip_ratio/high_mean": 0.015980114112608135, "clip_ratio/low_mean": 0.024053030996583402, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.040033145574852824, "entropy": 0.14617095375433564, "epoch": 0.00088, "grad_norm": 0.3445337116718292, "kl": 0.5654929745942354, "learning_rate": 9.9999998814818e-05, "loss": -0.023, "step": 44, "step_time": 7.598477493999553 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 713.671875, "completions/mean_terminated_length": 713.671875, "completions/min_length": 656.0, "completions/min_terminated_length": 656.0, "entropy": 0.14390681218355894, "epoch": 0.0009, "frac_reward_zero_std": 0.0, "grad_norm": 0.45263612270355225, "kl": 0.6904484182596207, "learning_rate": 9.999999850000404e-05, "loss": -0.005, "num_tokens": 2328132.0, "reward": 2.4324169158935547, "reward_std": 13.961143493652344, "rewards/rollout_reward_func/mean": 2.4324169158935547, "rewards/rollout_reward_func/std": 14.438629150390625, "sampling/importance_sampling_ratio/max": 1.3720179796218872, "sampling/importance_sampling_ratio/mean": 1.00229012966156, "sampling/importance_sampling_ratio/min": 0.6608520746231079, "sampling/sampling_logp_difference/max": 0.301973819732666, "sampling/sampling_logp_difference/mean": 0.010271631181240082, "step": 45, "step_time": 28.995988180999802 }, { "clip_ratio/high_max": 0.026041667442768812, "clip_ratio/high_mean": 0.006510416860692203, "clip_ratio/low_mean": 0.02043087175115943, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026941288728266954, "entropy": 0.1375666274689138, "epoch": 0.00092, "grad_norm": 0.3008887469768524, "kl": 0.6632084101438522, "learning_rate": 9.999999814815312e-05, "loss": -0.0106, "step": 46, "step_time": 7.42895066499932 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0027225379599258304, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005326704704202712, "completions/clipped_ratio": 0.0, "completions/max_length": 823.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 698.640625, "completions/mean_terminated_length": 698.640625, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "entropy": 0.14624580927193165, "epoch": 0.00094, "frac_reward_zero_std": 0.0, "grad_norm": 0.36133161187171936, "kl": 0.5184649843722582, "learning_rate": 9.99999977592653e-05, "loss": -0.0129, "num_tokens": 2426521.0, "reward": 1.375571846961975, "reward_std": 11.66879940032959, "rewards/rollout_reward_func/mean": 1.3755717277526855, "rewards/rollout_reward_func/std": 11.796045303344727, "sampling/importance_sampling_ratio/max": 1.8656487464904785, "sampling/importance_sampling_ratio/mean": 1.0228910446166992, "sampling/importance_sampling_ratio/min": 0.505867063999176, "sampling/sampling_logp_difference/max": 0.6223084926605225, "sampling/sampling_logp_difference/mean": 0.011709067039191723, "step": 47, "step_time": 29.763493531999984 }, { "clip_ratio/high_max": 0.03172348579391837, "clip_ratio/high_mean": 0.007930871448479593, "clip_ratio/low_mean": 0.02568655402865261, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03361742536071688, "entropy": 0.14968854701146483, "epoch": 0.00096, "grad_norm": 0.17635680735111237, "kl": 0.5038973540067673, "learning_rate": 9.999999733334051e-05, "loss": -0.0167, "step": 48, "step_time": 7.652514348999603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0027225379599258304, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027225379599258304, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 704.453125, "completions/mean_terminated_length": 704.453125, "completions/min_length": 635.0, "completions/min_terminated_length": 635.0, "entropy": 0.14840606460347772, "epoch": 0.00098, "frac_reward_zero_std": 0.0, "grad_norm": 0.5855311751365662, "kl": 0.5907826572656631, "learning_rate": 9.99999968703788e-05, "loss": 0.0381, "num_tokens": 2526069.0, "reward": 4.523091793060303, "reward_std": 11.536006927490234, "rewards/rollout_reward_func/mean": 4.523091793060303, "rewards/rollout_reward_func/std": 12.290811538696289, "sampling/importance_sampling_ratio/max": 2.122157573699951, "sampling/importance_sampling_ratio/mean": 1.0083321332931519, "sampling/importance_sampling_ratio/min": 0.6556381583213806, "sampling/sampling_logp_difference/max": 0.5623667240142822, "sampling/sampling_logp_difference/mean": 0.012646196410059929, "step": 49, "step_time": 27.48595007899985 }, { "clip_ratio/high_max": 0.05445075919851661, "clip_ratio/high_mean": 0.017518940148875117, "clip_ratio/low_mean": 0.03401988744735718, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.051538827014155686, "entropy": 0.14183657616376877, "epoch": 0.001, "grad_norm": 0.3584051728248596, "kl": 0.5096510350704193, "learning_rate": 9.999999637038015e-05, "loss": 0.0365, "step": 50, "step_time": 9.165422230000104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0027225379599258304, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027225379599258304, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 692.375, "completions/mean_terminated_length": 692.375, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "entropy": 0.14247119799256325, "epoch": 0.00102, "frac_reward_zero_std": 0.0, "grad_norm": 0.49646589159965515, "kl": 0.4570716666057706, "learning_rate": 9.999999583334457e-05, "loss": -0.0101, "num_tokens": 2623145.0, "reward": 4.133634567260742, "reward_std": 10.326797485351562, "rewards/rollout_reward_func/mean": 4.133634567260742, "rewards/rollout_reward_func/std": 10.82159423828125, "sampling/importance_sampling_ratio/max": 1.6070019006729126, "sampling/importance_sampling_ratio/mean": 0.996033787727356, "sampling/importance_sampling_ratio/min": 0.5886021852493286, "sampling/sampling_logp_difference/max": 0.543494701385498, "sampling/sampling_logp_difference/mean": 0.010751021094620228, "step": 51, "step_time": 28.26161867099927 }, { "clip_ratio/high_max": 0.04829545598477125, "clip_ratio/high_mean": 0.013375947251915932, "clip_ratio/low_mean": 0.02781723579391837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0411931830458343, "entropy": 0.13112169969826937, "epoch": 0.00104, "grad_norm": 0.34045207500457764, "kl": 0.5393304694443941, "learning_rate": 9.999999525927207e-05, "loss": -0.016, "step": 52, "step_time": 6.901260032999289 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 691.875, "completions/mean_terminated_length": 691.875, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "entropy": 0.11744047561660409, "epoch": 0.00106, "frac_reward_zero_std": 0.0, "grad_norm": 0.3921552300453186, "kl": 0.42071591690182686, "learning_rate": 9.999999464816261e-05, "loss": 0.0037, "num_tokens": 2721107.0, "reward": 4.605119705200195, "reward_std": 12.441184997558594, "rewards/rollout_reward_func/mean": 4.605119228363037, "rewards/rollout_reward_func/std": 14.067066192626953, "sampling/importance_sampling_ratio/max": 1.3290151357650757, "sampling/importance_sampling_ratio/mean": 0.9739052057266235, "sampling/importance_sampling_ratio/min": 0.38011765480041504, "sampling/sampling_logp_difference/max": 0.929356575012207, "sampling/sampling_logp_difference/mean": 0.010732135735452175, "step": 53, "step_time": 30.069013398000834 }, { "clip_ratio/high_max": 0.02651515230536461, "clip_ratio/high_mean": 0.006628788076341152, "clip_ratio/low_mean": 0.022904830053448677, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.029533618362620473, "entropy": 0.10777218686416745, "epoch": 0.00108, "grad_norm": 0.23905742168426514, "kl": 0.5194222312420607, "learning_rate": 9.999999400001624e-05, "loss": 0.002, "step": 54, "step_time": 7.081706939000014 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0014204545877873898, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027225379599258304, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 702.796875, "completions/mean_terminated_length": 702.796875, "completions/min_length": 614.0, "completions/min_terminated_length": 614.0, "entropy": 0.11523706745356321, "epoch": 0.0011, "frac_reward_zero_std": 0.0, "grad_norm": 0.500625491142273, "kl": 0.5581346470862627, "learning_rate": 9.999999331483292e-05, "loss": -0.0203, "num_tokens": 2818643.0, "reward": 3.496170997619629, "reward_std": 14.47857666015625, "rewards/rollout_reward_func/mean": 3.496170997619629, "rewards/rollout_reward_func/std": 14.920737266540527, "sampling/importance_sampling_ratio/max": 1.5530641078948975, "sampling/importance_sampling_ratio/mean": 1.0201001167297363, "sampling/importance_sampling_ratio/min": 0.5336768627166748, "sampling/sampling_logp_difference/max": 0.6660118103027344, "sampling/sampling_logp_difference/mean": 0.013495232909917831, "step": 55, "step_time": 28.81458637300034 }, { "clip_ratio/high_max": 0.036931819282472134, "clip_ratio/high_mean": 0.011837121681310236, "clip_ratio/low_mean": 0.02758049312978983, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03941761504393071, "entropy": 0.10888301394879818, "epoch": 0.00112, "grad_norm": 0.29490140080451965, "kl": 0.5603756010532379, "learning_rate": 9.999999259261268e-05, "loss": -0.0253, "step": 56, "step_time": 8.193311973000164 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 693.90625, "completions/mean_terminated_length": 693.90625, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "entropy": 0.12224696017801762, "epoch": 0.00114, "frac_reward_zero_std": 0.0, "grad_norm": 0.4371468722820282, "kl": 0.5304271820932627, "learning_rate": 9.99999918333555e-05, "loss": 0.0189, "num_tokens": 2916279.0, "reward": 3.36903715133667, "reward_std": 12.011173248291016, "rewards/rollout_reward_func/mean": 3.369036912918091, "rewards/rollout_reward_func/std": 12.399989128112793, "sampling/importance_sampling_ratio/max": 1.8561766147613525, "sampling/importance_sampling_ratio/mean": 1.0033948421478271, "sampling/importance_sampling_ratio/min": 0.3815801441669464, "sampling/sampling_logp_difference/max": 0.957763671875, "sampling/sampling_logp_difference/mean": 0.011768012307584286, "step": 57, "step_time": 28.027659202000223 }, { "clip_ratio/high_max": 0.05255681974813342, "clip_ratio/high_mean": 0.015743371564894915, "clip_ratio/low_mean": 0.019767993013374507, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0355113644618541, "entropy": 0.12679382599890232, "epoch": 0.00116, "grad_norm": 0.3022422790527344, "kl": 0.5225307196378708, "learning_rate": 9.999999103706142e-05, "loss": 0.015, "step": 58, "step_time": 8.72335070799977 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003906250116415322, "completions/clipped_ratio": 0.0, "completions/max_length": 844.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 681.265625, "completions/mean_terminated_length": 681.265625, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "entropy": 0.12399047752842307, "epoch": 0.00118, "frac_reward_zero_std": 0.0, "grad_norm": 0.6583297848701477, "kl": 0.5364211667329073, "learning_rate": 9.999999020373037e-05, "loss": 0.0117, "num_tokens": 3012934.0, "reward": 2.8170366287231445, "reward_std": 12.926514625549316, "rewards/rollout_reward_func/mean": 2.8170366287231445, "rewards/rollout_reward_func/std": 13.227665901184082, "sampling/importance_sampling_ratio/max": 2.4036612510681152, "sampling/importance_sampling_ratio/mean": 0.9975829720497131, "sampling/importance_sampling_ratio/min": 0.6259334683418274, "sampling/sampling_logp_difference/max": 0.720775842666626, "sampling/sampling_logp_difference/mean": 0.010457618162035942, "step": 59, "step_time": 28.9596313010004 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.011718750349245965, "clip_ratio/low_mean": 0.03385416732635349, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04557291779201478, "entropy": 0.11637644609436393, "epoch": 0.0012, "grad_norm": 1.9307663440704346, "kl": 1.8184253200888634, "learning_rate": 9.999998933336241e-05, "loss": 0.0213, "step": 60, "step_time": 7.307322721999981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0014204545877873898, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014204545877873898, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 685.78125, "completions/mean_terminated_length": 685.78125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.11613691644743085, "epoch": 0.00122, "frac_reward_zero_std": 0.0, "grad_norm": 0.44358986616134644, "kl": 0.5193471424281597, "learning_rate": 9.999998842595753e-05, "loss": -0.0024, "num_tokens": 3109806.0, "reward": 4.651793479919434, "reward_std": 12.063810348510742, "rewards/rollout_reward_func/mean": 4.651793479919434, "rewards/rollout_reward_func/std": 12.754688262939453, "sampling/importance_sampling_ratio/max": 1.6620776653289795, "sampling/importance_sampling_ratio/mean": 0.9981948137283325, "sampling/importance_sampling_ratio/min": 0.6313586831092834, "sampling/sampling_logp_difference/max": 0.4394187927246094, "sampling/sampling_logp_difference/mean": 0.009169764816761017, "step": 61, "step_time": 30.805930039000714 }, { "clip_ratio/high_max": 0.026988637167960405, "clip_ratio/high_mean": 0.01065340917557478, "clip_ratio/low_mean": 0.020951705169864, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03160511434543878, "entropy": 0.12027787417173386, "epoch": 0.00124, "grad_norm": 0.3839333951473236, "kl": 0.5386558780446649, "learning_rate": 9.999998748151572e-05, "loss": -0.0001, "step": 62, "step_time": 7.061834261000513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.0, "completions/max_length": 807.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 694.46875, "completions/mean_terminated_length": 694.46875, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "entropy": 0.13281571818515658, "epoch": 0.00126, "frac_reward_zero_std": 0.0, "grad_norm": 0.47250673174858093, "kl": 0.5621049534529448, "learning_rate": 9.999998650003696e-05, "loss": -0.0068, "num_tokens": 3207160.0, "reward": 4.072500705718994, "reward_std": 12.934675216674805, "rewards/rollout_reward_func/mean": 4.072500705718994, "rewards/rollout_reward_func/std": 13.5437650680542, "sampling/importance_sampling_ratio/max": 1.4505815505981445, "sampling/importance_sampling_ratio/mean": 1.0127054452896118, "sampling/importance_sampling_ratio/min": 0.644386887550354, "sampling/sampling_logp_difference/max": 0.46297478675842285, "sampling/sampling_logp_difference/mean": 0.01112096942961216, "step": 63, "step_time": 27.765410665000445 }, { "clip_ratio/high_max": 0.03645833441987634, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.025236743153072894, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03565340966451913, "entropy": 0.11304981098510325, "epoch": 0.00128, "grad_norm": 0.23461361229419708, "kl": 0.707372922450304, "learning_rate": 9.999998548152131e-05, "loss": -0.0107, "step": 64, "step_time": 9.65409977299987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 715.78125, "completions/mean_terminated_length": 715.78125, "completions/min_length": 618.0, "completions/min_terminated_length": 618.0, "entropy": 0.11744949175044894, "epoch": 0.0013, "frac_reward_zero_std": 0.0, "grad_norm": 1.5356135368347168, "kl": 2.483667228370905, "learning_rate": 9.999998442596872e-05, "loss": 0.0155, "num_tokens": 3305784.0, "reward": 3.657202959060669, "reward_std": 10.959955215454102, "rewards/rollout_reward_func/mean": 3.657203197479248, "rewards/rollout_reward_func/std": 12.17599105834961, "sampling/importance_sampling_ratio/max": 1.36454439163208, "sampling/importance_sampling_ratio/mean": 1.0064573287963867, "sampling/importance_sampling_ratio/min": 0.6259024739265442, "sampling/sampling_logp_difference/max": 0.4463231563568115, "sampling/sampling_logp_difference/mean": 0.008584607392549515, "step": 65, "step_time": 29.066453170999694 }, { "clip_ratio/high_max": 0.031250000931322575, "clip_ratio/high_mean": 0.010416666860692203, "clip_ratio/low_mean": 0.02178030402865261, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.032196971122175455, "entropy": 0.14252985129132867, "epoch": 0.00132, "grad_norm": 0.2563531696796417, "kl": 0.6245546955615282, "learning_rate": 9.999998333337922e-05, "loss": -0.0004, "step": 66, "step_time": 8.02741467600049 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0014204545877873898, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014204545877873898, "completions/clipped_ratio": 0.0, "completions/max_length": 833.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 678.890625, "completions/mean_terminated_length": 678.890625, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "entropy": 0.15250376611948013, "epoch": 0.00134, "frac_reward_zero_std": 0.0, "grad_norm": 0.5203387141227722, "kl": 0.695558762177825, "learning_rate": 9.999998220375278e-05, "loss": -0.0145, "num_tokens": 3401864.0, "reward": 1.054423451423645, "reward_std": 11.31953239440918, "rewards/rollout_reward_func/mean": 1.054423451423645, "rewards/rollout_reward_func/std": 12.172701835632324, "sampling/importance_sampling_ratio/max": 1.2194569110870361, "sampling/importance_sampling_ratio/mean": 0.9876125454902649, "sampling/importance_sampling_ratio/min": 0.550414502620697, "sampling/sampling_logp_difference/max": 0.5297477841377258, "sampling/sampling_logp_difference/mean": 0.008570928126573563, "step": 67, "step_time": 30.060426205000795 }, { "clip_ratio/high_max": 0.05965909268707037, "clip_ratio/high_mean": 0.018939394736662507, "clip_ratio/low_mean": 0.03338068269658834, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05232007708400488, "entropy": 0.17216729745268822, "epoch": 0.00136, "grad_norm": 0.27647653222084045, "kl": 0.6303851045668125, "learning_rate": 9.999998103708944e-05, "loss": -0.0169, "step": 68, "step_time": 7.55158718300072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0015625000232830644, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015625000232830644, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 692.171875, "completions/mean_terminated_length": 692.171875, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "entropy": 0.17608004808425903, "epoch": 0.00138, "frac_reward_zero_std": 0.0, "grad_norm": 0.40714362263679504, "kl": 0.5837149824947119, "learning_rate": 9.999997983338918e-05, "loss": 0.0075, "num_tokens": 3498494.0, "reward": 4.154041290283203, "reward_std": 15.997432708740234, "rewards/rollout_reward_func/mean": 4.154041290283203, "rewards/rollout_reward_func/std": 18.081926345825195, "sampling/importance_sampling_ratio/max": 1.240838885307312, "sampling/importance_sampling_ratio/mean": 0.9964578747749329, "sampling/importance_sampling_ratio/min": 0.756720781326294, "sampling/sampling_logp_difference/max": 0.326712965965271, "sampling/sampling_logp_difference/mean": 0.009719014167785645, "step": 69, "step_time": 28.760111235000295 }, { "clip_ratio/high_max": 0.04450757708400488, "clip_ratio/high_mean": 0.013731061248108745, "clip_ratio/low_mean": 0.016698232851922512, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.030429294100031257, "entropy": 0.1826375536620617, "epoch": 0.0014, "grad_norm": 0.4711000919342041, "kl": 0.5743975602090359, "learning_rate": 9.999997859265198e-05, "loss": 0.0045, "step": 70, "step_time": 8.15129353600014 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 695.796875, "completions/mean_terminated_length": 695.796875, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.20647307951003313, "epoch": 0.00142, "frac_reward_zero_std": 0.0, "grad_norm": 0.3142479956150055, "kl": 0.5285101179033518, "learning_rate": 9.999997731487787e-05, "loss": -0.0177, "num_tokens": 3595387.0, "reward": 2.402292490005493, "reward_std": 13.013188362121582, "rewards/rollout_reward_func/mean": 2.402292251586914, "rewards/rollout_reward_func/std": 13.636407852172852, "sampling/importance_sampling_ratio/max": 1.3384240865707397, "sampling/importance_sampling_ratio/mean": 1.011613368988037, "sampling/importance_sampling_ratio/min": 0.776378870010376, "sampling/sampling_logp_difference/max": 0.2462749481201172, "sampling/sampling_logp_difference/mean": 0.009866164065897465, "step": 71, "step_time": 28.09002854999926 }, { "clip_ratio/high_max": 0.043560607358813286, "clip_ratio/high_mean": 0.013494318816810846, "clip_ratio/low_mean": 0.012428977759554982, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.025923296343535185, "entropy": 0.20543431770056486, "epoch": 0.00144, "grad_norm": 0.23259234428405762, "kl": 0.5239685252308846, "learning_rate": 9.999997600006685e-05, "loss": -0.0218, "step": 72, "step_time": 8.672955195000668 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 704.984375, "completions/mean_terminated_length": 704.984375, "completions/min_length": 533.0, "completions/min_terminated_length": 533.0, "entropy": 0.20190842729061842, "epoch": 0.00146, "frac_reward_zero_std": 0.0, "grad_norm": 0.3707256019115448, "kl": 0.5400361772626638, "learning_rate": 9.999997464821892e-05, "loss": 0.006, "num_tokens": 3692772.0, "reward": 2.049668073654175, "reward_std": 15.488001823425293, "rewards/rollout_reward_func/mean": 2.049668073654175, "rewards/rollout_reward_func/std": 15.380194664001465, "sampling/importance_sampling_ratio/max": 1.1559480428695679, "sampling/importance_sampling_ratio/mean": 0.970598578453064, "sampling/importance_sampling_ratio/min": 0.6524748802185059, "sampling/sampling_logp_difference/max": 0.35463929176330566, "sampling/sampling_logp_difference/mean": 0.009403295814990997, "step": 73, "step_time": 28.82742140900018 }, { "clip_ratio/high_max": 0.06818182021379471, "clip_ratio/high_mean": 0.018347538425587118, "clip_ratio/low_mean": 0.02402935700956732, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04237689543515444, "entropy": 0.2009204039350152, "epoch": 0.00148, "grad_norm": 0.2297271341085434, "kl": 0.5404210295528173, "learning_rate": 9.999997325933408e-05, "loss": 0.001, "step": 74, "step_time": 7.489119195001422 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0015625000232830644, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666767559946, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 688.1875, "completions/mean_terminated_length": 688.1875, "completions/min_length": 607.0, "completions/min_terminated_length": 607.0, "entropy": 0.19008919596672058, "epoch": 0.0015, "frac_reward_zero_std": 0.0, "grad_norm": 0.39288049936294556, "kl": 0.5065996870398521, "learning_rate": 9.999997183341232e-05, "loss": -0.0174, "num_tokens": 3789251.0, "reward": 5.712855339050293, "reward_std": 12.491518020629883, "rewards/rollout_reward_func/mean": 5.712855339050293, "rewards/rollout_reward_func/std": 13.803718566894531, "sampling/importance_sampling_ratio/max": 1.3862897157669067, "sampling/importance_sampling_ratio/mean": 0.9820230007171631, "sampling/importance_sampling_ratio/min": 0.7251328825950623, "sampling/sampling_logp_difference/max": 0.38344359397888184, "sampling/sampling_logp_difference/mean": 0.011255129240453243, "step": 75, "step_time": 29.904527067999425 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.014322916977107525, "clip_ratio/low_mean": 0.03125000069849193, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04557291744276881, "entropy": 0.1766198892146349, "epoch": 0.00152, "grad_norm": 0.24856555461883545, "kl": 0.5580815225839615, "learning_rate": 9.999997037045364e-05, "loss": -0.0236, "step": 76, "step_time": 7.936869918999946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 704.921875, "completions/mean_terminated_length": 704.921875, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "entropy": 0.1587599003687501, "epoch": 0.00154, "frac_reward_zero_std": 0.0, "grad_norm": 0.5212773680686951, "kl": 0.5196739248931408, "learning_rate": 9.999996887045807e-05, "loss": -0.0035, "num_tokens": 3886377.0, "reward": 4.441685199737549, "reward_std": 10.929279327392578, "rewards/rollout_reward_func/mean": 4.441685676574707, "rewards/rollout_reward_func/std": 12.737987518310547, "sampling/importance_sampling_ratio/max": 1.4177803993225098, "sampling/importance_sampling_ratio/mean": 0.9960745573043823, "sampling/importance_sampling_ratio/min": 0.6403241157531738, "sampling/sampling_logp_difference/max": 0.35891127586364746, "sampling/sampling_logp_difference/mean": 0.009403642266988754, "step": 77, "step_time": 29.23442492700042 }, { "clip_ratio/high_max": 0.03787878900766373, "clip_ratio/high_mean": 0.009469697251915932, "clip_ratio/low_mean": 0.02260890230536461, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03207859944086522, "entropy": 0.14319165889173746, "epoch": 0.00156, "grad_norm": 0.21223606169223785, "kl": 0.6083459779620171, "learning_rate": 9.999996733342559e-05, "loss": -0.0046, "step": 78, "step_time": 9.08587798599865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 699.296875, "completions/mean_terminated_length": 699.296875, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "entropy": 0.13262670719996095, "epoch": 0.00158, "frac_reward_zero_std": 0.0, "grad_norm": 0.3686180114746094, "kl": 0.5813394356518984, "learning_rate": 9.99999657593562e-05, "loss": 0.0239, "num_tokens": 3983088.0, "reward": 4.565882682800293, "reward_std": 10.690776824951172, "rewards/rollout_reward_func/mean": 4.565882682800293, "rewards/rollout_reward_func/std": 10.94388484954834, "sampling/importance_sampling_ratio/max": 2.301131010055542, "sampling/importance_sampling_ratio/mean": 1.038649559020996, "sampling/importance_sampling_ratio/min": 0.6781718730926514, "sampling/sampling_logp_difference/max": 0.7350552082061768, "sampling/sampling_logp_difference/mean": 0.009047108702361584, "step": 79, "step_time": 29.03609049600027 }, { "clip_ratio/high_max": 0.0691287899389863, "clip_ratio/high_mean": 0.02249053120613098, "clip_ratio/low_mean": 0.017282197484746575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03977272880729288, "entropy": 0.1365647497586906, "epoch": 0.0016, "grad_norm": 0.26296547055244446, "kl": 0.5871373657137156, "learning_rate": 9.99999641482499e-05, "loss": 0.0196, "step": 80, "step_time": 8.78226529199901 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.0, "completions/max_length": 817.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 704.28125, "completions/mean_terminated_length": 704.28125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.13410852942615747, "epoch": 0.00162, "frac_reward_zero_std": 0.0, "grad_norm": 0.6391101479530334, "kl": 0.49469868279993534, "learning_rate": 9.999996250010672e-05, "loss": -0.0038, "num_tokens": 4080648.0, "reward": 5.768660545349121, "reward_std": 10.985546112060547, "rewards/rollout_reward_func/mean": 5.768660068511963, "rewards/rollout_reward_func/std": 11.962743759155273, "sampling/importance_sampling_ratio/max": 1.4244225025177002, "sampling/importance_sampling_ratio/mean": 1.0141850709915161, "sampling/importance_sampling_ratio/min": 0.6568657755851746, "sampling/sampling_logp_difference/max": 0.3986610174179077, "sampling/sampling_logp_difference/mean": 0.009017249569296837, "step": 81, "step_time": 29.058386802999394 }, { "clip_ratio/high_max": 0.06250000186264515, "clip_ratio/high_mean": 0.016927083721384406, "clip_ratio/low_mean": 0.025213068933226168, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04214015288744122, "entropy": 0.1406740453094244, "epoch": 0.00164, "grad_norm": 0.3618878424167633, "kl": 0.5326054207980633, "learning_rate": 9.99999608149266e-05, "loss": -0.0092, "step": 82, "step_time": 7.4923771910011965 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 690.765625, "completions/mean_terminated_length": 690.765625, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "entropy": 0.13702308759093285, "epoch": 0.00166, "frac_reward_zero_std": 0.0, "grad_norm": 0.7054314017295837, "kl": 0.5327184200286865, "learning_rate": 9.999995909270962e-05, "loss": 0.0131, "num_tokens": 4176944.0, "reward": 6.398343563079834, "reward_std": 12.486600875854492, "rewards/rollout_reward_func/mean": 6.398343563079834, "rewards/rollout_reward_func/std": 13.118927955627441, "sampling/importance_sampling_ratio/max": 1.1626012325286865, "sampling/importance_sampling_ratio/mean": 0.9923787117004395, "sampling/importance_sampling_ratio/min": 0.6767197847366333, "sampling/sampling_logp_difference/max": 0.27681541442871094, "sampling/sampling_logp_difference/mean": 0.007814774289727211, "step": 83, "step_time": 30.334892443000626 }, { "clip_ratio/high_max": 0.052083334885537624, "clip_ratio/high_mean": 0.014322917093522847, "clip_ratio/low_mean": 0.02935606148093939, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04367897834163159, "entropy": 0.13890184368938208, "epoch": 0.00168, "grad_norm": 0.23295485973358154, "kl": 0.583111148327589, "learning_rate": 9.999995733345573e-05, "loss": 0.0096, "step": 84, "step_time": 8.188888645999668 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0014204545877873898, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027225379599258304, "completions/clipped_ratio": 0.0, "completions/max_length": 847.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 709.140625, "completions/mean_terminated_length": 709.140625, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "entropy": 0.1646800385788083, "epoch": 0.0017, "frac_reward_zero_std": 0.0, "grad_norm": 0.721032977104187, "kl": 0.5625268556177616, "learning_rate": 9.999995553716494e-05, "loss": -0.003, "num_tokens": 4273965.0, "reward": 5.8386735916137695, "reward_std": 13.300103187561035, "rewards/rollout_reward_func/mean": 5.8386735916137695, "rewards/rollout_reward_func/std": 13.629975318908691, "sampling/importance_sampling_ratio/max": 1.314743995666504, "sampling/importance_sampling_ratio/mean": 1.0051491260528564, "sampling/importance_sampling_ratio/min": 0.7047513127326965, "sampling/sampling_logp_difference/max": 0.2584061622619629, "sampling/sampling_logp_difference/mean": 0.009669218212366104, "step": 85, "step_time": 28.418585942000846 }, { "clip_ratio/high_max": 0.06912878947332501, "clip_ratio/high_mean": 0.019886364112608135, "clip_ratio/low_mean": 0.04139046813361347, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.061276831780560315, "entropy": 0.16476231161504984, "epoch": 0.00172, "grad_norm": 0.3757534921169281, "kl": 0.6113345008343458, "learning_rate": 9.999995370383726e-05, "loss": -0.0069, "step": 86, "step_time": 8.756163650000417 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00414299254771322, "completions/clipped_ratio": 0.0, "completions/max_length": 823.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 684.84375, "completions/mean_terminated_length": 684.84375, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "entropy": 0.15467680245637894, "epoch": 0.00174, "frac_reward_zero_std": 0.0, "grad_norm": 0.39151129126548767, "kl": 0.5371765866875648, "learning_rate": 9.999995183347267e-05, "loss": 0.0105, "num_tokens": 4369299.0, "reward": 5.963912010192871, "reward_std": 12.684013366699219, "rewards/rollout_reward_func/mean": 5.963912010192871, "rewards/rollout_reward_func/std": 13.017167091369629, "sampling/importance_sampling_ratio/max": 1.2570720911026, "sampling/importance_sampling_ratio/mean": 1.0000150203704834, "sampling/importance_sampling_ratio/min": 0.6576955914497375, "sampling/sampling_logp_difference/max": 0.23494529724121094, "sampling/sampling_logp_difference/mean": 0.009037286043167114, "step": 87, "step_time": 27.959497561998433 }, { "clip_ratio/high_max": 0.04876894038170576, "clip_ratio/high_mean": 0.014796401956118643, "clip_ratio/low_mean": 0.030184660223312676, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04498106241226196, "entropy": 0.1502314694225788, "epoch": 0.00176, "grad_norm": 0.24433566629886627, "kl": 0.518398828804493, "learning_rate": 9.999994992607121e-05, "loss": 0.0052, "step": 88, "step_time": 6.98385682199978 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 674.84375, "completions/mean_terminated_length": 674.84375, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "entropy": 0.1636304627172649, "epoch": 0.00178, "frac_reward_zero_std": 0.0, "grad_norm": 0.4070056080818176, "kl": 0.44829913787543774, "learning_rate": 9.999994798163285e-05, "loss": 0.0028, "num_tokens": 4464636.0, "reward": 4.596271991729736, "reward_std": 12.002615928649902, "rewards/rollout_reward_func/mean": 4.5962724685668945, "rewards/rollout_reward_func/std": 12.03700065612793, "sampling/importance_sampling_ratio/max": 1.8310773372650146, "sampling/importance_sampling_ratio/mean": 1.0015285015106201, "sampling/importance_sampling_ratio/min": 0.6802361011505127, "sampling/sampling_logp_difference/max": 0.63387131690979, "sampling/sampling_logp_difference/mean": 0.01002519205212593, "step": 89, "step_time": 29.20652451100068 }, { "clip_ratio/high_max": 0.053503789473325014, "clip_ratio/high_mean": 0.014678030740469694, "clip_ratio/low_mean": 0.014914773171767592, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02959280402865261, "entropy": 0.16931697819381952, "epoch": 0.0018, "grad_norm": 0.22567316889762878, "kl": 0.45854073390364647, "learning_rate": 9.999994600015763e-05, "loss": -0.0044, "step": 90, "step_time": 7.806028198999684 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003906250116415322, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 690.921875, "completions/mean_terminated_length": 690.921875, "completions/min_length": 573.0, "completions/min_terminated_length": 573.0, "entropy": 0.17555938381701708, "epoch": 0.00182, "frac_reward_zero_std": 0.0, "grad_norm": 0.6369052529335022, "kl": 0.5130380634218454, "learning_rate": 9.99999439816455e-05, "loss": 0.0097, "num_tokens": 4560466.0, "reward": 4.266380786895752, "reward_std": 8.932316780090332, "rewards/rollout_reward_func/mean": 4.266380786895752, "rewards/rollout_reward_func/std": 9.506205558776855, "sampling/importance_sampling_ratio/max": 1.4317197799682617, "sampling/importance_sampling_ratio/mean": 0.9800074100494385, "sampling/importance_sampling_ratio/min": 0.6640469431877136, "sampling/sampling_logp_difference/max": 0.39695852994918823, "sampling/sampling_logp_difference/mean": 0.011851027607917786, "step": 91, "step_time": 30.03264877599986 }, { "clip_ratio/high_max": 0.07812500186264515, "clip_ratio/high_mean": 0.027343750349245965, "clip_ratio/low_mean": 0.02604166732635349, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05338541814126074, "entropy": 0.16226398199796677, "epoch": 0.00184, "grad_norm": 0.49021783471107483, "kl": 0.678026232868433, "learning_rate": 9.999994192609649e-05, "loss": 0.0008, "step": 92, "step_time": 9.04058756600034 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003906250116415322, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 706.40625, "completions/mean_terminated_length": 706.40625, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "entropy": 0.1653224742040038, "epoch": 0.00186, "frac_reward_zero_std": 0.0, "grad_norm": 0.5930750966072083, "kl": 0.533378497697413, "learning_rate": 9.999993983351059e-05, "loss": 0.0049, "num_tokens": 4657400.0, "reward": 4.687631607055664, "reward_std": 12.176762580871582, "rewards/rollout_reward_func/mean": 4.687631607055664, "rewards/rollout_reward_func/std": 13.946465492248535, "sampling/importance_sampling_ratio/max": 2.0640549659729004, "sampling/importance_sampling_ratio/mean": 1.0510772466659546, "sampling/importance_sampling_ratio/min": 0.6677830219268799, "sampling/sampling_logp_difference/max": 0.5909380912780762, "sampling/sampling_logp_difference/mean": 0.01191171444952488, "step": 93, "step_time": 28.226474250999672 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.011718750349245965, "clip_ratio/low_mean": 0.022135417442768812, "clip_ratio/low_min": 0.0052083334885537624, "clip_ratio/region_mean": 0.033854167675599456, "entropy": 0.15958264330402017, "epoch": 0.00188, "grad_norm": 0.35930758714675903, "kl": 0.7466034032404423, "learning_rate": 9.999993770388783e-05, "loss": 0.0032, "step": 94, "step_time": 8.234778083000037 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 697.828125, "completions/mean_terminated_length": 697.828125, "completions/min_length": 623.0, "completions/min_terminated_length": 623.0, "entropy": 0.16395951714366674, "epoch": 0.0019, "frac_reward_zero_std": 0.0, "grad_norm": 0.30130699276924133, "kl": 0.4953720346093178, "learning_rate": 9.99999355372282e-05, "loss": 0.0087, "num_tokens": 4753836.0, "reward": 4.204550743103027, "reward_std": 11.951547622680664, "rewards/rollout_reward_func/mean": 4.204550743103027, "rewards/rollout_reward_func/std": 13.192495346069336, "sampling/importance_sampling_ratio/max": 1.7262465953826904, "sampling/importance_sampling_ratio/mean": 1.0100435018539429, "sampling/importance_sampling_ratio/min": 0.6937407851219177, "sampling/sampling_logp_difference/max": 0.5034514665603638, "sampling/sampling_logp_difference/mean": 0.008402319625020027, "step": 95, "step_time": 29.812369647001105 }, { "clip_ratio/high_max": 0.03645833441987634, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.009114583604969084, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01953125058207661, "entropy": 0.18544823909178376, "epoch": 0.00192, "grad_norm": 0.2023509442806244, "kl": 0.44245083443820477, "learning_rate": 9.999993333353168e-05, "loss": 0.0061, "step": 96, "step_time": 7.195571093998296 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 687.921875, "completions/mean_terminated_length": 687.921875, "completions/min_length": 596.0, "completions/min_terminated_length": 596.0, "entropy": 0.20835321862250566, "epoch": 0.00194, "frac_reward_zero_std": 0.0, "grad_norm": 0.35837119817733765, "kl": 0.4389466196298599, "learning_rate": 9.999993109279828e-05, "loss": 0.0044, "num_tokens": 4849131.0, "reward": 3.880918264389038, "reward_std": 8.090033531188965, "rewards/rollout_reward_func/mean": 3.880918264389038, "rewards/rollout_reward_func/std": 9.26294231414795, "sampling/importance_sampling_ratio/max": 1.2344332933425903, "sampling/importance_sampling_ratio/mean": 0.9644654989242554, "sampling/importance_sampling_ratio/min": 0.7370292544364929, "sampling/sampling_logp_difference/max": 0.29116082191467285, "sampling/sampling_logp_difference/mean": 0.009601429104804993, "step": 97, "step_time": 30.270599251999556 }, { "clip_ratio/high_max": 0.052083334885537624, "clip_ratio/high_mean": 0.016927083721384406, "clip_ratio/low_mean": 0.014559659757651389, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031486743479035795, "entropy": 0.21354177221655846, "epoch": 0.00196, "grad_norm": 0.20351360738277435, "kl": 0.43841097690165043, "learning_rate": 9.999992881502804e-05, "loss": 0.0004, "step": 98, "step_time": 7.506363271999817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 706.140625, "completions/mean_terminated_length": 706.140625, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "entropy": 0.21307788416743279, "epoch": 0.00198, "frac_reward_zero_std": 0.0, "grad_norm": 0.4080103039741516, "kl": 0.5334641952067614, "learning_rate": 9.99999265002209e-05, "loss": -0.003, "num_tokens": 4945915.0, "reward": 5.200403213500977, "reward_std": 14.344334602355957, "rewards/rollout_reward_func/mean": 5.200403213500977, "rewards/rollout_reward_func/std": 14.294367790222168, "sampling/importance_sampling_ratio/max": 1.2247880697250366, "sampling/importance_sampling_ratio/mean": 1.0129998922348022, "sampling/importance_sampling_ratio/min": 0.7771543860435486, "sampling/sampling_logp_difference/max": 0.23006606101989746, "sampling/sampling_logp_difference/mean": 0.00854739174246788, "step": 99, "step_time": 29.33993570499979 }, { "clip_ratio/high_max": 0.015625000465661287, "clip_ratio/high_mean": 0.006510416744276881, "clip_ratio/low_mean": 0.023555872030556202, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.030066288774833083, "entropy": 0.21015852224081755, "epoch": 0.002, "grad_norm": 0.2297798991203308, "kl": 0.5835338849574327, "learning_rate": 9.999992414837691e-05, "loss": -0.008, "step": 100, "step_time": 8.775622698999086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 837.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 711.671875, "completions/mean_terminated_length": 711.671875, "completions/min_length": 616.0, "completions/min_terminated_length": 616.0, "entropy": 0.2137407148256898, "epoch": 0.00202, "frac_reward_zero_std": 0.0, "grad_norm": 0.4429977834224701, "kl": 0.4639507979154587, "learning_rate": 9.999992175949606e-05, "loss": -0.0173, "num_tokens": 5042733.0, "reward": 3.351179838180542, "reward_std": 8.503268241882324, "rewards/rollout_reward_func/mean": 3.351179838180542, "rewards/rollout_reward_func/std": 8.948554039001465, "sampling/importance_sampling_ratio/max": 1.328324556350708, "sampling/importance_sampling_ratio/mean": 1.0001481771469116, "sampling/importance_sampling_ratio/min": 0.5792597532272339, "sampling/sampling_logp_difference/max": 0.4302701950073242, "sampling/sampling_logp_difference/mean": 0.008802896365523338, "step": 101, "step_time": 29.50366010900052 }, { "clip_ratio/high_max": 0.0572916679084301, "clip_ratio/high_mean": 0.02083333407063037, "clip_ratio/low_mean": 0.021188447950407863, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04202178155537695, "entropy": 0.19818230718374252, "epoch": 0.00204, "grad_norm": 0.228831484913826, "kl": 0.523833503946662, "learning_rate": 9.999991933357836e-05, "loss": -0.0238, "step": 102, "step_time": 7.743058271999871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 679.5, "completions/mean_terminated_length": 679.5, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "entropy": 0.16811883123591542, "epoch": 0.00206, "frac_reward_zero_std": 0.0, "grad_norm": 0.2717866897583008, "kl": 0.5116975158452988, "learning_rate": 9.999991687062378e-05, "loss": 0.0026, "num_tokens": 5137485.0, "reward": 3.233732223510742, "reward_std": 12.289377212524414, "rewards/rollout_reward_func/mean": 3.233732223510742, "rewards/rollout_reward_func/std": 14.167500495910645, "sampling/importance_sampling_ratio/max": 1.1452041864395142, "sampling/importance_sampling_ratio/mean": 0.9949536323547363, "sampling/importance_sampling_ratio/min": 0.8263934254646301, "sampling/sampling_logp_difference/max": 0.11179852485656738, "sampling/sampling_logp_difference/mean": 0.00560589786618948, "step": 103, "step_time": 28.410943980999036 }, { "clip_ratio/high_max": 0.03645833441987634, "clip_ratio/high_mean": 0.009114583604969084, "clip_ratio/low_mean": 0.036576704937033355, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04569128877483308, "entropy": 0.1520394361577928, "epoch": 0.00208, "grad_norm": 0.1855272352695465, "kl": 0.548751313239336, "learning_rate": 9.999991437063234e-05, "loss": -0.0007, "step": 104, "step_time": 7.630572153999765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.0, "completions/max_length": 821.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 689.109375, "completions/mean_terminated_length": 689.109375, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "entropy": 0.15672127809375525, "epoch": 0.0021, "frac_reward_zero_std": 0.0, "grad_norm": 0.45729750394821167, "kl": 0.6510039251297712, "learning_rate": 9.999991183360407e-05, "loss": -0.011, "num_tokens": 5232831.0, "reward": 4.220555305480957, "reward_std": 10.952154159545898, "rewards/rollout_reward_func/mean": 4.220555305480957, "rewards/rollout_reward_func/std": 11.161866188049316, "sampling/importance_sampling_ratio/max": 1.289732813835144, "sampling/importance_sampling_ratio/mean": 0.9952840209007263, "sampling/importance_sampling_ratio/min": 0.6639890074729919, "sampling/sampling_logp_difference/max": 0.4248615503311157, "sampling/sampling_logp_difference/mean": 0.009283961728215218, "step": 105, "step_time": 29.269957731999057 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.006510416860692203, "clip_ratio/low_mean": 0.015625000232830644, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022135417093522847, "entropy": 0.15183987142518163, "epoch": 0.00212, "grad_norm": 0.19675187766551971, "kl": 0.7388164456933737, "learning_rate": 9.999990925953892e-05, "loss": -0.0165, "step": 106, "step_time": 7.576425396001014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.0, "completions/max_length": 817.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 695.0625, "completions/mean_terminated_length": 695.0625, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "entropy": 0.14494483266025782, "epoch": 0.00214, "frac_reward_zero_std": 0.0, "grad_norm": 0.4538170397281647, "kl": 0.678084384649992, "learning_rate": 9.999990664843695e-05, "loss": 0.0147, "num_tokens": 5328578.0, "reward": 9.525361061096191, "reward_std": 13.358152389526367, "rewards/rollout_reward_func/mean": 9.525361061096191, "rewards/rollout_reward_func/std": 14.251992225646973, "sampling/importance_sampling_ratio/max": 1.1812809705734253, "sampling/importance_sampling_ratio/mean": 0.9926539659500122, "sampling/importance_sampling_ratio/min": 0.7029387950897217, "sampling/sampling_logp_difference/max": 0.35564422607421875, "sampling/sampling_logp_difference/mean": 0.007083391770720482, "step": 107, "step_time": 28.04022229299926 }, { "clip_ratio/high_max": 0.046875000931322575, "clip_ratio/high_mean": 0.015625000349245965, "clip_ratio/low_mean": 0.015861742896959186, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03148674312978983, "entropy": 0.1589709185063839, "epoch": 0.00216, "grad_norm": 0.22844459116458893, "kl": 0.6251159347593784, "learning_rate": 9.999990400029812e-05, "loss": 0.0106, "step": 108, "step_time": 8.196292393000022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 703.75, "completions/mean_terminated_length": 703.75, "completions/min_length": 522.0, "completions/min_terminated_length": 522.0, "entropy": 0.17329717054963112, "epoch": 0.00218, "frac_reward_zero_std": 0.0, "grad_norm": 0.3682776987552643, "kl": 0.6051198206841946, "learning_rate": 9.999990131512245e-05, "loss": 0.0061, "num_tokens": 5424927.0, "reward": 6.206368923187256, "reward_std": 10.578010559082031, "rewards/rollout_reward_func/mean": 6.206368923187256, "rewards/rollout_reward_func/std": 11.067666053771973, "sampling/importance_sampling_ratio/max": 1.4831089973449707, "sampling/importance_sampling_ratio/mean": 1.002763271331787, "sampling/importance_sampling_ratio/min": 0.7234499454498291, "sampling/sampling_logp_difference/max": 0.3583219051361084, "sampling/sampling_logp_difference/mean": 0.007746794261038303, "step": 109, "step_time": 27.809638274999543 }, { "clip_ratio/high_max": 0.026988637167960405, "clip_ratio/high_mean": 0.010653409641236067, "clip_ratio/low_mean": 0.014441288309171796, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.025094697950407863, "entropy": 0.17460143100470304, "epoch": 0.0022, "grad_norm": 0.1794712245464325, "kl": 0.6243367586284876, "learning_rate": 9.999989859290995e-05, "loss": 0.0027, "step": 110, "step_time": 7.0755484739993335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 697.578125, "completions/mean_terminated_length": 697.578125, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "entropy": 0.17424820829182863, "epoch": 0.00222, "frac_reward_zero_std": 0.0, "grad_norm": 0.42331627011299133, "kl": 0.586581215262413, "learning_rate": 9.99998958336606e-05, "loss": -0.0044, "num_tokens": 5520852.0, "reward": 3.5279414653778076, "reward_std": 14.582866668701172, "rewards/rollout_reward_func/mean": 3.5279414653778076, "rewards/rollout_reward_func/std": 15.890913963317871, "sampling/importance_sampling_ratio/max": 1.2239214181900024, "sampling/importance_sampling_ratio/mean": 0.9994624853134155, "sampling/importance_sampling_ratio/min": 0.6852503418922424, "sampling/sampling_logp_difference/max": 0.31956130266189575, "sampling/sampling_logp_difference/mean": 0.006933148950338364, "step": 111, "step_time": 29.204085013999247 }, { "clip_ratio/high_max": 0.02651515230536461, "clip_ratio/high_mean": 0.006628788076341152, "clip_ratio/low_mean": 0.018129006726667285, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02475779491942376, "entropy": 0.1639441135339439, "epoch": 0.00224, "grad_norm": 0.19418354332447052, "kl": 0.650929281488061, "learning_rate": 9.999989303737441e-05, "loss": -0.0109, "step": 112, "step_time": 7.643361527999332 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 693.09375, "completions/mean_terminated_length": 693.09375, "completions/min_length": 630.0, "completions/min_terminated_length": 630.0, "entropy": 0.14537212159484625, "epoch": 0.00226, "frac_reward_zero_std": 0.0, "grad_norm": 0.4737316071987152, "kl": 0.669768800958991, "learning_rate": 9.99998902040514e-05, "loss": 0.0169, "num_tokens": 5616460.0, "reward": 3.8732333183288574, "reward_std": 9.794268608093262, "rewards/rollout_reward_func/mean": 3.8732333183288574, "rewards/rollout_reward_func/std": 10.40365982055664, "sampling/importance_sampling_ratio/max": 1.1869113445281982, "sampling/importance_sampling_ratio/mean": 0.9964576959609985, "sampling/importance_sampling_ratio/min": 0.5200645923614502, "sampling/sampling_logp_difference/max": 0.6150112152099609, "sampling/sampling_logp_difference/mean": 0.007128065451979637, "step": 113, "step_time": 27.900884353000038 }, { "clip_ratio/high_max": 0.042140152771025896, "clip_ratio/high_mean": 0.011837121681310236, "clip_ratio/low_mean": 0.006510416860692203, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01834753854200244, "entropy": 0.14897123211994767, "epoch": 0.00228, "grad_norm": 0.20660799741744995, "kl": 0.7189689762890339, "learning_rate": 9.999988733369157e-05, "loss": 0.0137, "step": 114, "step_time": 7.532232160000149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 828.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 689.0625, "completions/mean_terminated_length": 689.0625, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "entropy": 0.16697307769209146, "epoch": 0.0023, "frac_reward_zero_std": 0.0, "grad_norm": 0.37319666147232056, "kl": 0.6085000336170197, "learning_rate": 9.999988442629488e-05, "loss": -0.015, "num_tokens": 5711756.0, "reward": 3.845529079437256, "reward_std": 9.702705383300781, "rewards/rollout_reward_func/mean": 3.845529079437256, "rewards/rollout_reward_func/std": 9.905435562133789, "sampling/importance_sampling_ratio/max": 1.3216222524642944, "sampling/importance_sampling_ratio/mean": 1.0128694772720337, "sampling/importance_sampling_ratio/min": 0.7146333456039429, "sampling/sampling_logp_difference/max": 0.3742462396621704, "sampling/sampling_logp_difference/mean": 0.006911748554557562, "step": 115, "step_time": 29.116642522001257 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.011718750349245965, "clip_ratio/low_mean": 0.020951705169864, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.032670455519109964, "entropy": 0.17214004416018724, "epoch": 0.00232, "grad_norm": 0.19411630928516388, "kl": 0.6454576198011637, "learning_rate": 9.99998814818614e-05, "loss": -0.0191, "step": 116, "step_time": 7.846242159999747 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 674.78125, "completions/mean_terminated_length": 674.78125, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "entropy": 0.16358821745961905, "epoch": 0.00234, "frac_reward_zero_std": 0.0, "grad_norm": 0.3246734142303467, "kl": 0.5800395030528307, "learning_rate": 9.999987850039107e-05, "loss": 0.0099, "num_tokens": 5806145.0, "reward": 1.2733659744262695, "reward_std": 12.069713592529297, "rewards/rollout_reward_func/mean": 1.2733662128448486, "rewards/rollout_reward_func/std": 12.829185485839844, "sampling/importance_sampling_ratio/max": 1.306739330291748, "sampling/importance_sampling_ratio/mean": 1.0012977123260498, "sampling/importance_sampling_ratio/min": 0.8135073781013489, "sampling/sampling_logp_difference/max": 0.19866454601287842, "sampling/sampling_logp_difference/mean": 0.006336529273539782, "step": 117, "step_time": 27.930649275999258 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.006510416860692203, "clip_ratio/low_mean": 0.013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01953125058207661, "entropy": 0.16348634008318186, "epoch": 0.00236, "grad_norm": 0.11877016723155975, "kl": 0.587722685188055, "learning_rate": 9.999987548188396e-05, "loss": 0.0055, "step": 118, "step_time": 7.173724952000157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 685.921875, "completions/mean_terminated_length": 685.921875, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "entropy": 0.17666231095790863, "epoch": 0.00238, "frac_reward_zero_std": 0.0, "grad_norm": 0.2972959578037262, "kl": 0.5733677446842194, "learning_rate": 9.999987242634001e-05, "loss": 0.0156, "num_tokens": 5901319.0, "reward": 6.098433494567871, "reward_std": 11.96851921081543, "rewards/rollout_reward_func/mean": 6.098433494567871, "rewards/rollout_reward_func/std": 14.112695693969727, "sampling/importance_sampling_ratio/max": 1.2103277444839478, "sampling/importance_sampling_ratio/mean": 1.0073938369750977, "sampling/importance_sampling_ratio/min": 0.7692804932594299, "sampling/sampling_logp_difference/max": 0.13658356666564941, "sampling/sampling_logp_difference/mean": 0.0063937013037502766, "step": 119, "step_time": 28.342584406000242 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.009114583488553762, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011718750232830644, "entropy": 0.16604932164773345, "epoch": 0.0024, "grad_norm": 0.23078079521656036, "kl": 0.5974587891250849, "learning_rate": 9.999986933375924e-05, "loss": 0.0105, "step": 120, "step_time": 7.440147934999914 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0014204545877873898, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014204545877873898, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 677.953125, "completions/mean_terminated_length": 677.953125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.13314053160138428, "epoch": 0.00242, "frac_reward_zero_std": 0.0, "grad_norm": 0.2688275873661041, "kl": 0.6813949979841709, "learning_rate": 9.999986620414167e-05, "loss": -0.0055, "num_tokens": 5995970.0, "reward": 4.1811299324035645, "reward_std": 11.76725959777832, "rewards/rollout_reward_func/mean": 4.1811299324035645, "rewards/rollout_reward_func/std": 12.213129997253418, "sampling/importance_sampling_ratio/max": 1.4055489301681519, "sampling/importance_sampling_ratio/mean": 1.0007095336914062, "sampling/importance_sampling_ratio/min": 0.7907775640487671, "sampling/sampling_logp_difference/max": 0.2328205108642578, "sampling/sampling_logp_difference/mean": 0.0057580312713980675, "step": 121, "step_time": 25.73195371799966 }, { "clip_ratio/high_max": 0.03172348579391837, "clip_ratio/high_mean": 0.007930871448479593, "clip_ratio/low_mean": 0.007812500232830644, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015743371681310236, "entropy": 0.12837151251733303, "epoch": 0.00244, "grad_norm": 0.19652943313121796, "kl": 0.6755912862718105, "learning_rate": 9.99998630374873e-05, "loss": -0.0109, "step": 122, "step_time": 7.963045050999881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 676.75, "completions/mean_terminated_length": 676.75, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "entropy": 0.14288373803719878, "epoch": 0.00246, "frac_reward_zero_std": 0.0, "grad_norm": 0.5129408836364746, "kl": 0.6468502469360828, "learning_rate": 9.999985983379613e-05, "loss": -0.002, "num_tokens": 6090409.0, "reward": 5.090976238250732, "reward_std": 8.817068099975586, "rewards/rollout_reward_func/mean": 5.090975761413574, "rewards/rollout_reward_func/std": 9.348170280456543, "sampling/importance_sampling_ratio/max": 1.2873598337173462, "sampling/importance_sampling_ratio/mean": 0.9989021420478821, "sampling/importance_sampling_ratio/min": 0.8453167676925659, "sampling/sampling_logp_difference/max": 0.1934504508972168, "sampling/sampling_logp_difference/mean": 0.0064778015948832035, "step": 123, "step_time": 28.174620942000274 }, { "clip_ratio/high_max": 0.026041667442768812, "clip_ratio/high_mean": 0.006510416860692203, "clip_ratio/low_mean": 0.02367424312978983, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03018465987406671, "entropy": 0.12851850083097816, "epoch": 0.00248, "grad_norm": 0.170461967587471, "kl": 0.6984463054686785, "learning_rate": 9.999985659306817e-05, "loss": -0.0077, "step": 124, "step_time": 6.415902794999965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 823.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 686.765625, "completions/mean_terminated_length": 686.765625, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "entropy": 0.12949980096891522, "epoch": 0.0025, "frac_reward_zero_std": 0.0, "grad_norm": 0.4389054477214813, "kl": 0.8373041488230228, "learning_rate": 9.999985331530339e-05, "loss": -0.0001, "num_tokens": 6185533.0, "reward": 6.523627281188965, "reward_std": 12.731056213378906, "rewards/rollout_reward_func/mean": 6.523627281188965, "rewards/rollout_reward_func/std": 13.220861434936523, "sampling/importance_sampling_ratio/max": 1.4951905012130737, "sampling/importance_sampling_ratio/mean": 1.0012614727020264, "sampling/importance_sampling_ratio/min": 0.7251157760620117, "sampling/sampling_logp_difference/max": 0.39764922857284546, "sampling/sampling_logp_difference/mean": 0.006425045896321535, "step": 125, "step_time": 27.883570014999805 }, { "clip_ratio/high_max": 0.03645833441987634, "clip_ratio/high_mean": 0.009114583604969084, "clip_ratio/low_mean": 0.02854567370377481, "clip_ratio/low_min": 0.0052083334885537624, "clip_ratio/region_mean": 0.03766025695949793, "entropy": 0.11564141698181629, "epoch": 0.00252, "grad_norm": 0.24558016657829285, "kl": 1.0033343844115734, "learning_rate": 9.999985000050182e-05, "loss": -0.0041, "step": 126, "step_time": 6.9546678629999406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.0, "completions/max_length": 837.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 683.515625, "completions/mean_terminated_length": 683.515625, "completions/min_length": 617.0, "completions/min_terminated_length": 617.0, "entropy": 0.10271549178287387, "epoch": 0.00254, "frac_reward_zero_std": 0.0, "grad_norm": 0.48136869072914124, "kl": 0.8534571155905724, "learning_rate": 9.999984664866347e-05, "loss": 0.0132, "num_tokens": 6280443.0, "reward": 4.674668788909912, "reward_std": 11.713541030883789, "rewards/rollout_reward_func/mean": 4.67466926574707, "rewards/rollout_reward_func/std": 13.705061912536621, "sampling/importance_sampling_ratio/max": 1.1515135765075684, "sampling/importance_sampling_ratio/mean": 0.9829530715942383, "sampling/importance_sampling_ratio/min": 0.6125902533531189, "sampling/sampling_logp_difference/max": 0.4248628616333008, "sampling/sampling_logp_difference/mean": 0.00649910606443882, "step": 127, "step_time": 27.126788691001366 }, { "clip_ratio/high_max": 0.03645833441987634, "clip_ratio/high_mean": 0.009114583604969084, "clip_ratio/low_mean": 0.015625000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02473958395421505, "entropy": 0.10252567520365119, "epoch": 0.00256, "grad_norm": 0.25049537420272827, "kl": 0.9629664830863476, "learning_rate": 9.999984325978833e-05, "loss": 0.0108, "step": 128, "step_time": 7.304040701002123 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020833334419876337, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 959.40625, "completions/mean_terminated_length": 959.40625, "completions/min_length": 910.0, "completions/min_terminated_length": 910.0, "entropy": 0.13085902528837323, "epoch": 0.00258, "frac_reward_zero_std": 0.0, "grad_norm": 0.6536189913749695, "kl": 0.8716370463371277, "learning_rate": 9.99998398338764e-05, "loss": 0.0229, "num_tokens": 6393090.0, "reward": 5.6695556640625, "reward_std": 11.05074405670166, "rewards/rollout_reward_func/mean": 5.669555187225342, "rewards/rollout_reward_func/std": 12.366477966308594, "sampling/importance_sampling_ratio/max": 1.2895963191986084, "sampling/importance_sampling_ratio/mean": 1.023085355758667, "sampling/importance_sampling_ratio/min": 0.7725162506103516, "sampling/sampling_logp_difference/max": 0.30040407180786133, "sampling/sampling_logp_difference/mean": 0.008372966200113297, "step": 129, "step_time": 33.28244163999989 }, { "clip_ratio/high_max": 0.07559524197131395, "clip_ratio/high_mean": 0.028273811331018806, "clip_ratio/low_mean": 0.02299107296857983, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.051264884416013956, "entropy": 0.14083249866962433, "epoch": 0.0026, "grad_norm": 0.30618196725845337, "kl": 0.9511819295585155, "learning_rate": 9.999983637092769e-05, "loss": 0.0154, "step": 130, "step_time": 8.346246693000012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1034.0, "completions/max_terminated_length": 1034.0, "completions/mean_length": 954.09375, "completions/mean_terminated_length": 954.09375, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "entropy": 0.1475105220451951, "epoch": 0.00262, "frac_reward_zero_std": 0.0, "grad_norm": 0.5288283228874207, "kl": 0.912773609161377, "learning_rate": 9.999983287094222e-05, "loss": -0.0212, "num_tokens": 6505385.0, "reward": 7.057158470153809, "reward_std": 10.295648574829102, "rewards/rollout_reward_func/mean": 7.057158470153809, "rewards/rollout_reward_func/std": 10.425559997558594, "sampling/importance_sampling_ratio/max": 1.3561307191848755, "sampling/importance_sampling_ratio/mean": 0.9847082495689392, "sampling/importance_sampling_ratio/min": 0.6632312536239624, "sampling/sampling_logp_difference/max": 0.22558808326721191, "sampling/sampling_logp_difference/mean": 0.00731184845790267, "step": 131, "step_time": 33.95278312799974 }, { "clip_ratio/high_max": 0.06726190773770213, "clip_ratio/high_mean": 0.02313988225068897, "clip_ratio/low_mean": 0.03020833560731262, "clip_ratio/low_min": 0.004166666883975267, "clip_ratio/region_mean": 0.0533482184400782, "entropy": 0.16657310537993908, "epoch": 0.00264, "grad_norm": 0.29056552052497864, "kl": 0.7771002501249313, "learning_rate": 9.999982933391997e-05, "loss": -0.0284, "step": 132, "step_time": 7.300914149999244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250001629814506, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250001629814506, "completions/clipped_ratio": 0.0, "completions/max_length": 1041.0, "completions/max_terminated_length": 1041.0, "completions/mean_length": 929.96875, "completions/mean_terminated_length": 929.96875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.18474403023719788, "epoch": 0.00266, "frac_reward_zero_std": 0.0, "grad_norm": 0.6026266813278198, "kl": 0.7070890348404646, "learning_rate": 9.999982575986094e-05, "loss": -0.0, "num_tokens": 6616176.0, "reward": 3.4366226196289062, "reward_std": 14.906189918518066, "rewards/rollout_reward_func/mean": 3.436622381210327, "rewards/rollout_reward_func/std": 16.053083419799805, "sampling/importance_sampling_ratio/max": 1.3064157962799072, "sampling/importance_sampling_ratio/mean": 1.0054032802581787, "sampling/importance_sampling_ratio/min": 0.5862367749214172, "sampling/sampling_logp_difference/max": 0.5461184978485107, "sampling/sampling_logp_difference/mean": 0.010052897036075592, "step": 133, "step_time": 32.881761665999875 }, { "clip_ratio/high_max": 0.06369047937914729, "clip_ratio/high_mean": 0.022172620403580368, "clip_ratio/low_mean": 0.03557477821595967, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05774739931803197, "entropy": 0.18857589829713106, "epoch": 0.00268, "grad_norm": 0.2586621344089508, "kl": 0.8268643505871296, "learning_rate": 9.999982214876515e-05, "loss": -0.0078, "step": 134, "step_time": 7.700307692000479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1035.0, "completions/max_terminated_length": 1035.0, "completions/mean_length": 949.171875, "completions/mean_terminated_length": 949.171875, "completions/min_length": 820.0, "completions/min_terminated_length": 820.0, "entropy": 0.21391641069203615, "epoch": 0.0027, "frac_reward_zero_std": 0.0, "grad_norm": 0.5073988437652588, "kl": 0.669338870793581, "learning_rate": 9.999981850063262e-05, "loss": -0.0078, "num_tokens": 6728116.0, "reward": 5.17537784576416, "reward_std": 13.093953132629395, "rewards/rollout_reward_func/mean": 5.175378322601318, "rewards/rollout_reward_func/std": 13.309264183044434, "sampling/importance_sampling_ratio/max": 1.3000229597091675, "sampling/importance_sampling_ratio/mean": 0.9869031310081482, "sampling/importance_sampling_ratio/min": 0.7261144518852234, "sampling/sampling_logp_difference/max": 0.1514453887939453, "sampling/sampling_logp_difference/mean": 0.008218428120017052, "step": 135, "step_time": 32.26767973099959 }, { "clip_ratio/high_max": 0.06815476482734084, "clip_ratio/high_mean": 0.022321430151350796, "clip_ratio/low_mean": 0.04136904957704246, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06369047961197793, "entropy": 0.21292453352361917, "epoch": 0.00272, "grad_norm": 0.3758169710636139, "kl": 0.6697604712098837, "learning_rate": 9.99998148154633e-05, "loss": -0.0147, "step": 136, "step_time": 8.984081079998305 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010416667209938169, "completions/clipped_ratio": 0.0, "completions/max_length": 1033.0, "completions/max_terminated_length": 1033.0, "completions/mean_length": 941.34375, "completions/mean_terminated_length": 941.34375, "completions/min_length": 618.0, "completions/min_terminated_length": 618.0, "entropy": 0.23286819364875555, "epoch": 0.00274, "frac_reward_zero_std": 0.0, "grad_norm": 0.6691973805427551, "kl": 0.6865591164678335, "learning_rate": 9.999981109325724e-05, "loss": 0.0217, "num_tokens": 6839571.0, "reward": 4.762706756591797, "reward_std": 11.42410659790039, "rewards/rollout_reward_func/mean": 4.762706756591797, "rewards/rollout_reward_func/std": 11.434100151062012, "sampling/importance_sampling_ratio/max": 1.5375083684921265, "sampling/importance_sampling_ratio/mean": 1.0140312910079956, "sampling/importance_sampling_ratio/min": 0.691352128982544, "sampling/sampling_logp_difference/max": 0.24680709838867188, "sampling/sampling_logp_difference/mean": 0.010121582075953484, "step": 137, "step_time": 31.738008715999968 }, { "clip_ratio/high_max": 0.07113095559179783, "clip_ratio/high_mean": 0.022098215762525797, "clip_ratio/low_mean": 0.0486922818236053, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.07079049723688513, "entropy": 0.2127716289833188, "epoch": 0.00276, "grad_norm": 0.30709579586982727, "kl": 0.6930392682552338, "learning_rate": 9.999980733401442e-05, "loss": 0.0087, "step": 138, "step_time": 8.016800426000827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1036.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 978.6875, "completions/mean_terminated_length": 978.6875, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "entropy": 0.19525799248367548, "epoch": 0.00278, "frac_reward_zero_std": 0.0, "grad_norm": 0.7576553225517273, "kl": 0.7210239768028259, "learning_rate": 9.999980353773486e-05, "loss": 0.0087, "num_tokens": 6953628.0, "reward": 7.9931640625, "reward_std": 14.572214126586914, "rewards/rollout_reward_func/mean": 7.993164539337158, "rewards/rollout_reward_func/std": 15.543896675109863, "sampling/importance_sampling_ratio/max": 1.4368523359298706, "sampling/importance_sampling_ratio/mean": 1.020465612411499, "sampling/importance_sampling_ratio/min": 0.6616964340209961, "sampling/sampling_logp_difference/max": 0.3545997142791748, "sampling/sampling_logp_difference/mean": 0.009700989350676537, "step": 139, "step_time": 31.6874715999993 }, { "clip_ratio/high_max": 0.07712912419810891, "clip_ratio/high_mean": 0.02553228137549013, "clip_ratio/low_mean": 0.053521828493103385, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.07905411045067012, "entropy": 0.1893756091594696, "epoch": 0.0028, "grad_norm": 0.31711098551750183, "kl": 0.786970479413867, "learning_rate": 9.999979970441856e-05, "loss": -0.0032, "step": 140, "step_time": 8.092264081999474 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.0020833334419876337, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020833334419876337, "completions/clipped_ratio": 0.0, "completions/max_length": 1037.0, "completions/max_terminated_length": 1037.0, "completions/mean_length": 955.375, "completions/mean_terminated_length": 955.375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.17851338349282742, "epoch": 0.00282, "frac_reward_zero_std": 0.0, "grad_norm": 0.5795699954032898, "kl": 0.7661695275455713, "learning_rate": 9.999979583406551e-05, "loss": -0.0028, "num_tokens": 7066060.0, "reward": 5.970464706420898, "reward_std": 14.057101249694824, "rewards/rollout_reward_func/mean": 5.970464706420898, "rewards/rollout_reward_func/std": 15.589529991149902, "sampling/importance_sampling_ratio/max": 1.2306140661239624, "sampling/importance_sampling_ratio/mean": 1.0006752014160156, "sampling/importance_sampling_ratio/min": 0.7063568830490112, "sampling/sampling_logp_difference/max": 0.24321842193603516, "sampling/sampling_logp_difference/mean": 0.008507179096341133, "step": 141, "step_time": 31.519457149999653 }, { "clip_ratio/high_max": 0.10007440904155374, "clip_ratio/high_mean": 0.03335193661041558, "clip_ratio/low_mean": 0.04136905015911907, "clip_ratio/low_min": 0.004166666883975267, "clip_ratio/region_mean": 0.07472098711878061, "entropy": 0.16169621469452977, "epoch": 0.00284, "grad_norm": 0.21583755314350128, "kl": 0.8030649330466986, "learning_rate": 9.999979192667573e-05, "loss": -0.0127, "step": 142, "step_time": 8.37791394099986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1048.0, "completions/max_terminated_length": 1048.0, "completions/mean_length": 965.875, "completions/mean_terminated_length": 965.875, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "entropy": 0.1351936119608581, "epoch": 0.00286, "frac_reward_zero_std": 0.0, "grad_norm": 0.6277215480804443, "kl": 0.6056302916258574, "learning_rate": 9.999978798224921e-05, "loss": -0.0037, "num_tokens": 7179154.0, "reward": 7.006319046020508, "reward_std": 16.71393394470215, "rewards/rollout_reward_func/mean": 7.006319522857666, "rewards/rollout_reward_func/std": 17.009944915771484, "sampling/importance_sampling_ratio/max": 1.4720120429992676, "sampling/importance_sampling_ratio/mean": 1.0313916206359863, "sampling/importance_sampling_ratio/min": 0.8535375595092773, "sampling/sampling_logp_difference/max": 0.33231019973754883, "sampling/sampling_logp_difference/mean": 0.007416378241032362, "step": 143, "step_time": 31.253298032000657 }, { "clip_ratio/high_max": 0.03363095410168171, "clip_ratio/high_mean": 0.010565476841293275, "clip_ratio/low_mean": 0.022564054117538035, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03312953084241599, "entropy": 0.12869372498244047, "epoch": 0.00288, "grad_norm": 0.3357137143611908, "kl": 0.6441880892962217, "learning_rate": 9.999978400078598e-05, "loss": -0.011, "step": 144, "step_time": 8.612604698998894 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020833334419876337, "completions/clipped_ratio": 0.0, "completions/max_length": 1033.0, "completions/max_terminated_length": 1033.0, "completions/mean_length": 981.234375, "completions/mean_terminated_length": 981.234375, "completions/min_length": 922.0, "completions/min_terminated_length": 922.0, "entropy": 0.14574182452633977, "epoch": 0.0029, "frac_reward_zero_std": 0.0, "grad_norm": 0.6256417632102966, "kl": 0.680026089772582, "learning_rate": 9.9999779982286e-05, "loss": 0.0066, "num_tokens": 7293276.0, "reward": 9.345416069030762, "reward_std": 12.761893272399902, "rewards/rollout_reward_func/mean": 9.345417022705078, "rewards/rollout_reward_func/std": 14.231216430664062, "sampling/importance_sampling_ratio/max": 1.2866383790969849, "sampling/importance_sampling_ratio/mean": 0.9946876764297485, "sampling/importance_sampling_ratio/min": 0.7063043117523193, "sampling/sampling_logp_difference/max": 0.3008323907852173, "sampling/sampling_logp_difference/mean": 0.007616790477186441, "step": 145, "step_time": 31.996783762999257 }, { "clip_ratio/high_max": 0.05424107378348708, "clip_ratio/high_mean": 0.020851935259997845, "clip_ratio/low_mean": 0.03377976384945214, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05463169957511127, "entropy": 0.14952234365046024, "epoch": 0.00292, "grad_norm": 0.3231852948665619, "kl": 0.7531629204750061, "learning_rate": 9.999977592674931e-05, "loss": -0.0032, "step": 146, "step_time": 8.073437064001155 }, { "clip_ratio/high_max": 0.012500000651925802, "clip_ratio/high_mean": 0.0031250001629814506, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250001629814506, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 951.1875, "completions/mean_terminated_length": 951.1875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.14304543379694223, "epoch": 0.00294, "frac_reward_zero_std": 0.0, "grad_norm": 0.4544009566307068, "kl": 0.6561761032789946, "learning_rate": 9.999977183417592e-05, "loss": -0.0136, "num_tokens": 7405394.0, "reward": 9.592363357543945, "reward_std": 11.82339859008789, "rewards/rollout_reward_func/mean": 9.592363357543945, "rewards/rollout_reward_func/std": 12.213863372802734, "sampling/importance_sampling_ratio/max": 1.3994261026382446, "sampling/importance_sampling_ratio/mean": 0.9877851009368896, "sampling/importance_sampling_ratio/min": 0.5693183541297913, "sampling/sampling_logp_difference/max": 0.5401673913002014, "sampling/sampling_logp_difference/mean": 0.007635599002242088, "step": 147, "step_time": 31.870756492000055 }, { "clip_ratio/high_max": 0.054166669491678476, "clip_ratio/high_mean": 0.013541667372919619, "clip_ratio/low_mean": 0.036681550089269876, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05022321757860482, "entropy": 0.14903255039826035, "epoch": 0.00296, "grad_norm": 0.34076768159866333, "kl": 0.6760309524834156, "learning_rate": 9.99997677045658e-05, "loss": -0.0174, "step": 148, "step_time": 8.03263958799971 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.0020833334419876337, "clip_ratio/low_mean": 0.0022435898426920176, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004326923284679651, "completions/clipped_ratio": 0.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 950.578125, "completions/mean_terminated_length": 950.578125, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "entropy": 0.16968106850981712, "epoch": 0.00298, "frac_reward_zero_std": 0.0, "grad_norm": 0.5638662576675415, "kl": 0.6232388503849506, "learning_rate": 9.999976353791898e-05, "loss": -0.0115, "num_tokens": 7517436.0, "reward": 6.506036281585693, "reward_std": 12.593399047851562, "rewards/rollout_reward_func/mean": 6.506035804748535, "rewards/rollout_reward_func/std": 13.552786827087402, "sampling/importance_sampling_ratio/max": 1.6476225852966309, "sampling/importance_sampling_ratio/mean": 0.9991188645362854, "sampling/importance_sampling_ratio/min": 0.5213066935539246, "sampling/sampling_logp_difference/max": 0.576519250869751, "sampling/sampling_logp_difference/mean": 0.01059242058545351, "step": 149, "step_time": 30.528242389000752 }, { "clip_ratio/high_max": 0.05000000260770321, "clip_ratio/high_mean": 0.01458333432674408, "clip_ratio/low_mean": 0.03889938397333026, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05348271911498159, "entropy": 0.17780038248747587, "epoch": 0.003, "grad_norm": 0.5385463833808899, "kl": 0.8597960155457258, "learning_rate": 9.999975933423545e-05, "loss": -0.0172, "step": 150, "step_time": 8.0192518380004 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.002157738199457526, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003199404920451343, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 953.953125, "completions/mean_terminated_length": 953.953125, "completions/min_length": 664.0, "completions/min_terminated_length": 664.0, "entropy": 0.1825277367606759, "epoch": 0.00302, "frac_reward_zero_std": 0.0, "grad_norm": 0.6920294165611267, "kl": 0.6721424907445908, "learning_rate": 9.999975509351522e-05, "loss": -0.0165, "num_tokens": 7629697.0, "reward": 6.279596328735352, "reward_std": 13.454200744628906, "rewards/rollout_reward_func/mean": 6.279596328735352, "rewards/rollout_reward_func/std": 15.490900039672852, "sampling/importance_sampling_ratio/max": 1.2544176578521729, "sampling/importance_sampling_ratio/mean": 0.9968298673629761, "sampling/importance_sampling_ratio/min": 0.5891286730766296, "sampling/sampling_logp_difference/max": 0.36822509765625, "sampling/sampling_logp_difference/mean": 0.009644769132137299, "step": 151, "step_time": 30.041253716999563 }, { "clip_ratio/high_max": 0.06250000279396772, "clip_ratio/high_mean": 0.02187500149011612, "clip_ratio/low_mean": 0.027847783756442368, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0497227858286351, "entropy": 0.19313342962414026, "epoch": 0.00304, "grad_norm": 0.3150973320007324, "kl": 0.6543413959443569, "learning_rate": 9.99997508157583e-05, "loss": -0.0263, "step": 152, "step_time": 8.048088266000377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 933.640625, "completions/mean_terminated_length": 933.640625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.1851256461814046, "epoch": 0.00306, "frac_reward_zero_std": 0.0, "grad_norm": 0.6191554665565491, "kl": 0.5646015591919422, "learning_rate": 9.999974650096467e-05, "loss": -0.0157, "num_tokens": 7740640.0, "reward": 7.951285362243652, "reward_std": 13.322220802307129, "rewards/rollout_reward_func/mean": 7.951285362243652, "rewards/rollout_reward_func/std": 15.29836654663086, "sampling/importance_sampling_ratio/max": 1.1902070045471191, "sampling/importance_sampling_ratio/mean": 0.9911805987358093, "sampling/importance_sampling_ratio/min": 0.6955353617668152, "sampling/sampling_logp_difference/max": 0.37529921531677246, "sampling/sampling_logp_difference/mean": 0.007848689332604408, "step": 153, "step_time": 30.541750664000574 }, { "clip_ratio/high_max": 0.04301470750942826, "clip_ratio/high_mean": 0.013878677156753838, "clip_ratio/low_mean": 0.039536832249723375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.053415509522892535, "entropy": 0.16637779865413904, "epoch": 0.00308, "grad_norm": 0.3494158089160919, "kl": 0.6059492044150829, "learning_rate": 9.999974214913437e-05, "loss": -0.0231, "step": 154, "step_time": 8.139173758999277 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020833334419876337, "completions/clipped_ratio": 0.0, "completions/max_length": 1041.0, "completions/max_terminated_length": 1041.0, "completions/mean_length": 972.640625, "completions/mean_terminated_length": 972.640625, "completions/min_length": 935.0, "completions/min_terminated_length": 935.0, "entropy": 0.1503364727832377, "epoch": 0.0031, "frac_reward_zero_std": 0.0, "grad_norm": 0.6759209036827087, "kl": 0.6219805851578712, "learning_rate": 9.999973776026739e-05, "loss": 0.0152, "num_tokens": 7854154.0, "reward": 5.902735710144043, "reward_std": 12.42209243774414, "rewards/rollout_reward_func/mean": 5.902735710144043, "rewards/rollout_reward_func/std": 12.867145538330078, "sampling/importance_sampling_ratio/max": 1.4259474277496338, "sampling/importance_sampling_ratio/mean": 1.0006431341171265, "sampling/importance_sampling_ratio/min": 0.6987265348434448, "sampling/sampling_logp_difference/max": 0.35797882080078125, "sampling/sampling_logp_difference/mean": 0.008803295902907848, "step": 155, "step_time": 31.54653142600091 }, { "clip_ratio/high_max": 0.054464288521558046, "clip_ratio/high_mean": 0.018824405618943274, "clip_ratio/low_mean": 0.0364583358168602, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05528274178504944, "entropy": 0.1241895561106503, "epoch": 0.00312, "grad_norm": 0.955508828163147, "kl": 0.9998617265373468, "learning_rate": 9.999973333436372e-05, "loss": 0.017, "step": 156, "step_time": 7.910055370999544 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.0006127451197244227, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016544118407182395, "completions/clipped_ratio": 0.0, "completions/max_length": 1040.0, "completions/max_terminated_length": 1040.0, "completions/mean_length": 972.546875, "completions/mean_terminated_length": 972.546875, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "entropy": 0.11084589222446084, "epoch": 0.00314, "frac_reward_zero_std": 0.0, "grad_norm": 0.9052144885063171, "kl": 0.9162529278546572, "learning_rate": 9.999972887142338e-05, "loss": 0.0236, "num_tokens": 7967770.0, "reward": 10.1655855178833, "reward_std": 15.845230102539062, "rewards/rollout_reward_func/mean": 10.1655855178833, "rewards/rollout_reward_func/std": 17.717178344726562, "sampling/importance_sampling_ratio/max": 1.5550763607025146, "sampling/importance_sampling_ratio/mean": 1.0152667760849, "sampling/importance_sampling_ratio/min": 0.6825421452522278, "sampling/sampling_logp_difference/max": 0.38708627223968506, "sampling/sampling_logp_difference/mean": 0.006948791444301605, "step": 157, "step_time": 30.977979516999312 }, { "clip_ratio/high_max": 0.041964287869632244, "clip_ratio/high_mean": 0.013616072130389512, "clip_ratio/low_mean": 0.019929535686969757, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03354560805018991, "entropy": 0.11029910668730736, "epoch": 0.00316, "grad_norm": 0.3586527705192566, "kl": 0.996163547039032, "learning_rate": 9.999972437144637e-05, "loss": 0.018, "step": 158, "step_time": 8.73399685899949 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010416667209938169, "completions/clipped_ratio": 0.0, "completions/max_length": 1026.0, "completions/max_terminated_length": 1026.0, "completions/mean_length": 954.4375, "completions/mean_terminated_length": 954.4375, "completions/min_length": 686.0, "completions/min_terminated_length": 686.0, "entropy": 0.14973071590065956, "epoch": 0.00318, "frac_reward_zero_std": 0.0, "grad_norm": 0.7992783784866333, "kl": 0.5131530929356813, "learning_rate": 9.999971983443269e-05, "loss": -0.0019, "num_tokens": 8080082.0, "reward": 5.8201141357421875, "reward_std": 11.146739959716797, "rewards/rollout_reward_func/mean": 5.8201141357421875, "rewards/rollout_reward_func/std": 11.795808792114258, "sampling/importance_sampling_ratio/max": 1.2158492803573608, "sampling/importance_sampling_ratio/mean": 0.9923404455184937, "sampling/importance_sampling_ratio/min": 0.623603343963623, "sampling/sampling_logp_difference/max": 0.24274826049804688, "sampling/sampling_logp_difference/mean": 0.007134515792131424, "step": 159, "step_time": 31.143712819999564 }, { "clip_ratio/high_max": 0.06250000232830644, "clip_ratio/high_mean": 0.017708334140479565, "clip_ratio/low_mean": 0.028382036020047963, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046090369927696884, "entropy": 0.15424074092879891, "epoch": 0.0032, "grad_norm": 0.4114607274532318, "kl": 0.5258241277188063, "learning_rate": 9.999971526038235e-05, "loss": -0.0105, "step": 160, "step_time": 7.376053459000104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0020833334419876337, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020833334419876337, "completions/clipped_ratio": 0.0, "completions/max_length": 1066.0, "completions/max_terminated_length": 1066.0, "completions/mean_length": 964.390625, "completions/mean_terminated_length": 964.390625, "completions/min_length": 795.0, "completions/min_terminated_length": 795.0, "entropy": 0.14214739575982094, "epoch": 0.00322, "frac_reward_zero_std": 0.0, "grad_norm": 0.6593955159187317, "kl": 0.7137422636151314, "learning_rate": 9.999971064929537e-05, "loss": 0.0221, "num_tokens": 8193063.0, "reward": 7.681003093719482, "reward_std": 11.441247940063477, "rewards/rollout_reward_func/mean": 7.681002616882324, "rewards/rollout_reward_func/std": 13.56708812713623, "sampling/importance_sampling_ratio/max": 1.4164402484893799, "sampling/importance_sampling_ratio/mean": 1.0107839107513428, "sampling/importance_sampling_ratio/min": 0.6920035481452942, "sampling/sampling_logp_difference/max": 0.3535594940185547, "sampling/sampling_logp_difference/mean": 0.007559535559266806, "step": 161, "step_time": 32.16549203100021 }, { "clip_ratio/high_max": 0.045833335258066654, "clip_ratio/high_mean": 0.014657739084213972, "clip_ratio/low_mean": 0.033670345321297646, "clip_ratio/low_min": 0.004166666883975267, "clip_ratio/region_mean": 0.04832808405626565, "entropy": 0.1284659137018025, "epoch": 0.00324, "grad_norm": 0.44948309659957886, "kl": 0.8788620755076408, "learning_rate": 9.999970600117172e-05, "loss": 0.0155, "step": 162, "step_time": 8.349364119001166 }, { "clip_ratio/high_max": 0.012500000651925802, "clip_ratio/high_mean": 0.0031250001629814506, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "completions/clipped_ratio": 0.0, "completions/max_length": 1028.0, "completions/max_terminated_length": 1028.0, "completions/mean_length": 958.1875, "completions/mean_terminated_length": 958.1875, "completions/min_length": 688.0, "completions/min_terminated_length": 688.0, "entropy": 0.1297779600135982, "epoch": 0.00326, "frac_reward_zero_std": 0.0, "grad_norm": 0.45152774453163147, "kl": 0.6032252982258797, "learning_rate": 9.999970131601142e-05, "loss": -0.007, "num_tokens": 8305653.0, "reward": 9.560303688049316, "reward_std": 12.965145111083984, "rewards/rollout_reward_func/mean": 9.560302734375, "rewards/rollout_reward_func/std": 13.572053909301758, "sampling/importance_sampling_ratio/max": 1.3970085382461548, "sampling/importance_sampling_ratio/mean": 0.9942675828933716, "sampling/importance_sampling_ratio/min": 0.5912600755691528, "sampling/sampling_logp_difference/max": 0.43671131134033203, "sampling/sampling_logp_difference/mean": 0.006968793459236622, "step": 163, "step_time": 29.62484441499919 }, { "clip_ratio/high_max": 0.04534313944168389, "clip_ratio/high_mean": 0.013419118302408606, "clip_ratio/low_mean": 0.028385418467223644, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04180453671142459, "entropy": 0.12798475893214345, "epoch": 0.00328, "grad_norm": 0.37086573243141174, "kl": 0.5329502020031214, "learning_rate": 9.99996965938145e-05, "loss": -0.0114, "step": 164, "step_time": 9.19148286500058 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020833334419876337, "completions/clipped_ratio": 0.0, "completions/max_length": 1068.0, "completions/max_terminated_length": 1068.0, "completions/mean_length": 958.28125, "completions/mean_terminated_length": 958.28125, "completions/min_length": 714.0, "completions/min_terminated_length": 714.0, "entropy": 0.14121837774291635, "epoch": 0.0033, "frac_reward_zero_std": 0.0, "grad_norm": 0.445486456155777, "kl": 0.6868670284748077, "learning_rate": 9.999969183458092e-05, "loss": 0.017, "num_tokens": 8418180.0, "reward": 6.036255836486816, "reward_std": 14.006401062011719, "rewards/rollout_reward_func/mean": 6.036255836486816, "rewards/rollout_reward_func/std": 15.667006492614746, "sampling/importance_sampling_ratio/max": 1.4084051847457886, "sampling/importance_sampling_ratio/mean": 0.9844825267791748, "sampling/importance_sampling_ratio/min": 0.6458684802055359, "sampling/sampling_logp_difference/max": 0.35437726974487305, "sampling/sampling_logp_difference/mean": 0.008984029293060303, "step": 165, "step_time": 30.86212910500126 }, { "clip_ratio/high_max": 0.041964287869632244, "clip_ratio/high_mean": 0.012574405525811017, "clip_ratio/low_mean": 0.02604166802484542, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.038616073317825794, "entropy": 0.13694474566727877, "epoch": 0.00332, "grad_norm": 0.2597510814666748, "kl": 0.670884259045124, "learning_rate": 9.999968703831071e-05, "loss": 0.012, "step": 166, "step_time": 8.765868728999521 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.0020833334419876337, "clip_ratio/low_mean": 0.0020833334419876337, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "completions/clipped_ratio": 0.0, "completions/max_length": 1028.0, "completions/max_terminated_length": 1028.0, "completions/mean_length": 964.015625, "completions/mean_terminated_length": 964.015625, "completions/min_length": 773.0, "completions/min_terminated_length": 773.0, "entropy": 0.13714495720341802, "epoch": 0.00334, "frac_reward_zero_std": 0.0, "grad_norm": 0.742760181427002, "kl": 0.5935596115887165, "learning_rate": 9.999968220500386e-05, "loss": 0.0264, "num_tokens": 8531148.0, "reward": 6.6519269943237305, "reward_std": 14.873868942260742, "rewards/rollout_reward_func/mean": 6.6519269943237305, "rewards/rollout_reward_func/std": 15.216424942016602, "sampling/importance_sampling_ratio/max": 1.4992643594741821, "sampling/importance_sampling_ratio/mean": 1.0216107368469238, "sampling/importance_sampling_ratio/min": 0.7036370635032654, "sampling/sampling_logp_difference/max": 0.351947546005249, "sampling/sampling_logp_difference/mean": 0.008944995701313019, "step": 167, "step_time": 30.057006109999747 }, { "clip_ratio/high_max": 0.03750000195577741, "clip_ratio/high_mean": 0.013541667489334941, "clip_ratio/low_mean": 0.03437500225845724, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04791666998062283, "entropy": 0.13065697345882654, "epoch": 0.00336, "grad_norm": 8.381538391113281, "kl": 7.166379388421774, "learning_rate": 9.999967733466041e-05, "loss": 0.0808, "step": 168, "step_time": 8.213664751000124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010416667209938169, "completions/clipped_ratio": 0.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 943.078125, "completions/mean_terminated_length": 943.078125, "completions/min_length": 868.0, "completions/min_terminated_length": 868.0, "entropy": 0.13596792286261916, "epoch": 0.00338, "frac_reward_zero_std": 0.0, "grad_norm": 0.6320606470108032, "kl": 0.5089176166802645, "learning_rate": 9.999967242728034e-05, "loss": -0.0005, "num_tokens": 8642652.0, "reward": 9.83786392211914, "reward_std": 12.724628448486328, "rewards/rollout_reward_func/mean": 9.83786392211914, "rewards/rollout_reward_func/std": 13.589927673339844, "sampling/importance_sampling_ratio/max": 1.5156316757202148, "sampling/importance_sampling_ratio/mean": 1.001371145248413, "sampling/importance_sampling_ratio/min": 0.75341796875, "sampling/sampling_logp_difference/max": 0.40897202491760254, "sampling/sampling_logp_difference/mean": 0.006749385967850685, "step": 169, "step_time": 30.052868118000788 }, { "clip_ratio/high_max": 0.020833334419876337, "clip_ratio/high_mean": 0.007291667046956718, "clip_ratio/low_mean": 0.03333333553746343, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04062500281725079, "entropy": 0.13328771898522973, "epoch": 0.0034, "grad_norm": 0.27786943316459656, "kl": 0.5417735707014799, "learning_rate": 9.999966748286363e-05, "loss": -0.004, "step": 170, "step_time": 7.808134698000686 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010416667209938169, "completions/clipped_ratio": 0.0, "completions/max_length": 1033.0, "completions/max_terminated_length": 1033.0, "completions/mean_length": 977.46875, "completions/mean_terminated_length": 977.46875, "completions/min_length": 890.0, "completions/min_terminated_length": 890.0, "entropy": 0.14305478753522038, "epoch": 0.00342, "frac_reward_zero_std": 0.0, "grad_norm": 0.477180153131485, "kl": 0.9006227869540453, "learning_rate": 9.999966250141033e-05, "loss": -0.016, "num_tokens": 8756508.0, "reward": 9.534229278564453, "reward_std": 10.647237777709961, "rewards/rollout_reward_func/mean": 9.534229278564453, "rewards/rollout_reward_func/std": 11.566615104675293, "sampling/importance_sampling_ratio/max": 1.4990143775939941, "sampling/importance_sampling_ratio/mean": 1.0070048570632935, "sampling/importance_sampling_ratio/min": 0.6254692077636719, "sampling/sampling_logp_difference/max": 0.4892125129699707, "sampling/sampling_logp_difference/mean": 0.008062894456088543, "step": 171, "step_time": 29.967204156000207 }, { "clip_ratio/high_max": 0.03333333507180214, "clip_ratio/high_mean": 0.009375000605359674, "clip_ratio/low_mean": 0.03333333553746343, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.042708336492069066, "entropy": 0.13219841895624995, "epoch": 0.00344, "grad_norm": 0.2979583740234375, "kl": 0.9737532902508974, "learning_rate": 9.999965748292042e-05, "loss": -0.0247, "step": 172, "step_time": 8.450734508001005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250001629814506, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250001629814506, "completions/clipped_ratio": 0.0, "completions/max_length": 1043.0, "completions/max_terminated_length": 1043.0, "completions/mean_length": 977.578125, "completions/mean_terminated_length": 977.578125, "completions/min_length": 911.0, "completions/min_terminated_length": 911.0, "entropy": 0.13215081067755818, "epoch": 0.00346, "frac_reward_zero_std": 0.0, "grad_norm": 0.570915699005127, "kl": 0.7707110401242971, "learning_rate": 9.999965242739393e-05, "loss": 0.0115, "num_tokens": 8870395.0, "reward": 7.963113784790039, "reward_std": 12.185734748840332, "rewards/rollout_reward_func/mean": 7.963113784790039, "rewards/rollout_reward_func/std": 12.419037818908691, "sampling/importance_sampling_ratio/max": 1.2637660503387451, "sampling/importance_sampling_ratio/mean": 0.9871397614479065, "sampling/importance_sampling_ratio/min": 0.6115806102752686, "sampling/sampling_logp_difference/max": 0.3316690921783447, "sampling/sampling_logp_difference/mean": 0.0069004204124212265, "step": 173, "step_time": 29.865270385998883 }, { "clip_ratio/high_max": 0.05000000214204192, "clip_ratio/high_mean": 0.013541667256504297, "clip_ratio/low_mean": 0.025976563920266926, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03951823094394058, "entropy": 0.1266618687659502, "epoch": 0.00348, "grad_norm": 0.3126421570777893, "kl": 0.7724483050405979, "learning_rate": 9.999964733483083e-05, "loss": 0.0074, "step": 174, "step_time": 8.14785716599863 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020833334419876337, "completions/clipped_ratio": 0.0, "completions/max_length": 1040.0, "completions/max_terminated_length": 1040.0, "completions/mean_length": 986.78125, "completions/mean_terminated_length": 986.78125, "completions/min_length": 814.0, "completions/min_terminated_length": 814.0, "entropy": 0.12785040121525526, "epoch": 0.0035, "frac_reward_zero_std": 0.0, "grad_norm": 0.4320923089981079, "kl": 0.5314337890595198, "learning_rate": 9.999964220523112e-05, "loss": 0.0134, "num_tokens": 8984945.0, "reward": 11.988597869873047, "reward_std": 11.876688957214355, "rewards/rollout_reward_func/mean": 11.988597869873047, "rewards/rollout_reward_func/std": 12.529437065124512, "sampling/importance_sampling_ratio/max": 1.5641355514526367, "sampling/importance_sampling_ratio/mean": 1.0155951976776123, "sampling/importance_sampling_ratio/min": 0.7307262420654297, "sampling/sampling_logp_difference/max": 0.28014975786209106, "sampling/sampling_logp_difference/mean": 0.006405924912542105, "step": 175, "step_time": 30.800439373998415 }, { "clip_ratio/high_max": 0.025000001303851604, "clip_ratio/high_mean": 0.008333333767950535, "clip_ratio/low_mean": 0.015625001047737896, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023958335164934397, "entropy": 0.12503943219780922, "epoch": 0.00352, "grad_norm": 0.25414347648620605, "kl": 0.545308168977499, "learning_rate": 9.999963703859485e-05, "loss": 0.0068, "step": 176, "step_time": 8.294947108000088 }, { "clip_ratio/high_max": 0.012500000651925802, "clip_ratio/high_mean": 0.0031250001629814506, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "completions/clipped_ratio": 0.0, "completions/max_length": 1032.0, "completions/max_terminated_length": 1032.0, "completions/mean_length": 947.78125, "completions/mean_terminated_length": 947.78125, "completions/min_length": 877.0, "completions/min_terminated_length": 877.0, "entropy": 0.11795077985152602, "epoch": 0.00354, "frac_reward_zero_std": 0.0, "grad_norm": 0.5428488850593567, "kl": 0.5484364293515682, "learning_rate": 9.9999631834922e-05, "loss": 0.0209, "num_tokens": 9096764.0, "reward": 7.462541580200195, "reward_std": 9.003820419311523, "rewards/rollout_reward_func/mean": 7.462541103363037, "rewards/rollout_reward_func/std": 9.709749221801758, "sampling/importance_sampling_ratio/max": 1.6056361198425293, "sampling/importance_sampling_ratio/mean": 1.0011367797851562, "sampling/importance_sampling_ratio/min": 0.6226766109466553, "sampling/sampling_logp_difference/max": 0.48480892181396484, "sampling/sampling_logp_difference/mean": 0.007405002135783434, "step": 177, "step_time": 30.438013943000442 }, { "clip_ratio/high_max": 0.025000001303851604, "clip_ratio/high_mean": 0.006250000325962901, "clip_ratio/low_mean": 0.021875001140870154, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028125001583248377, "entropy": 0.11180919618345797, "epoch": 0.00356, "grad_norm": 1.0773159265518188, "kl": 0.7693799175322056, "learning_rate": 9.999962659421255e-05, "loss": 0.0218, "step": 178, "step_time": 8.289468396000302 }, { "clip_ratio/high_max": 0.012500000651925802, "clip_ratio/high_mean": 0.0031250001629814506, "clip_ratio/low_mean": 0.0020833334419876337, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005208333604969084, "completions/clipped_ratio": 0.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 962.765625, "completions/mean_terminated_length": 962.765625, "completions/min_length": 893.0, "completions/min_terminated_length": 893.0, "entropy": 0.12420041672885418, "epoch": 0.00358, "frac_reward_zero_std": 0.0, "grad_norm": 0.5452980399131775, "kl": 0.5841826293617487, "learning_rate": 9.999962131646658e-05, "loss": 0.0223, "num_tokens": 9209601.0, "reward": 9.949074745178223, "reward_std": 11.123800277709961, "rewards/rollout_reward_func/mean": 9.949074745178223, "rewards/rollout_reward_func/std": 11.492538452148438, "sampling/importance_sampling_ratio/max": 1.846232295036316, "sampling/importance_sampling_ratio/mean": 1.0060797929763794, "sampling/importance_sampling_ratio/min": 0.692804217338562, "sampling/sampling_logp_difference/max": 0.6036995649337769, "sampling/sampling_logp_difference/mean": 0.0071367728523910046, "step": 179, "step_time": 29.633916566999687 }, { "clip_ratio/high_max": 0.03750000195577741, "clip_ratio/high_mean": 0.014583334093913436, "clip_ratio/low_mean": 0.018824405735358596, "clip_ratio/low_min": 0.004166666883975267, "clip_ratio/region_mean": 0.033407740062102675, "entropy": 0.11627750238403678, "epoch": 0.0036, "grad_norm": 0.38062411546707153, "kl": 0.639982882887125, "learning_rate": 9.999961600168402e-05, "loss": 0.0192, "step": 180, "step_time": 8.508149862998835 }, { "clip_ratio/high_max": 0.012500000651925802, "clip_ratio/high_mean": 0.0031250001629814506, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "completions/clipped_ratio": 0.0, "completions/max_length": 1031.0, "completions/max_terminated_length": 1031.0, "completions/mean_length": 964.46875, "completions/mean_terminated_length": 964.46875, "completions/min_length": 816.0, "completions/min_terminated_length": 816.0, "entropy": 0.10024931281805038, "epoch": 0.00362, "frac_reward_zero_std": 0.0, "grad_norm": 0.7843199968338013, "kl": 0.5290133021771908, "learning_rate": 9.999961064986489e-05, "loss": -0.0105, "num_tokens": 9322591.0, "reward": 9.743326187133789, "reward_std": 11.718559265136719, "rewards/rollout_reward_func/mean": 9.743326187133789, "rewards/rollout_reward_func/std": 11.767054557800293, "sampling/importance_sampling_ratio/max": 1.2395166158676147, "sampling/importance_sampling_ratio/mean": 0.9893835783004761, "sampling/importance_sampling_ratio/min": 0.7077917456626892, "sampling/sampling_logp_difference/max": 0.36174678802490234, "sampling/sampling_logp_difference/mean": 0.0061057801358401775, "step": 181, "step_time": 30.010152957000173 }, { "clip_ratio/high_max": 0.04583333572372794, "clip_ratio/high_mean": 0.01458333432674408, "clip_ratio/low_mean": 0.019791668048128486, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.034375002374872565, "entropy": 0.0898241214454174, "epoch": 0.00364, "grad_norm": 0.898304283618927, "kl": 1.3444663938134909, "learning_rate": 9.999960526100922e-05, "loss": -0.0074, "step": 182, "step_time": 8.117577253999116 }, { "clip_ratio/high_max": 0.012500000651925802, "clip_ratio/high_mean": 0.0031250001629814506, "clip_ratio/low_mean": 0.0011160714784637094, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00424107164144516, "completions/clipped_ratio": 0.0, "completions/max_length": 1037.0, "completions/max_terminated_length": 1037.0, "completions/mean_length": 965.296875, "completions/mean_terminated_length": 965.296875, "completions/min_length": 887.0, "completions/min_terminated_length": 887.0, "entropy": 0.12110280524939299, "epoch": 0.00366, "frac_reward_zero_std": 0.0, "grad_norm": 0.47400349378585815, "kl": 0.513819495216012, "learning_rate": 9.999959983511699e-05, "loss": 0.0011, "num_tokens": 9435640.0, "reward": 11.970619201660156, "reward_std": 16.7136287689209, "rewards/rollout_reward_func/mean": 11.970619201660156, "rewards/rollout_reward_func/std": 17.193565368652344, "sampling/importance_sampling_ratio/max": 1.4852927923202515, "sampling/importance_sampling_ratio/mean": 0.9956411123275757, "sampling/importance_sampling_ratio/min": 0.58425372838974, "sampling/sampling_logp_difference/max": 0.4939703941345215, "sampling/sampling_logp_difference/mean": 0.007358514238148928, "step": 183, "step_time": 30.018645907000064 }, { "clip_ratio/high_max": 0.03750000195577741, "clip_ratio/high_mean": 0.01041666732635349, "clip_ratio/low_mean": 0.01875000086147338, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.029166668420657516, "entropy": 0.12478661234490573, "epoch": 0.00368, "grad_norm": 0.29323798418045044, "kl": 0.46843259409070015, "learning_rate": 9.999959437218822e-05, "loss": -0.0073, "step": 184, "step_time": 8.045792180003446 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020833334419876337, "completions/clipped_ratio": 0.0, "completions/max_length": 1026.0, "completions/max_terminated_length": 1026.0, "completions/mean_length": 959.875, "completions/mean_terminated_length": 959.875, "completions/min_length": 676.0, "completions/min_terminated_length": 676.0, "entropy": 0.12372714094817638, "epoch": 0.0037, "frac_reward_zero_std": 0.0, "grad_norm": 0.49018675088882446, "kl": 0.5567406937479973, "learning_rate": 9.999958887222293e-05, "loss": -0.0266, "num_tokens": 9548327.0, "reward": 8.300872802734375, "reward_std": 11.473505020141602, "rewards/rollout_reward_func/mean": 8.300872802734375, "rewards/rollout_reward_func/std": 13.137120246887207, "sampling/importance_sampling_ratio/max": 1.3434193134307861, "sampling/importance_sampling_ratio/mean": 1.0231890678405762, "sampling/importance_sampling_ratio/min": 0.8001201748847961, "sampling/sampling_logp_difference/max": 0.24235105514526367, "sampling/sampling_logp_difference/mean": 0.006944713182747364, "step": 185, "step_time": 30.03380806199948 }, { "clip_ratio/high_max": 0.058333336375653744, "clip_ratio/high_mean": 0.01770833437331021, "clip_ratio/low_mean": 0.012500000651925802, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03020833490882069, "entropy": 0.13152629090473056, "epoch": 0.00372, "grad_norm": 0.23521849513053894, "kl": 0.5634740013629198, "learning_rate": 9.999958333522109e-05, "loss": -0.0341, "step": 186, "step_time": 8.600791754000966 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.0020833334419876337, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250001629814506, "completions/clipped_ratio": 0.0, "completions/max_length": 1045.0, "completions/max_terminated_length": 1045.0, "completions/mean_length": 945.21875, "completions/mean_terminated_length": 945.21875, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "entropy": 0.1315653999336064, "epoch": 0.00374, "frac_reward_zero_std": 0.0, "grad_norm": 0.36500951647758484, "kl": 0.5178995914757252, "learning_rate": 9.999957776118273e-05, "loss": -0.0136, "num_tokens": 9660136.0, "reward": 7.931632041931152, "reward_std": 11.40542984008789, "rewards/rollout_reward_func/mean": 7.931632041931152, "rewards/rollout_reward_func/std": 12.151664733886719, "sampling/importance_sampling_ratio/max": 1.7536835670471191, "sampling/importance_sampling_ratio/mean": 1.001771092414856, "sampling/importance_sampling_ratio/min": 0.7216951251029968, "sampling/sampling_logp_difference/max": 0.5699708461761475, "sampling/sampling_logp_difference/mean": 0.0067958529107272625, "step": 187, "step_time": 29.347854906000975 }, { "clip_ratio/high_max": 0.054166669491678476, "clip_ratio/high_mean": 0.01770833448972553, "clip_ratio/low_mean": 0.025694445823319256, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04340278054587543, "entropy": 0.13414463540539145, "epoch": 0.00376, "grad_norm": 0.21745486557483673, "kl": 0.5746774040162563, "learning_rate": 9.999957215010784e-05, "loss": -0.019, "step": 188, "step_time": 8.856123159000163 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020833334419876337, "completions/clipped_ratio": 0.0, "completions/max_length": 1026.0, "completions/max_terminated_length": 1026.0, "completions/mean_length": 947.34375, "completions/mean_terminated_length": 947.34375, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.14527452224865556, "epoch": 0.00378, "frac_reward_zero_std": 0.0, "grad_norm": 0.4758737087249756, "kl": 0.6551676895469427, "learning_rate": 9.999956650199645e-05, "loss": -0.0064, "num_tokens": 9771998.0, "reward": 8.513150215148926, "reward_std": 14.811095237731934, "rewards/rollout_reward_func/mean": 8.513150215148926, "rewards/rollout_reward_func/std": 15.769759178161621, "sampling/importance_sampling_ratio/max": 1.4140323400497437, "sampling/importance_sampling_ratio/mean": 1.0076611042022705, "sampling/importance_sampling_ratio/min": 0.5691302418708801, "sampling/sampling_logp_difference/max": 0.7131770253181458, "sampling/sampling_logp_difference/mean": 0.009376442059874535, "step": 189, "step_time": 30.213357230003567 }, { "clip_ratio/high_max": 0.054166669491678476, "clip_ratio/high_mean": 0.014583334210328758, "clip_ratio/low_mean": 0.0281250016996637, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.042708336375653744, "entropy": 0.13438974926248193, "epoch": 0.0038, "grad_norm": 0.2324807345867157, "kl": 0.737682543694973, "learning_rate": 9.999956081684854e-05, "loss": -0.0149, "step": 190, "step_time": 7.734431613998822 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020833334419876337, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 962.203125, "completions/mean_terminated_length": 962.203125, "completions/min_length": 881.0, "completions/min_terminated_length": 881.0, "entropy": 0.1253855088725686, "epoch": 0.00382, "frac_reward_zero_std": 0.0, "grad_norm": 0.41439345479011536, "kl": 0.709712341427803, "learning_rate": 9.999955509466414e-05, "loss": 0.0269, "num_tokens": 9884808.0, "reward": 9.057092666625977, "reward_std": 9.098945617675781, "rewards/rollout_reward_func/mean": 9.05709171295166, "rewards/rollout_reward_func/std": 10.38012981414795, "sampling/importance_sampling_ratio/max": 1.3585758209228516, "sampling/importance_sampling_ratio/mean": 0.989570677280426, "sampling/importance_sampling_ratio/min": 0.6827925443649292, "sampling/sampling_logp_difference/max": 0.40184950828552246, "sampling/sampling_logp_difference/mean": 0.00655590184032917, "step": 191, "step_time": 31.590866651999022 }, { "clip_ratio/high_max": 0.03392857313156128, "clip_ratio/high_mean": 0.010565476841293275, "clip_ratio/low_mean": 0.0293154779355973, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0398809548933059, "entropy": 0.11225170968100429, "epoch": 0.00384, "grad_norm": 0.23349761962890625, "kl": 0.8278532009571791, "learning_rate": 9.999954933544323e-05, "loss": 0.0201, "step": 192, "step_time": 7.970918687003177 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.0020833334419876337, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250001629814506, "completions/clipped_ratio": 0.0, "completions/max_length": 1031.0, "completions/max_terminated_length": 1031.0, "completions/mean_length": 977.734375, "completions/mean_terminated_length": 977.734375, "completions/min_length": 896.0, "completions/min_terminated_length": 896.0, "entropy": 0.11717891087755561, "epoch": 0.00386, "frac_reward_zero_std": 0.0, "grad_norm": 0.4010028839111328, "kl": 0.6346222888678312, "learning_rate": 9.999954353918583e-05, "loss": 0.0125, "num_tokens": 9998710.0, "reward": 12.752401351928711, "reward_std": 15.009429931640625, "rewards/rollout_reward_func/mean": 12.752399444580078, "rewards/rollout_reward_func/std": 15.288240432739258, "sampling/importance_sampling_ratio/max": 1.3140867948532104, "sampling/importance_sampling_ratio/mean": 0.9636229276657104, "sampling/importance_sampling_ratio/min": 0.5537927746772766, "sampling/sampling_logp_difference/max": 0.36048221588134766, "sampling/sampling_logp_difference/mean": 0.007171455770730972, "step": 193, "step_time": 30.459012025998163 }, { "clip_ratio/high_max": 0.029166667722165585, "clip_ratio/high_mean": 0.007291666930541396, "clip_ratio/low_mean": 0.03020833502523601, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03750000218860805, "entropy": 0.11490702140145004, "epoch": 0.00388, "grad_norm": 0.23535722494125366, "kl": 0.6073946505784988, "learning_rate": 9.999953770589194e-05, "loss": 0.006, "step": 194, "step_time": 8.631377130000146 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.0020833334419876337, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250001629814506, "completions/clipped_ratio": 0.0, "completions/max_length": 1033.0, "completions/max_terminated_length": 1033.0, "completions/mean_length": 970.203125, "completions/mean_terminated_length": 970.203125, "completions/min_length": 898.0, "completions/min_terminated_length": 898.0, "entropy": 0.11113500501960516, "epoch": 0.0039, "frac_reward_zero_std": 0.0, "grad_norm": 0.604935348033905, "kl": 0.6790309809148312, "learning_rate": 9.999953183556157e-05, "loss": 0.0026, "num_tokens": 10112081.0, "reward": 7.972203731536865, "reward_std": 13.011554718017578, "rewards/rollout_reward_func/mean": 7.972204208374023, "rewards/rollout_reward_func/std": 13.773921966552734, "sampling/importance_sampling_ratio/max": 1.3542617559432983, "sampling/importance_sampling_ratio/mean": 0.9855128526687622, "sampling/importance_sampling_ratio/min": 0.597061276435852, "sampling/sampling_logp_difference/max": 0.4635782241821289, "sampling/sampling_logp_difference/mean": 0.006834958214312792, "step": 195, "step_time": 30.052685054003632 }, { "clip_ratio/high_max": 0.029166668187826872, "clip_ratio/high_mean": 0.007291667046956718, "clip_ratio/low_mean": 0.015625000814907253, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022916667978279293, "entropy": 0.11238743201829493, "epoch": 0.00392, "grad_norm": 0.4268299341201782, "kl": 0.700402544811368, "learning_rate": 9.999952592819473e-05, "loss": -0.0015, "step": 196, "step_time": 8.260044886000287 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.0020833334419876337, "clip_ratio/low_mean": 0.0032738096779212356, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005357143119908869, "completions/clipped_ratio": 0.0, "completions/max_length": 1043.0, "completions/max_terminated_length": 1043.0, "completions/mean_length": 934.5, "completions/mean_terminated_length": 934.5, "completions/min_length": 878.0, "completions/min_terminated_length": 878.0, "entropy": 0.1125073074363172, "epoch": 0.00394, "frac_reward_zero_std": 0.0, "grad_norm": 0.6487244963645935, "kl": 0.6208249572664499, "learning_rate": 9.99995199837914e-05, "loss": 0.0046, "num_tokens": 10223022.0, "reward": 8.661399841308594, "reward_std": 15.73376178741455, "rewards/rollout_reward_func/mean": 8.661399841308594, "rewards/rollout_reward_func/std": 15.457544326782227, "sampling/importance_sampling_ratio/max": 1.324127435684204, "sampling/importance_sampling_ratio/mean": 1.0008368492126465, "sampling/importance_sampling_ratio/min": 0.6733382344245911, "sampling/sampling_logp_difference/max": 0.35140299797058105, "sampling/sampling_logp_difference/mean": 0.007979365065693855, "step": 197, "step_time": 31.166683005998493 }, { "clip_ratio/high_max": 0.021130953449755907, "clip_ratio/high_mean": 0.00840773864183575, "clip_ratio/low_mean": 0.02730654936749488, "clip_ratio/low_min": 0.004166666883975267, "clip_ratio/region_mean": 0.035714288242161274, "entropy": 0.11146878870204091, "epoch": 0.00396, "grad_norm": 0.5962705016136169, "kl": 0.9501709761098027, "learning_rate": 9.999951400235163e-05, "loss": 0.004, "step": 198, "step_time": 8.287430281997331 }, { "clip_ratio/high_max": 0.012797619681805372, "clip_ratio/high_mean": 0.003199404920451343, "clip_ratio/low_mean": 0.0020833334419876337, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005282738362438977, "completions/clipped_ratio": 0.0, "completions/max_length": 1047.0, "completions/max_terminated_length": 1047.0, "completions/mean_length": 980.65625, "completions/mean_terminated_length": 980.65625, "completions/min_length": 902.0, "completions/min_terminated_length": 902.0, "entropy": 0.11779335234314203, "epoch": 0.00398, "frac_reward_zero_std": 0.0, "grad_norm": 0.523526668548584, "kl": 0.5669353120028973, "learning_rate": 9.999950798387541e-05, "loss": 0.0049, "num_tokens": 10337112.0, "reward": 10.420181274414062, "reward_std": 16.354602813720703, "rewards/rollout_reward_func/mean": 10.420181274414062, "rewards/rollout_reward_func/std": 17.055269241333008, "sampling/importance_sampling_ratio/max": 1.23856782913208, "sampling/importance_sampling_ratio/mean": 0.9714287519454956, "sampling/importance_sampling_ratio/min": 0.7061982750892639, "sampling/sampling_logp_difference/max": 0.447023868560791, "sampling/sampling_logp_difference/mean": 0.00747651606798172, "step": 199, "step_time": 30.34761462899951 }, { "clip_ratio/high_max": 0.029464287217706442, "clip_ratio/high_mean": 0.010491072083823383, "clip_ratio/low_mean": 0.02091703994665295, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031408111681230366, "entropy": 0.11395548144355416, "epoch": 0.004, "grad_norm": 0.3284382224082947, "kl": 0.5632808655500412, "learning_rate": 9.999950192836271e-05, "loss": -0.001, "step": 200, "step_time": 8.547375084998748 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010416667209938169, "completions/clipped_ratio": 0.0, "completions/max_length": 1034.0, "completions/max_terminated_length": 1034.0, "completions/mean_length": 971.3125, "completions/mean_terminated_length": 971.3125, "completions/min_length": 873.0, "completions/min_terminated_length": 873.0, "entropy": 0.1107462802901864, "epoch": 0.00402, "frac_reward_zero_std": 0.0, "grad_norm": 0.42466774582862854, "kl": 0.504519259557128, "learning_rate": 9.999949583581359e-05, "loss": 0.0037, "num_tokens": 10450565.0, "reward": 12.199589729309082, "reward_std": 12.77005672454834, "rewards/rollout_reward_func/mean": 12.199588775634766, "rewards/rollout_reward_func/std": 13.816198348999023, "sampling/importance_sampling_ratio/max": 1.1825975179672241, "sampling/importance_sampling_ratio/mean": 0.9908883571624756, "sampling/importance_sampling_ratio/min": 0.6934873461723328, "sampling/sampling_logp_difference/max": 0.3765444755554199, "sampling/sampling_logp_difference/mean": 0.006183322053402662, "step": 201, "step_time": 30.161383785001817 }, { "clip_ratio/high_max": 0.03750000195577741, "clip_ratio/high_mean": 0.011458334047347307, "clip_ratio/low_mean": 0.021875001257285476, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03333333553746343, "entropy": 0.10510897357016802, "epoch": 0.00404, "grad_norm": 0.21419784426689148, "kl": 0.5648845955729485, "learning_rate": 9.999948970622802e-05, "loss": -0.0012, "step": 202, "step_time": 8.714965140998174 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020833334419876337, "completions/clipped_ratio": 0.0, "completions/max_length": 1046.0, "completions/max_terminated_length": 1046.0, "completions/mean_length": 979.578125, "completions/mean_terminated_length": 979.578125, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "entropy": 0.12641333835199475, "epoch": 0.00406, "frac_reward_zero_std": 0.0, "grad_norm": 0.6306821703910828, "kl": 0.5104430429637432, "learning_rate": 9.9999483539606e-05, "loss": -0.0021, "num_tokens": 10564630.0, "reward": 10.778827667236328, "reward_std": 13.483461380004883, "rewards/rollout_reward_func/mean": 10.778827667236328, "rewards/rollout_reward_func/std": 14.313225746154785, "sampling/importance_sampling_ratio/max": 1.4068244695663452, "sampling/importance_sampling_ratio/mean": 0.9891500473022461, "sampling/importance_sampling_ratio/min": 0.6753217577934265, "sampling/sampling_logp_difference/max": 0.3969893455505371, "sampling/sampling_logp_difference/mean": 0.007549474947154522, "step": 203, "step_time": 29.916299866999907 }, { "clip_ratio/high_max": 0.04583333572372794, "clip_ratio/high_mean": 0.013541667489334941, "clip_ratio/low_mean": 0.03132440650369972, "clip_ratio/low_min": 0.004166666883975267, "clip_ratio/region_mean": 0.04486607445869595, "entropy": 0.12076347460970283, "epoch": 0.00408, "grad_norm": 0.29815390706062317, "kl": 0.5736292470246553, "learning_rate": 9.999947733594757e-05, "loss": -0.0096, "step": 204, "step_time": 7.709945141001299 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.006250000325962901, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007291667046956718, "completions/clipped_ratio": 0.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 948.3125, "completions/mean_terminated_length": 948.3125, "completions/min_length": 878.0, "completions/min_terminated_length": 878.0, "entropy": 0.10954847000539303, "epoch": 0.0041, "frac_reward_zero_std": 0.0, "grad_norm": 0.8904768228530884, "kl": 0.5102124018594623, "learning_rate": 9.999947109525271e-05, "loss": 0.0269, "num_tokens": 10676487.0, "reward": 7.509866237640381, "reward_std": 12.055532455444336, "rewards/rollout_reward_func/mean": 7.509865760803223, "rewards/rollout_reward_func/std": 12.425904273986816, "sampling/importance_sampling_ratio/max": 2.821709156036377, "sampling/importance_sampling_ratio/mean": 1.0446405410766602, "sampling/importance_sampling_ratio/min": 0.6838214993476868, "sampling/sampling_logp_difference/max": 0.6221010684967041, "sampling/sampling_logp_difference/mean": 0.007641012314707041, "step": 205, "step_time": 32.10779399100011 }, { "clip_ratio/high_max": 0.029166668187826872, "clip_ratio/high_mean": 0.008333333767950535, "clip_ratio/low_mean": 0.0238932310603559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.032226564711891115, "entropy": 0.09176747733727098, "epoch": 0.00412, "grad_norm": 0.5064001083374023, "kl": 0.6276722047477961, "learning_rate": 9.999946481752144e-05, "loss": 0.0257, "step": 206, "step_time": 8.04664100899663 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010416667209938169, "completions/clipped_ratio": 0.0, "completions/max_length": 1032.0, "completions/max_terminated_length": 1032.0, "completions/mean_length": 948.5, "completions/mean_terminated_length": 948.5, "completions/min_length": 695.0, "completions/min_terminated_length": 695.0, "entropy": 0.0839753916952759, "epoch": 0.00414, "frac_reward_zero_std": 0.0, "grad_norm": 0.6526506543159485, "kl": 0.5436345608904958, "learning_rate": 9.999945850275377e-05, "loss": -0.0066, "num_tokens": 10788398.0, "reward": 4.734495639801025, "reward_std": 13.251731872558594, "rewards/rollout_reward_func/mean": 4.734495639801025, "rewards/rollout_reward_func/std": 15.050627708435059, "sampling/importance_sampling_ratio/max": 1.249489426612854, "sampling/importance_sampling_ratio/mean": 1.0017802715301514, "sampling/importance_sampling_ratio/min": 0.5872460603713989, "sampling/sampling_logp_difference/max": 0.5519323348999023, "sampling/sampling_logp_difference/mean": 0.007509762421250343, "step": 207, "step_time": 30.502280216000145 }, { "clip_ratio/high_max": 0.041666668839752674, "clip_ratio/high_mean": 0.010416667209938169, "clip_ratio/low_mean": 0.020126489107497036, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.030543157132342458, "entropy": 0.0718412920832634, "epoch": 0.00416, "grad_norm": 0.9516690969467163, "kl": 1.0864872355014086, "learning_rate": 9.999945215094969e-05, "loss": -0.0086, "step": 208, "step_time": 8.340999965001174 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.0020833334419876337, "clip_ratio/low_mean": 0.0020833334419876337, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 948.453125, "completions/mean_terminated_length": 948.453125, "completions/min_length": 656.0, "completions/min_terminated_length": 656.0, "entropy": 0.0782642443664372, "epoch": 0.00418, "frac_reward_zero_std": 0.0, "grad_norm": 0.9959932565689087, "kl": 0.5945225208997726, "learning_rate": 9.99994457621092e-05, "loss": 0.0171, "num_tokens": 10900280.0, "reward": 9.700709342956543, "reward_std": 13.213409423828125, "rewards/rollout_reward_func/mean": 9.700709342956543, "rewards/rollout_reward_func/std": 14.225313186645508, "sampling/importance_sampling_ratio/max": 1.3207358121871948, "sampling/importance_sampling_ratio/mean": 0.968299150466919, "sampling/importance_sampling_ratio/min": 0.3971961438655853, "sampling/sampling_logp_difference/max": 0.8888199329376221, "sampling/sampling_logp_difference/mean": 0.00841559562832117, "step": 209, "step_time": 29.9366263410011 }, { "clip_ratio/high_max": 0.025000001303851604, "clip_ratio/high_mean": 0.006250000325962901, "clip_ratio/low_mean": 0.02656250144354999, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03281250165309757, "entropy": 0.07524953898973763, "epoch": 0.0042, "grad_norm": 0.22927281260490417, "kl": 0.5741278808563948, "learning_rate": 9.999943933623233e-05, "loss": 0.0142, "step": 210, "step_time": 8.610201505000987 }, { "clip_ratio/high_max": 0.012500000651925802, "clip_ratio/high_mean": 0.0031250001629814506, "clip_ratio/low_mean": 0.0011160714784637094, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00424107164144516, "completions/clipped_ratio": 0.0, "completions/max_length": 1033.0, "completions/max_terminated_length": 1033.0, "completions/mean_length": 961.625, "completions/mean_terminated_length": 961.625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.08438117569312453, "epoch": 0.00422, "frac_reward_zero_std": 0.0, "grad_norm": 0.6056447625160217, "kl": 0.4765475448220968, "learning_rate": 9.999943287331907e-05, "loss": -0.0396, "num_tokens": 11013133.0, "reward": 6.143215179443359, "reward_std": 9.006479263305664, "rewards/rollout_reward_func/mean": 6.143215179443359, "rewards/rollout_reward_func/std": 10.255783081054688, "sampling/importance_sampling_ratio/max": 1.5546733140945435, "sampling/importance_sampling_ratio/mean": 0.9941245913505554, "sampling/importance_sampling_ratio/min": 0.5497701168060303, "sampling/sampling_logp_difference/max": 0.6002916693687439, "sampling/sampling_logp_difference/mean": 0.0072316620498895645, "step": 211, "step_time": 29.858850818000974 }, { "clip_ratio/high_max": 0.020833334419876337, "clip_ratio/high_mean": 0.006250000325962901, "clip_ratio/low_mean": 0.01889881060924381, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02514881093520671, "entropy": 0.08111793245188892, "epoch": 0.00424, "grad_norm": 0.3952238857746124, "kl": 0.5354121858254075, "learning_rate": 9.999942637336943e-05, "loss": -0.0419, "step": 212, "step_time": 8.145115041997997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250001629814506, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250001629814506, "completions/clipped_ratio": 0.0, "completions/max_length": 1046.0, "completions/max_terminated_length": 1046.0, "completions/mean_length": 974.1875, "completions/mean_terminated_length": 974.1875, "completions/min_length": 853.0, "completions/min_terminated_length": 853.0, "entropy": 0.08407697454094887, "epoch": 0.00426, "frac_reward_zero_std": 0.0, "grad_norm": 0.5582248568534851, "kl": 0.5609004180878401, "learning_rate": 9.999941983638342e-05, "loss": -0.0096, "num_tokens": 11126805.0, "reward": 7.366800308227539, "reward_std": 11.575126647949219, "rewards/rollout_reward_func/mean": 7.366800785064697, "rewards/rollout_reward_func/std": 12.478679656982422, "sampling/importance_sampling_ratio/max": 1.7624305486679077, "sampling/importance_sampling_ratio/mean": 1.0073318481445312, "sampling/importance_sampling_ratio/min": 0.5805040001869202, "sampling/sampling_logp_difference/max": 0.5259637832641602, "sampling/sampling_logp_difference/mean": 0.007181447930634022, "step": 213, "step_time": 30.68919447299777 }, { "clip_ratio/high_max": 0.012500000651925802, "clip_ratio/high_mean": 0.005208333721384406, "clip_ratio/low_mean": 0.02083333453629166, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026041668257676065, "entropy": 0.08252408797852695, "epoch": 0.00428, "grad_norm": 0.4972332715988159, "kl": 0.8270881623029709, "learning_rate": 9.999941326236106e-05, "loss": -0.0102, "step": 214, "step_time": 8.636868058998516 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020833334419876337, "completions/clipped_ratio": 0.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 947.234375, "completions/mean_terminated_length": 947.234375, "completions/min_length": 827.0, "completions/min_terminated_length": 827.0, "entropy": 0.09235736005939543, "epoch": 0.0043, "frac_reward_zero_std": 0.0, "grad_norm": 0.7149195075035095, "kl": 0.6700677536427975, "learning_rate": 9.999940665130233e-05, "loss": 0.0269, "num_tokens": 11238594.0, "reward": 8.236663818359375, "reward_std": 12.342934608459473, "rewards/rollout_reward_func/mean": 8.236662864685059, "rewards/rollout_reward_func/std": 13.346291542053223, "sampling/importance_sampling_ratio/max": 1.3215135335922241, "sampling/importance_sampling_ratio/mean": 1.0117213726043701, "sampling/importance_sampling_ratio/min": 0.607474684715271, "sampling/sampling_logp_difference/max": 0.3575429916381836, "sampling/sampling_logp_difference/mean": 0.00792029220610857, "step": 215, "step_time": 30.079993358000138 }, { "clip_ratio/high_max": 0.03333333507180214, "clip_ratio/high_mean": 0.009375000605359674, "clip_ratio/low_mean": 0.03020833502523601, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.039583335630595684, "entropy": 0.09003956150263548, "epoch": 0.00432, "grad_norm": 0.22825075685977936, "kl": 0.7063372246921062, "learning_rate": 9.999940000320725e-05, "loss": 0.0204, "step": 216, "step_time": 8.830438550000508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010416667209938169, "completions/clipped_ratio": 0.0, "completions/max_length": 1038.0, "completions/max_terminated_length": 1038.0, "completions/mean_length": 973.421875, "completions/mean_terminated_length": 973.421875, "completions/min_length": 899.0, "completions/min_terminated_length": 899.0, "entropy": 0.08580271410755813, "epoch": 0.00434, "frac_reward_zero_std": 0.0, "grad_norm": 0.544740617275238, "kl": 0.8732884284108877, "learning_rate": 9.999939331807582e-05, "loss": -0.0038, "num_tokens": 11352163.0, "reward": 6.939154148101807, "reward_std": 12.035371780395508, "rewards/rollout_reward_func/mean": 6.939153671264648, "rewards/rollout_reward_func/std": 12.4366455078125, "sampling/importance_sampling_ratio/max": 1.316995620727539, "sampling/importance_sampling_ratio/mean": 1.0068674087524414, "sampling/importance_sampling_ratio/min": 0.7823165059089661, "sampling/sampling_logp_difference/max": 0.2636311650276184, "sampling/sampling_logp_difference/mean": 0.0060178861021995544, "step": 217, "step_time": 30.360063980000632 }, { "clip_ratio/high_max": 0.025000001303851604, "clip_ratio/high_mean": 0.006250000325962901, "clip_ratio/low_mean": 0.018750000977888703, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.025000001420266926, "entropy": 0.08710528793744743, "epoch": 0.00436, "grad_norm": 0.38059887290000916, "kl": 0.908846540376544, "learning_rate": 9.999938659590807e-05, "loss": -0.0104, "step": 218, "step_time": 7.607007630000226 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.0031250001629814506, "clip_ratio/low_mean": 0.0030598959419876337, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006184896221384406, "completions/clipped_ratio": 0.0, "completions/max_length": 1034.0, "completions/max_terminated_length": 1034.0, "completions/mean_length": 963.703125, "completions/mean_terminated_length": 963.703125, "completions/min_length": 890.0, "completions/min_terminated_length": 890.0, "entropy": 0.09868060098960996, "epoch": 0.00438, "frac_reward_zero_std": 0.0, "grad_norm": 0.48504236340522766, "kl": 0.5343823749572039, "learning_rate": 9.999937983670398e-05, "loss": 0.0194, "num_tokens": 11465042.0, "reward": 6.008334636688232, "reward_std": 13.721019744873047, "rewards/rollout_reward_func/mean": 6.008334636688232, "rewards/rollout_reward_func/std": 14.5517578125, "sampling/importance_sampling_ratio/max": 1.471420407295227, "sampling/importance_sampling_ratio/mean": 0.9755445718765259, "sampling/importance_sampling_ratio/min": 0.5715925097465515, "sampling/sampling_logp_difference/max": 0.46605920791625977, "sampling/sampling_logp_difference/mean": 0.008881919085979462, "step": 219, "step_time": 31.71598935200018 }, { "clip_ratio/high_max": 0.03750000195577741, "clip_ratio/high_mean": 0.010416667209938169, "clip_ratio/low_mean": 0.022851563524454832, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033268230734393, "entropy": 0.09918246325105429, "epoch": 0.0044, "grad_norm": 0.22379587590694427, "kl": 0.5816311649978161, "learning_rate": 9.999937304046355e-05, "loss": 0.0147, "step": 220, "step_time": 8.254640479001864 }, { "clip_ratio/high_max": 0.012500000651925802, "clip_ratio/high_mean": 0.0031250001629814506, "clip_ratio/low_mean": 0.0020833334419876337, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005208333604969084, "completions/clipped_ratio": 0.0, "completions/max_length": 1030.0, "completions/max_terminated_length": 1030.0, "completions/mean_length": 955.53125, "completions/mean_terminated_length": 955.53125, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "entropy": 0.09134439891204238, "epoch": 0.00442, "frac_reward_zero_std": 0.0, "grad_norm": 0.7109507322311401, "kl": 0.6976822856813669, "learning_rate": 9.999936620718681e-05, "loss": 0.0063, "num_tokens": 11577407.0, "reward": 7.297264099121094, "reward_std": 9.222230911254883, "rewards/rollout_reward_func/mean": 7.297264575958252, "rewards/rollout_reward_func/std": 10.19138240814209, "sampling/importance_sampling_ratio/max": 1.4536468982696533, "sampling/importance_sampling_ratio/mean": 0.9984610080718994, "sampling/importance_sampling_ratio/min": 0.7001582384109497, "sampling/sampling_logp_difference/max": 0.37113046646118164, "sampling/sampling_logp_difference/mean": 0.006036648992449045, "step": 221, "step_time": 29.85640163500102 }, { "clip_ratio/high_max": 0.03333333507180214, "clip_ratio/high_mean": 0.009375000488944352, "clip_ratio/low_mean": 0.014583334210328758, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02395833469927311, "entropy": 0.09828592231497169, "epoch": 0.00444, "grad_norm": 1.0275782346725464, "kl": 0.5333473347127438, "learning_rate": 9.999935933687375e-05, "loss": 0.0064, "step": 222, "step_time": 8.916792385998633 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.0020833334419876337, "clip_ratio/low_mean": 0.0020833334419876337, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 955.5625, "completions/mean_terminated_length": 955.5625, "completions/min_length": 864.0, "completions/min_terminated_length": 864.0, "entropy": 0.11322583490982652, "epoch": 0.00446, "frac_reward_zero_std": 0.0, "grad_norm": 0.6080738306045532, "kl": 0.4430428724735975, "learning_rate": 9.999935242952441e-05, "loss": 0.0136, "num_tokens": 11689757.0, "reward": 7.010858535766602, "reward_std": 12.169811248779297, "rewards/rollout_reward_func/mean": 7.010858535766602, "rewards/rollout_reward_func/std": 12.808332443237305, "sampling/importance_sampling_ratio/max": 1.297809362411499, "sampling/importance_sampling_ratio/mean": 0.9824950695037842, "sampling/importance_sampling_ratio/min": 0.6718153953552246, "sampling/sampling_logp_difference/max": 0.3088874816894531, "sampling/sampling_logp_difference/mean": 0.007251087576150894, "step": 223, "step_time": 31.12233561499943 }, { "clip_ratio/high_max": 0.05000000260770321, "clip_ratio/high_mean": 0.01770833437331021, "clip_ratio/low_mean": 0.02285156410653144, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04055989871267229, "entropy": 0.11228394089266658, "epoch": 0.00448, "grad_norm": 0.5497627258300781, "kl": 0.6058794800192118, "learning_rate": 9.999934548513874e-05, "loss": 0.0127, "step": 224, "step_time": 8.354907415001435 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.0031250001629814506, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "completions/clipped_ratio": 0.0, "completions/max_length": 1039.0, "completions/max_terminated_length": 1039.0, "completions/mean_length": 941.9375, "completions/mean_terminated_length": 941.9375, "completions/min_length": 811.0, "completions/min_terminated_length": 811.0, "entropy": 0.10548029001802206, "epoch": 0.0045, "frac_reward_zero_std": 0.0, "grad_norm": 0.5894492864608765, "kl": 0.5073134936392307, "learning_rate": 9.999933850371681e-05, "loss": 0.0086, "num_tokens": 11801157.0, "reward": 6.051244258880615, "reward_std": 9.136930465698242, "rewards/rollout_reward_func/mean": 6.051244258880615, "rewards/rollout_reward_func/std": 9.732189178466797, "sampling/importance_sampling_ratio/max": 1.4082491397857666, "sampling/importance_sampling_ratio/mean": 0.9974700212478638, "sampling/importance_sampling_ratio/min": 0.6021063923835754, "sampling/sampling_logp_difference/max": 0.5747667551040649, "sampling/sampling_logp_difference/mean": 0.0068025123327970505, "step": 225, "step_time": 31.567075909998493 }, { "clip_ratio/high_max": 0.025000001303851604, "clip_ratio/high_mean": 0.006250000325962901, "clip_ratio/low_mean": 0.027083334745839238, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033333335421048105, "entropy": 0.10574616026133299, "epoch": 0.00452, "grad_norm": 0.2798631489276886, "kl": 0.7715174313634634, "learning_rate": 9.999933148525857e-05, "loss": 0.007, "step": 226, "step_time": 7.815335610000147 }, { "clip_ratio/high_max": 0.012500000651925802, "clip_ratio/high_mean": 0.0031250001629814506, "clip_ratio/low_mean": 0.0020833334419876337, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005208333604969084, "completions/clipped_ratio": 0.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 959.5625, "completions/mean_terminated_length": 959.5625, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "entropy": 0.11399172944948077, "epoch": 0.00454, "frac_reward_zero_std": 0.0, "grad_norm": 0.568816602230072, "kl": 0.5976129788905382, "learning_rate": 9.999932442976408e-05, "loss": -0.0166, "num_tokens": 11913755.0, "reward": 8.459915161132812, "reward_std": 15.611612319946289, "rewards/rollout_reward_func/mean": 8.459915161132812, "rewards/rollout_reward_func/std": 15.882699012756348, "sampling/importance_sampling_ratio/max": 1.710551381111145, "sampling/importance_sampling_ratio/mean": 1.017797827720642, "sampling/importance_sampling_ratio/min": 0.7439659833908081, "sampling/sampling_logp_difference/max": 0.36053359508514404, "sampling/sampling_logp_difference/mean": 0.008504325523972511, "step": 227, "step_time": 31.015096133000043 }, { "clip_ratio/high_max": 0.03750000195577741, "clip_ratio/high_mean": 0.013541667489334941, "clip_ratio/low_mean": 0.020833334769122303, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03437500272411853, "entropy": 0.11824841611087322, "epoch": 0.00456, "grad_norm": 0.31711068749427795, "kl": 0.6181838270276785, "learning_rate": 9.999931733723329e-05, "loss": -0.0224, "step": 228, "step_time": 8.539993245000005 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.0020833334419876337, "clip_ratio/low_mean": 0.0020833334419876337, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "completions/clipped_ratio": 0.0, "completions/max_length": 1029.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 949.875, "completions/mean_terminated_length": 949.875, "completions/min_length": 829.0, "completions/min_terminated_length": 829.0, "entropy": 0.11835443088784814, "epoch": 0.00458, "frac_reward_zero_std": 0.0, "grad_norm": 0.510497510433197, "kl": 0.562442360445857, "learning_rate": 9.999931020766625e-05, "loss": -0.0151, "num_tokens": 12025731.0, "reward": 7.8150177001953125, "reward_std": 10.93730640411377, "rewards/rollout_reward_func/mean": 7.8150177001953125, "rewards/rollout_reward_func/std": 12.047165870666504, "sampling/importance_sampling_ratio/max": 1.8081343173980713, "sampling/importance_sampling_ratio/mean": 1.0230156183242798, "sampling/importance_sampling_ratio/min": 0.5872366428375244, "sampling/sampling_logp_difference/max": 0.5174302458763123, "sampling/sampling_logp_difference/mean": 0.008099589496850967, "step": 229, "step_time": 30.196818825002993 }, { "clip_ratio/high_max": 0.029166668187826872, "clip_ratio/high_mean": 0.007291667046956718, "clip_ratio/low_mean": 0.028125001466833055, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03541666886303574, "entropy": 0.12235437287017703, "epoch": 0.0046, "grad_norm": 0.6880154013633728, "kl": 0.5698418729007244, "learning_rate": 9.999930304106295e-05, "loss": -0.0198, "step": 230, "step_time": 9.264137213997856 }, { "clip_ratio/high_max": 0.01666666753590107, "clip_ratio/high_mean": 0.004166666883975267, "clip_ratio/low_mean": 0.0031250001629814506, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007291667046956718, "completions/clipped_ratio": 0.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 953.078125, "completions/mean_terminated_length": 953.078125, "completions/min_length": 641.0, "completions/min_terminated_length": 641.0, "entropy": 0.10955408262088895, "epoch": 0.00462, "frac_reward_zero_std": 0.0, "grad_norm": 0.4517523944377899, "kl": 0.562993137165904, "learning_rate": 9.99992958374234e-05, "loss": 0.0172, "num_tokens": 12137928.0, "reward": 6.407680034637451, "reward_std": 12.907535552978516, "rewards/rollout_reward_func/mean": 6.407680034637451, "rewards/rollout_reward_func/std": 14.238213539123535, "sampling/importance_sampling_ratio/max": 1.3800395727157593, "sampling/importance_sampling_ratio/mean": 0.989479124546051, "sampling/importance_sampling_ratio/min": 0.5886368155479431, "sampling/sampling_logp_difference/max": 0.4858388900756836, "sampling/sampling_logp_difference/mean": 0.007202588953077793, "step": 231, "step_time": 30.985224181000376 }, { "clip_ratio/high_max": 0.04583333572372794, "clip_ratio/high_mean": 0.012500000768341124, "clip_ratio/low_mean": 0.023177084513008595, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03567708551418036, "entropy": 0.09367383690550923, "epoch": 0.00464, "grad_norm": 0.3370003402233124, "kl": 0.5898754354566336, "learning_rate": 9.99992885967476e-05, "loss": 0.0141, "step": 232, "step_time": 7.848031210001864 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.0020833334419876337, "clip_ratio/low_mean": 0.0031250001629814506, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005208333604969084, "completions/clipped_ratio": 0.0, "completions/max_length": 1034.0, "completions/max_terminated_length": 1034.0, "completions/mean_length": 956.953125, "completions/mean_terminated_length": 956.953125, "completions/min_length": 643.0, "completions/min_terminated_length": 643.0, "entropy": 0.09109799051657319, "epoch": 0.00466, "frac_reward_zero_std": 0.0, "grad_norm": 0.9314411878585815, "kl": 0.6043513156473637, "learning_rate": 9.999928131903557e-05, "loss": -0.0214, "num_tokens": 12250409.0, "reward": 4.949494361877441, "reward_std": 13.414144515991211, "rewards/rollout_reward_func/mean": 4.949494361877441, "rewards/rollout_reward_func/std": 14.449867248535156, "sampling/importance_sampling_ratio/max": 1.7543931007385254, "sampling/importance_sampling_ratio/mean": 1.0081617832183838, "sampling/importance_sampling_ratio/min": 0.7344788908958435, "sampling/sampling_logp_difference/max": 0.40094685554504395, "sampling/sampling_logp_difference/mean": 0.007154828868806362, "step": 233, "step_time": 31.97192203099803 }, { "clip_ratio/high_max": 0.06250000279396772, "clip_ratio/high_mean": 0.018750001094304025, "clip_ratio/low_mean": 0.026041668141260743, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04479166946839541, "entropy": 0.07302908715792, "epoch": 0.00468, "grad_norm": 0.6798368692398071, "kl": 1.063211616128683, "learning_rate": 9.999927400428733e-05, "loss": -0.0247, "step": 234, "step_time": 8.365730943999552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0, "completions/max_length": 1030.0, "completions/max_terminated_length": 1030.0, "completions/mean_length": 939.296875, "completions/mean_terminated_length": 939.296875, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "entropy": 0.08065455732867122, "epoch": 0.0047, "frac_reward_zero_std": 0.0, "grad_norm": 0.6933859586715698, "kl": 0.5536066945642233, "learning_rate": 9.999926665250286e-05, "loss": -0.0262, "num_tokens": 12361673.0, "reward": 6.229083061218262, "reward_std": 13.382326126098633, "rewards/rollout_reward_func/mean": 6.229083061218262, "rewards/rollout_reward_func/std": 14.236706733703613, "sampling/importance_sampling_ratio/max": 1.6296361684799194, "sampling/importance_sampling_ratio/mean": 0.9904996752738953, "sampling/importance_sampling_ratio/min": 0.554724395275116, "sampling/sampling_logp_difference/max": 0.5841927528381348, "sampling/sampling_logp_difference/mean": 0.007334005553275347, "step": 235, "step_time": 30.556930122000267 }, { "clip_ratio/high_max": 0.041666668839752674, "clip_ratio/high_mean": 0.012500000768341124, "clip_ratio/low_mean": 0.023177084629423916, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03567708586342633, "entropy": 0.07718153693713248, "epoch": 0.00472, "grad_norm": 0.373847097158432, "kl": 0.7903371974825859, "learning_rate": 9.999925926368217e-05, "loss": -0.0281, "step": 236, "step_time": 8.405578448001506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010416667209938169, "completions/clipped_ratio": 0.0, "completions/max_length": 1029.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 969.71875, "completions/mean_terminated_length": 969.71875, "completions/min_length": 882.0, "completions/min_terminated_length": 882.0, "entropy": 0.09033584129065275, "epoch": 0.00474, "frac_reward_zero_std": 0.0, "grad_norm": 0.672709584236145, "kl": 0.4807140491902828, "learning_rate": 9.999925183782528e-05, "loss": 0.0206, "num_tokens": 12475023.0, "reward": 7.968780517578125, "reward_std": 14.767425537109375, "rewards/rollout_reward_func/mean": 7.968780517578125, "rewards/rollout_reward_func/std": 15.451577186584473, "sampling/importance_sampling_ratio/max": 1.446393370628357, "sampling/importance_sampling_ratio/mean": 1.0078678131103516, "sampling/importance_sampling_ratio/min": 0.7536318898200989, "sampling/sampling_logp_difference/max": 0.36260342597961426, "sampling/sampling_logp_difference/mean": 0.006248952820897102, "step": 237, "step_time": 30.60275455199826 }, { "clip_ratio/high_max": 0.054166669491678476, "clip_ratio/high_mean": 0.01770833448972553, "clip_ratio/low_mean": 0.010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028125002165324986, "entropy": 0.10236532241106033, "epoch": 0.00476, "grad_norm": 0.1718801110982895, "kl": 0.45644159242510796, "learning_rate": 9.999924437493219e-05, "loss": 0.0137, "step": 238, "step_time": 8.174696675001542 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.0020833334419876337, "clip_ratio/low_mean": 0.0020833334419876337, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "completions/clipped_ratio": 0.0, "completions/max_length": 1038.0, "completions/max_terminated_length": 1038.0, "completions/mean_length": 986.125, "completions/mean_terminated_length": 986.125, "completions/min_length": 910.0, "completions/min_terminated_length": 910.0, "entropy": 0.11793840350583196, "epoch": 0.00478, "frac_reward_zero_std": 0.0, "grad_norm": 0.403202623128891, "kl": 0.5491157062351704, "learning_rate": 9.99992368750029e-05, "loss": 0.0205, "num_tokens": 12589470.0, "reward": 7.7764387130737305, "reward_std": 10.855308532714844, "rewards/rollout_reward_func/mean": 7.776438236236572, "rewards/rollout_reward_func/std": 11.845745086669922, "sampling/importance_sampling_ratio/max": 1.4125927686691284, "sampling/importance_sampling_ratio/mean": 1.0074553489685059, "sampling/importance_sampling_ratio/min": 0.6787428855895996, "sampling/sampling_logp_difference/max": 0.36117464303970337, "sampling/sampling_logp_difference/mean": 0.007153394166380167, "step": 239, "step_time": 31.446214477003195 }, { "clip_ratio/high_max": 0.06614583590999246, "clip_ratio/high_mean": 0.02070312586147338, "clip_ratio/low_mean": 0.018750000977888703, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03945312695577741, "entropy": 0.12702699471265078, "epoch": 0.0048, "grad_norm": 0.28333526849746704, "kl": 0.5486433319747448, "learning_rate": 9.999922933803743e-05, "loss": 0.0157, "step": 240, "step_time": 8.18167577299937 }, { "clip_ratio/high_max": 0.012500000651925802, "clip_ratio/high_mean": 0.004101562546566129, "clip_ratio/low_mean": 0.0015997024602256715, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0057012650067918, "completions/clipped_ratio": 0.0, "completions/max_length": 1028.0, "completions/max_terminated_length": 1028.0, "completions/mean_length": 947.078125, "completions/mean_terminated_length": 947.078125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.15819989750161767, "epoch": 0.00482, "frac_reward_zero_std": 0.0, "grad_norm": 0.5497854948043823, "kl": 0.6653936766088009, "learning_rate": 9.999922176403578e-05, "loss": 0.0274, "num_tokens": 12701387.0, "reward": 4.367884635925293, "reward_std": 14.958491325378418, "rewards/rollout_reward_func/mean": 4.367884635925293, "rewards/rollout_reward_func/std": 15.79384708404541, "sampling/importance_sampling_ratio/max": 1.7386236190795898, "sampling/importance_sampling_ratio/mean": 1.0037915706634521, "sampling/importance_sampling_ratio/min": 1.5100153958014693e-17, "sampling/sampling_logp_difference/max": 32.36700439453125, "sampling/sampling_logp_difference/mean": 0.050702136009931564, "step": 241, "step_time": 30.312135679001585 }, { "clip_ratio/high_max": 0.0713541698642075, "clip_ratio/high_mean": 0.02304687607102096, "clip_ratio/low_mean": 0.019182722782716155, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.042229598737321794, "entropy": 0.1654459210112691, "epoch": 0.00484, "grad_norm": 0.2647717595100403, "kl": 0.6706695519387722, "learning_rate": 9.999921415299796e-05, "loss": 0.0208, "step": 242, "step_time": 8.75198459999956 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.0020833334419876337, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250001629814506, "completions/clipped_ratio": 0.0, "completions/max_length": 1034.0, "completions/max_terminated_length": 1034.0, "completions/mean_length": 975.015625, "completions/mean_terminated_length": 975.015625, "completions/min_length": 829.0, "completions/min_terminated_length": 829.0, "entropy": 0.1656077685765922, "epoch": 0.00486, "frac_reward_zero_std": 0.0, "grad_norm": 0.5207899808883667, "kl": 0.4836368393152952, "learning_rate": 9.999920650492399e-05, "loss": -0.0058, "num_tokens": 12815104.0, "reward": 8.474200248718262, "reward_std": 13.949186325073242, "rewards/rollout_reward_func/mean": 8.474200248718262, "rewards/rollout_reward_func/std": 15.287591934204102, "sampling/importance_sampling_ratio/max": 1.3931519985198975, "sampling/importance_sampling_ratio/mean": 0.9963239431381226, "sampling/importance_sampling_ratio/min": 7.17475301392767e-10, "sampling/sampling_logp_difference/max": 14.08260726928711, "sampling/sampling_logp_difference/mean": 0.0297236330807209, "step": 243, "step_time": 29.95098099400184 }, { "clip_ratio/high_max": 0.04583333572372794, "clip_ratio/high_mean": 0.01562500116415322, "clip_ratio/low_mean": 0.019308037008158863, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.034933038405142725, "entropy": 0.17500293161720037, "epoch": 0.00488, "grad_norm": 0.2083873599767685, "kl": 0.4789597373455763, "learning_rate": 9.999919881981386e-05, "loss": -0.0127, "step": 244, "step_time": 9.603725530998418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0005580357392318547, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005580357392318547, "completions/clipped_ratio": 0.0, "completions/max_length": 1032.0, "completions/max_terminated_length": 1032.0, "completions/mean_length": 960.71875, "completions/mean_terminated_length": 960.71875, "completions/min_length": 870.0, "completions/min_terminated_length": 870.0, "entropy": 0.1825911095365882, "epoch": 0.0049, "frac_reward_zero_std": 0.0, "grad_norm": 0.5889225602149963, "kl": 0.9096355475485325, "learning_rate": 9.999919109766759e-05, "loss": 0.0086, "num_tokens": 12927807.0, "reward": 4.357412338256836, "reward_std": 10.907012939453125, "rewards/rollout_reward_func/mean": 4.357412338256836, "rewards/rollout_reward_func/std": 11.52568531036377, "sampling/importance_sampling_ratio/max": 1.709380030632019, "sampling/importance_sampling_ratio/mean": 1.0086195468902588, "sampling/importance_sampling_ratio/min": 0.7435536980628967, "sampling/sampling_logp_difference/max": 0.24706459045410156, "sampling/sampling_logp_difference/mean": 0.007025801111012697, "step": 245, "step_time": 31.461640581997926 }, { "clip_ratio/high_max": 0.04583333572372794, "clip_ratio/high_mean": 0.014583334210328758, "clip_ratio/low_mean": 0.02460007555782795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.039183410233817995, "entropy": 0.1923405658453703, "epoch": 0.00492, "grad_norm": 0.2593821585178375, "kl": 0.6401933804154396, "learning_rate": 9.999918333848517e-05, "loss": -0.0009, "step": 246, "step_time": 7.859529026000018 }, { "clip_ratio/high_max": 0.004464285913854837, "clip_ratio/high_mean": 0.0011160714784637094, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011160714784637094, "completions/clipped_ratio": 0.0, "completions/max_length": 1032.0, "completions/max_terminated_length": 1032.0, "completions/mean_length": 962.265625, "completions/mean_terminated_length": 962.265625, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "entropy": 0.21913561783730984, "epoch": 0.00494, "frac_reward_zero_std": 0.0, "grad_norm": 0.5455278158187866, "kl": 0.5414405167102814, "learning_rate": 9.999917554226662e-05, "loss": 0.0071, "num_tokens": 13040672.0, "reward": 8.484970092773438, "reward_std": 13.802679061889648, "rewards/rollout_reward_func/mean": 8.484970092773438, "rewards/rollout_reward_func/std": 13.872236251831055, "sampling/importance_sampling_ratio/max": 1.3358259201049805, "sampling/importance_sampling_ratio/mean": 0.9915522336959839, "sampling/importance_sampling_ratio/min": 0.004653692711144686, "sampling/sampling_logp_difference/max": 4.521495819091797, "sampling/sampling_logp_difference/mean": 0.016463816165924072, "step": 247, "step_time": 32.296578387999034 }, { "clip_ratio/high_max": 0.07113095559179783, "clip_ratio/high_mean": 0.025074406410567462, "clip_ratio/low_mean": 0.03020833490882069, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05528274131938815, "entropy": 0.2346064280718565, "epoch": 0.00496, "grad_norm": 0.28457146883010864, "kl": 0.5182771291583776, "learning_rate": 9.999916770901196e-05, "loss": 0.0003, "step": 248, "step_time": 8.322071146998496 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.0020833334419876337, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250001629814506, "completions/clipped_ratio": 0.0, "completions/max_length": 1038.0, "completions/max_terminated_length": 1038.0, "completions/mean_length": 947.125, "completions/mean_terminated_length": 947.125, "completions/min_length": 887.0, "completions/min_terminated_length": 887.0, "entropy": 0.22256971709430218, "epoch": 0.00498, "frac_reward_zero_std": 0.0, "grad_norm": 0.4911050498485565, "kl": 0.5106718242168427, "learning_rate": 9.999915983872117e-05, "loss": 0.0185, "num_tokens": 13152426.0, "reward": 8.859310150146484, "reward_std": 14.251840591430664, "rewards/rollout_reward_func/mean": 8.859310150146484, "rewards/rollout_reward_func/std": 15.995503425598145, "sampling/importance_sampling_ratio/max": 1.1877168416976929, "sampling/importance_sampling_ratio/mean": 1.0040578842163086, "sampling/importance_sampling_ratio/min": 0.7722747921943665, "sampling/sampling_logp_difference/max": 0.2686450481414795, "sampling/sampling_logp_difference/mean": 0.008781258016824722, "step": 249, "step_time": 31.257320647998313 }, { "clip_ratio/high_max": 0.058333335909992456, "clip_ratio/high_mean": 0.019791667815297842, "clip_ratio/low_mean": 0.02544642984867096, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04523809743113816, "entropy": 0.22759789694100618, "epoch": 0.005, "grad_norm": 0.32039502263069153, "kl": 0.5074543356895447, "learning_rate": 9.999915193139428e-05, "loss": 0.0067, "step": 250, "step_time": 8.727711221999925 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020833334419876337, "completions/clipped_ratio": 0.0, "completions/max_length": 1033.0, "completions/max_terminated_length": 1033.0, "completions/mean_length": 972.3125, "completions/mean_terminated_length": 972.3125, "completions/min_length": 885.0, "completions/min_terminated_length": 885.0, "entropy": 0.2637898661196232, "epoch": 0.00502, "frac_reward_zero_std": 0.0, "grad_norm": 0.6115661263465881, "kl": 0.5041398257017136, "learning_rate": 9.999914398703127e-05, "loss": 0.0222, "num_tokens": 13265926.0, "reward": 8.36276626586914, "reward_std": 12.634754180908203, "rewards/rollout_reward_func/mean": 8.36276626586914, "rewards/rollout_reward_func/std": 13.79938793182373, "sampling/importance_sampling_ratio/max": 1.3860008716583252, "sampling/importance_sampling_ratio/mean": 0.9989358186721802, "sampling/importance_sampling_ratio/min": 0.6789365410804749, "sampling/sampling_logp_difference/max": 0.4403858184814453, "sampling/sampling_logp_difference/mean": 0.011640775017440319, "step": 251, "step_time": 30.78323078500125 }, { "clip_ratio/high_max": 0.08333333721384406, "clip_ratio/high_mean": 0.026041668141260743, "clip_ratio/low_mean": 0.024479167768731713, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05052083625923842, "entropy": 0.25762353744357824, "epoch": 0.00504, "grad_norm": 0.33068326115608215, "kl": 0.5315965916961432, "learning_rate": 9.99991360056322e-05, "loss": 0.0108, "step": 252, "step_time": 8.417674423999415 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010416667209938169, "completions/clipped_ratio": 0.0, "completions/max_length": 1039.0, "completions/max_terminated_length": 1039.0, "completions/mean_length": 962.484375, "completions/mean_terminated_length": 962.484375, "completions/min_length": 640.0, "completions/min_terminated_length": 640.0, "entropy": 0.2320685014128685, "epoch": 0.00506, "frac_reward_zero_std": 0.0, "grad_norm": 0.606693685054779, "kl": 0.5243742056190968, "learning_rate": 9.999912798719702e-05, "loss": 0.0154, "num_tokens": 13378838.0, "reward": 5.471320152282715, "reward_std": 16.305179595947266, "rewards/rollout_reward_func/mean": 5.471320629119873, "rewards/rollout_reward_func/std": 16.513338088989258, "sampling/importance_sampling_ratio/max": 1.3763768672943115, "sampling/importance_sampling_ratio/mean": 0.9975243806838989, "sampling/importance_sampling_ratio/min": 0.706875205039978, "sampling/sampling_logp_difference/max": 0.28901320695877075, "sampling/sampling_logp_difference/mean": 0.009924216195940971, "step": 253, "step_time": 31.570553302000008 }, { "clip_ratio/high_max": 0.058333336375653744, "clip_ratio/high_mean": 0.018750001094304025, "clip_ratio/low_mean": 0.026041667792014778, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.044791669119149446, "entropy": 0.21940706949681044, "epoch": 0.00508, "grad_norm": 0.3425885736942291, "kl": 0.6501965597271919, "learning_rate": 9.999911993172577e-05, "loss": 0.0077, "step": 254, "step_time": 8.264053588000024 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.0020833334419876337, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250001629814506, "completions/clipped_ratio": 0.0, "completions/max_length": 1031.0, "completions/max_terminated_length": 1031.0, "completions/mean_length": 953.46875, "completions/mean_terminated_length": 953.46875, "completions/min_length": 900.0, "completions/min_terminated_length": 900.0, "entropy": 0.21140480507165194, "epoch": 0.0051, "frac_reward_zero_std": 0.0, "grad_norm": 0.5503631234169006, "kl": 0.5327232480049133, "learning_rate": 9.999911183921846e-05, "loss": 0.0038, "num_tokens": 13491042.0, "reward": 8.259774208068848, "reward_std": 10.848722457885742, "rewards/rollout_reward_func/mean": 8.259774208068848, "rewards/rollout_reward_func/std": 11.306256294250488, "sampling/importance_sampling_ratio/max": 1.4545994997024536, "sampling/importance_sampling_ratio/mean": 0.9899890422821045, "sampling/importance_sampling_ratio/min": 0.6251944303512573, "sampling/sampling_logp_difference/max": 0.42076706886291504, "sampling/sampling_logp_difference/mean": 0.01122802309691906, "step": 255, "step_time": 31.768314117000045 }, { "clip_ratio/high_max": 0.05476190708577633, "clip_ratio/high_mean": 0.022023811121471226, "clip_ratio/low_mean": 0.022916668327525258, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04494047968182713, "entropy": 0.21271847933530807, "epoch": 0.00512, "grad_norm": 1.165165662765503, "kl": 0.5640581175684929, "learning_rate": 9.999910370967507e-05, "loss": -0.0008, "step": 256, "step_time": 8.655397303001337 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0008680555620230734, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008680555620230734, "completions/clipped_ratio": 0.0, "completions/max_length": 1343.0, "completions/max_terminated_length": 1343.0, "completions/mean_length": 1241.921875, "completions/mean_terminated_length": 1241.921875, "completions/min_length": 1084.0, "completions/min_terminated_length": 1084.0, "entropy": 0.2426544101908803, "epoch": 0.00514, "frac_reward_zero_std": 0.0, "grad_norm": 0.8530539870262146, "kl": 0.5651203468441963, "learning_rate": 9.999909554309565e-05, "loss": 0.0047, "num_tokens": 13621717.0, "reward": 4.329623699188232, "reward_std": 14.394445419311523, "rewards/rollout_reward_func/mean": 4.329623222351074, "rewards/rollout_reward_func/std": 14.93822193145752, "sampling/importance_sampling_ratio/max": 1.2321135997772217, "sampling/importance_sampling_ratio/mean": 0.9581992626190186, "sampling/importance_sampling_ratio/min": 0.2660026550292969, "sampling/sampling_logp_difference/max": 1.2229857444763184, "sampling/sampling_logp_difference/mean": 0.013221165165305138, "step": 257, "step_time": 37.82865672800108 }, { "clip_ratio/high_max": 0.06597222317941487, "clip_ratio/high_mean": 0.027732091082725674, "clip_ratio/low_mean": 0.038194445020053536, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.06592653610277921, "entropy": 0.2270987592637539, "epoch": 0.00516, "grad_norm": 0.43574994802474976, "kl": 0.6589642316102982, "learning_rate": 9.999908733948017e-05, "loss": -0.0093, "step": 258, "step_time": 10.512357729995529 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0008680555620230734, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041666860692203, "completions/clipped_ratio": 0.0, "completions/max_length": 1366.0, "completions/max_terminated_length": 1366.0, "completions/mean_length": 1250.96875, "completions/mean_terminated_length": 1250.96875, "completions/min_length": 735.0, "completions/min_terminated_length": 735.0, "entropy": 0.213697855360806, "epoch": 0.00518, "frac_reward_zero_std": 0.0, "grad_norm": 0.6505308747291565, "kl": 0.5941296126693487, "learning_rate": 9.999907909882866e-05, "loss": -0.0204, "num_tokens": 13753091.0, "reward": 6.001728057861328, "reward_std": 15.827871322631836, "rewards/rollout_reward_func/mean": 6.001728057861328, "rewards/rollout_reward_func/std": 16.02460479736328, "sampling/importance_sampling_ratio/max": 1.3350346088409424, "sampling/importance_sampling_ratio/mean": 0.9674654006958008, "sampling/importance_sampling_ratio/min": 0.5470981001853943, "sampling/sampling_logp_difference/max": 0.5512038469314575, "sampling/sampling_logp_difference/mean": 0.011880462057888508, "step": 259, "step_time": 38.35317581399886 }, { "clip_ratio/high_max": 0.05208333395421505, "clip_ratio/high_mean": 0.015625000174622983, "clip_ratio/low_mean": 0.04037990275537595, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.056004903570283204, "entropy": 0.1957033844664693, "epoch": 0.0052, "grad_norm": 0.45430490374565125, "kl": 0.781089099124074, "learning_rate": 9.999907082114112e-05, "loss": -0.0313, "step": 260, "step_time": 9.020615039000404 }, { "clip_ratio/high_max": 0.016812865156680346, "clip_ratio/high_mean": 0.004203216289170086, "clip_ratio/low_mean": 0.0026041666860692203, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006807382975239307, "completions/clipped_ratio": 0.0, "completions/max_length": 1335.0, "completions/max_terminated_length": 1335.0, "completions/mean_length": 1213.078125, "completions/mean_terminated_length": 1213.078125, "completions/min_length": 626.0, "completions/min_terminated_length": 626.0, "entropy": 0.18847014661878347, "epoch": 0.00522, "frac_reward_zero_std": 0.0, "grad_norm": 0.6835371851921082, "kl": 0.542348800227046, "learning_rate": 9.999906250641758e-05, "loss": 0.0145, "num_tokens": 13881882.0, "reward": 5.304417610168457, "reward_std": 14.105676651000977, "rewards/rollout_reward_func/mean": 5.304417133331299, "rewards/rollout_reward_func/std": 14.791868209838867, "sampling/importance_sampling_ratio/max": 1.3720983266830444, "sampling/importance_sampling_ratio/mean": 0.9722362756729126, "sampling/importance_sampling_ratio/min": 8.055465437370129e-20, "sampling/sampling_logp_difference/max": 38.39814376831055, "sampling/sampling_logp_difference/mean": 0.05052501708269119, "step": 261, "step_time": 39.094199752998065 }, { "clip_ratio/high_max": 0.05559855583123863, "clip_ratio/high_mean": 0.018239916767925024, "clip_ratio/low_mean": 0.02690972271375358, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.04514963936526328, "entropy": 0.1835553077980876, "epoch": 0.00524, "grad_norm": 0.266696572303772, "kl": 0.5767297390848398, "learning_rate": 9.9999054154658e-05, "loss": 0.0025, "step": 262, "step_time": 9.272368914001163 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.0026041666860692203, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041666860692203, "completions/clipped_ratio": 0.0, "completions/max_length": 1344.0, "completions/max_terminated_length": 1344.0, "completions/mean_length": 1241.4375, "completions/mean_terminated_length": 1241.4375, "completions/min_length": 902.0, "completions/min_terminated_length": 902.0, "entropy": 0.1920191366225481, "epoch": 0.00526, "frac_reward_zero_std": 0.0, "grad_norm": 0.8278535604476929, "kl": 0.5627734903246164, "learning_rate": 9.999904576586242e-05, "loss": -0.0123, "num_tokens": 14012567.0, "reward": 3.131194591522217, "reward_std": 12.649508476257324, "rewards/rollout_reward_func/mean": 3.131195068359375, "rewards/rollout_reward_func/std": 12.768006324768066, "sampling/importance_sampling_ratio/max": 1.5249695777893066, "sampling/importance_sampling_ratio/mean": 1.0084636211395264, "sampling/importance_sampling_ratio/min": 0.6291685700416565, "sampling/sampling_logp_difference/max": 0.48067259788513184, "sampling/sampling_logp_difference/mean": 0.011031190864741802, "step": 263, "step_time": 37.94770654900185 }, { "clip_ratio/high_max": 0.06597222364507616, "clip_ratio/high_mean": 0.02170138922519982, "clip_ratio/low_mean": 0.027777778508607298, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04947916854871437, "entropy": 0.1821621311828494, "epoch": 0.00528, "grad_norm": 0.30830591917037964, "kl": 0.6204142663627863, "learning_rate": 9.999903734003084e-05, "loss": -0.0238, "step": 264, "step_time": 9.788196208000045 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.0026041666860692203, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004340277810115367, "completions/clipped_ratio": 0.0, "completions/max_length": 1354.0, "completions/max_terminated_length": 1354.0, "completions/mean_length": 1215.875, "completions/mean_terminated_length": 1215.875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.18026093766093254, "epoch": 0.0053, "frac_reward_zero_std": 0.0, "grad_norm": 0.7985097765922546, "kl": 0.5250384621322155, "learning_rate": 9.999902887716329e-05, "loss": -0.0455, "num_tokens": 14141610.0, "reward": 2.849423408508301, "reward_std": 12.35162353515625, "rewards/rollout_reward_func/mean": 2.849423408508301, "rewards/rollout_reward_func/std": 12.910691261291504, "sampling/importance_sampling_ratio/max": 1.7354861497879028, "sampling/importance_sampling_ratio/mean": 0.9913997650146484, "sampling/importance_sampling_ratio/min": 0.53452068567276, "sampling/sampling_logp_difference/max": 0.5425161123275757, "sampling/sampling_logp_difference/mean": 0.012209449894726276, "step": 265, "step_time": 38.11352587100009 }, { "clip_ratio/high_max": 0.06311274622566998, "clip_ratio/high_mean": 0.01925040880450979, "clip_ratio/low_mean": 0.030831291631329805, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.050081700494047254, "entropy": 0.1743807177990675, "epoch": 0.00532, "grad_norm": 0.9081993103027344, "kl": 1.4623642209917307, "learning_rate": 9.999902037725976e-05, "loss": -0.0483, "step": 266, "step_time": 9.7086338709978 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.0026041666860692203, "clip_ratio/low_mean": 0.005259395460598171, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007863562146667391, "completions/clipped_ratio": 0.0, "completions/max_length": 1349.0, "completions/max_terminated_length": 1349.0, "completions/mean_length": 1235.953125, "completions/mean_terminated_length": 1235.953125, "completions/min_length": 769.0, "completions/min_terminated_length": 769.0, "entropy": 0.1784328306093812, "epoch": 0.00534, "frac_reward_zero_std": 0.0, "grad_norm": 0.8215274810791016, "kl": 0.49557984061539173, "learning_rate": 9.999901184032026e-05, "loss": 0.0099, "num_tokens": 14271910.0, "reward": 6.570675849914551, "reward_std": 11.428293228149414, "rewards/rollout_reward_func/mean": 6.570675849914551, "rewards/rollout_reward_func/std": 11.919609069824219, "sampling/importance_sampling_ratio/max": 1.5103188753128052, "sampling/importance_sampling_ratio/mean": 1.018727421760559, "sampling/importance_sampling_ratio/min": 1.0843930725359919e-15, "sampling/sampling_logp_difference/max": 27.71844482421875, "sampling/sampling_logp_difference/mean": 0.04110131412744522, "step": 267, "step_time": 40.345479872002215 }, { "clip_ratio/high_max": 0.08261846494860947, "clip_ratio/high_mean": 0.025862949551083148, "clip_ratio/low_mean": 0.025904605397954583, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05176755564752966, "entropy": 0.18014734331518412, "epoch": 0.00536, "grad_norm": 0.3775624632835388, "kl": 0.5100179798901081, "learning_rate": 9.99990032663448e-05, "loss": -0.0005, "step": 268, "step_time": 8.8038962849987 }, { "clip_ratio/high_max": 0.010620915098115802, "clip_ratio/high_mean": 0.0026552287745289505, "clip_ratio/low_mean": 0.001787173212505877, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0044424019870348275, "completions/clipped_ratio": 0.0, "completions/max_length": 1348.0, "completions/max_terminated_length": 1348.0, "completions/mean_length": 1209.5625, "completions/mean_terminated_length": 1209.5625, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "entropy": 0.19260858092457056, "epoch": 0.00538, "frac_reward_zero_std": 0.0, "grad_norm": 0.9969385266304016, "kl": 0.48063670098781586, "learning_rate": 9.999899465533337e-05, "loss": -0.0145, "num_tokens": 14400520.0, "reward": 4.870312690734863, "reward_std": 12.755669593811035, "rewards/rollout_reward_func/mean": 4.870312690734863, "rewards/rollout_reward_func/std": 12.786203384399414, "sampling/importance_sampling_ratio/max": 1.391904354095459, "sampling/importance_sampling_ratio/mean": 0.9837595224380493, "sampling/importance_sampling_ratio/min": 0.5111071467399597, "sampling/sampling_logp_difference/max": 0.6141395568847656, "sampling/sampling_logp_difference/mean": 0.012509889900684357, "step": 269, "step_time": 39.08969898599935 }, { "clip_ratio/high_max": 0.07679738639853895, "clip_ratio/high_mean": 0.025275735883042216, "clip_ratio/low_mean": 0.03416053985711187, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.05943627591477707, "entropy": 0.190420214086771, "epoch": 0.0054, "grad_norm": 2.133584499359131, "kl": 1.2963667679578066, "learning_rate": 9.999898600728599e-05, "loss": -0.0154, "step": 270, "step_time": 10.067689283000618 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0008680555620230734, "clip_ratio/low_mean": 0.0008680555620230734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0, "completions/max_length": 1356.0, "completions/max_terminated_length": 1356.0, "completions/mean_length": 1241.09375, "completions/mean_terminated_length": 1241.09375, "completions/min_length": 641.0, "completions/min_terminated_length": 641.0, "entropy": 0.17894164565950632, "epoch": 0.00542, "frac_reward_zero_std": 0.0, "grad_norm": 0.759871780872345, "kl": 0.4996040966361761, "learning_rate": 9.999897732220269e-05, "loss": -0.0452, "num_tokens": 14531215.0, "reward": 6.598260879516602, "reward_std": 12.557649612426758, "rewards/rollout_reward_func/mean": 6.598260402679443, "rewards/rollout_reward_func/std": 12.858835220336914, "sampling/importance_sampling_ratio/max": 1.6857366561889648, "sampling/importance_sampling_ratio/mean": 1.0335665941238403, "sampling/importance_sampling_ratio/min": 0.555221676826477, "sampling/sampling_logp_difference/max": 0.583274245262146, "sampling/sampling_logp_difference/mean": 0.010182222351431847, "step": 271, "step_time": 38.82532325500051 }, { "clip_ratio/high_max": 0.03513071942143142, "clip_ratio/high_mean": 0.009650735417380929, "clip_ratio/low_mean": 0.020450367941521108, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.030101103708148003, "entropy": 0.17972450237721205, "epoch": 0.00544, "grad_norm": 0.39173194766044617, "kl": 0.5205750651657581, "learning_rate": 9.999896860008347e-05, "loss": -0.052, "step": 272, "step_time": 10.316601943999558 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0026041666860692203, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004340277810115367, "completions/clipped_ratio": 0.0, "completions/max_length": 1340.0, "completions/max_terminated_length": 1340.0, "completions/mean_length": 1249.921875, "completions/mean_terminated_length": 1249.921875, "completions/min_length": 663.0, "completions/min_terminated_length": 663.0, "entropy": 0.18446057755500078, "epoch": 0.00546, "frac_reward_zero_std": 0.0, "grad_norm": 0.5517368912696838, "kl": 0.5441189091652632, "learning_rate": 9.999895984092831e-05, "loss": 0.0131, "num_tokens": 14662474.0, "reward": 5.8217644691467285, "reward_std": 11.078777313232422, "rewards/rollout_reward_func/mean": 5.8217644691467285, "rewards/rollout_reward_func/std": 11.748489379882812, "sampling/importance_sampling_ratio/max": 2.5323374271392822, "sampling/importance_sampling_ratio/mean": 0.9794174432754517, "sampling/importance_sampling_ratio/min": 1.1464784742225287e-13, "sampling/sampling_logp_difference/max": 23.46839714050293, "sampling/sampling_logp_difference/mean": 0.03605649992823601, "step": 273, "step_time": 38.64782374399874 }, { "clip_ratio/high_max": 0.041483918437734246, "clip_ratio/high_mean": 0.01210709079168737, "clip_ratio/low_mean": 0.049096201779320836, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06120329239638522, "entropy": 0.17116414476186037, "epoch": 0.00548, "grad_norm": 0.3144800662994385, "kl": 0.6644695494323969, "learning_rate": 9.999895104473725e-05, "loss": 0.0043, "step": 274, "step_time": 8.841590996999912 }, { "clip_ratio/high_max": 0.01756535959430039, "clip_ratio/high_mean": 0.006127451022621244, "clip_ratio/low_mean": 0.0026552287745289505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008782679797150195, "completions/clipped_ratio": 0.0, "completions/max_length": 1370.0, "completions/max_terminated_length": 1370.0, "completions/mean_length": 1246.375, "completions/mean_terminated_length": 1246.375, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.17010682448744774, "epoch": 0.0055, "frac_reward_zero_std": 0.0, "grad_norm": 0.5666040182113647, "kl": 0.5793958213180304, "learning_rate": 9.99989422115103e-05, "loss": 0.0183, "num_tokens": 14793449.0, "reward": 2.9848508834838867, "reward_std": 12.649776458740234, "rewards/rollout_reward_func/mean": 2.984851121902466, "rewards/rollout_reward_func/std": 13.012813568115234, "sampling/importance_sampling_ratio/max": 1.5569446086883545, "sampling/importance_sampling_ratio/mean": 0.9847633838653564, "sampling/importance_sampling_ratio/min": 0.6424822807312012, "sampling/sampling_logp_difference/max": 0.4623146057128906, "sampling/sampling_logp_difference/mean": 0.00930742733180523, "step": 275, "step_time": 39.34428709400072 }, { "clip_ratio/high_max": 0.054125817492604256, "clip_ratio/high_mean": 0.016135621059220284, "clip_ratio/low_mean": 0.0301164222182706, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046252043335698545, "entropy": 0.16701707802712917, "epoch": 0.00552, "grad_norm": 0.6509947180747986, "kl": 0.6457913182675838, "learning_rate": 9.999893334124744e-05, "loss": 0.0127, "step": 276, "step_time": 9.492375128998901 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.0026041666860692203, "clip_ratio/low_mean": 0.0008680555620230734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 1342.0, "completions/max_terminated_length": 1342.0, "completions/mean_length": 1211.75, "completions/mean_terminated_length": 1211.75, "completions/min_length": 786.0, "completions/min_terminated_length": 786.0, "entropy": 0.16395934531465173, "epoch": 0.00554, "frac_reward_zero_std": 0.0, "grad_norm": 0.6039373874664307, "kl": 0.6709765158593655, "learning_rate": 9.999892443394869e-05, "loss": -0.0199, "num_tokens": 14922202.0, "reward": 9.567506790161133, "reward_std": 12.886774063110352, "rewards/rollout_reward_func/mean": 9.567506790161133, "rewards/rollout_reward_func/std": 14.272911071777344, "sampling/importance_sampling_ratio/max": 1.3702117204666138, "sampling/importance_sampling_ratio/mean": 0.9926258325576782, "sampling/importance_sampling_ratio/min": 2.4751771812714374e-13, "sampling/sampling_logp_difference/max": 22.218292236328125, "sampling/sampling_logp_difference/mean": 0.03479118272662163, "step": 277, "step_time": 38.66672314299831 }, { "clip_ratio/high_max": 0.056832108180969954, "clip_ratio/high_mean": 0.015944138227496296, "clip_ratio/low_mean": 0.026416234264615923, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04236037301598117, "entropy": 0.16752836294472218, "epoch": 0.00556, "grad_norm": 0.2952568829059601, "kl": 0.7055745627731085, "learning_rate": 9.999891548961409e-05, "loss": -0.0283, "step": 278, "step_time": 10.249573535998024 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0008680555620230734, "clip_ratio/low_mean": 0.0026552287745289505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003523284336552024, "completions/clipped_ratio": 0.0, "completions/max_length": 1344.0, "completions/max_terminated_length": 1344.0, "completions/mean_length": 1206.921875, "completions/mean_terminated_length": 1206.921875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.16296257637441158, "epoch": 0.00558, "frac_reward_zero_std": 0.0, "grad_norm": 0.6338427662849426, "kl": 0.5948988310992718, "learning_rate": 9.99989065082436e-05, "loss": -0.0193, "num_tokens": 15050626.0, "reward": 6.859679222106934, "reward_std": 13.335336685180664, "rewards/rollout_reward_func/mean": 6.859679222106934, "rewards/rollout_reward_func/std": 13.806427955627441, "sampling/importance_sampling_ratio/max": 1.73651123046875, "sampling/importance_sampling_ratio/mean": 1.0235867500305176, "sampling/importance_sampling_ratio/min": 0.6915313005447388, "sampling/sampling_logp_difference/max": 0.3299523591995239, "sampling/sampling_logp_difference/mean": 0.00790142547339201, "step": 279, "step_time": 37.97661670400066 }, { "clip_ratio/high_max": 0.04227941203862429, "clip_ratio/high_mean": 0.01235702628036961, "clip_ratio/low_mean": 0.023852379759773612, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.036209406214766204, "entropy": 0.17103052884340286, "epoch": 0.0056, "grad_norm": 0.310857355594635, "kl": 0.6127588897943497, "learning_rate": 9.999889748983726e-05, "loss": -0.0289, "step": 280, "step_time": 9.740956478998669 }, { "clip_ratio/high_max": 0.007148692850023508, "clip_ratio/high_mean": 0.001787173212505877, "clip_ratio/low_mean": 0.0035807291860692203, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005367902398575097, "completions/clipped_ratio": 0.0, "completions/max_length": 1347.0, "completions/max_terminated_length": 1347.0, "completions/mean_length": 1242.203125, "completions/mean_terminated_length": 1242.203125, "completions/min_length": 1071.0, "completions/min_terminated_length": 1071.0, "entropy": 0.18003392685204744, "epoch": 0.00562, "frac_reward_zero_std": 0.0, "grad_norm": 0.5553699731826782, "kl": 0.5920679531991482, "learning_rate": 9.999888843439508e-05, "loss": 0.0235, "num_tokens": 15181392.0, "reward": 3.9797325134277344, "reward_std": 11.883782386779785, "rewards/rollout_reward_func/mean": 3.9797325134277344, "rewards/rollout_reward_func/std": 12.557183265686035, "sampling/importance_sampling_ratio/max": 2.3216447830200195, "sampling/importance_sampling_ratio/mean": 1.0259038209915161, "sampling/importance_sampling_ratio/min": 0.37790024280548096, "sampling/sampling_logp_difference/max": 1.4781968593597412, "sampling/sampling_logp_difference/mean": 0.011046608909964561, "step": 281, "step_time": 39.572295284000575 }, { "clip_ratio/high_max": 0.04312193673104048, "clip_ratio/high_mean": 0.01343571295728907, "clip_ratio/low_mean": 0.023201337666250765, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03663705079816282, "entropy": 0.18677309434860945, "epoch": 0.00564, "grad_norm": 0.5401102304458618, "kl": 0.5960894413292408, "learning_rate": 9.999887934191704e-05, "loss": 0.0166, "step": 282, "step_time": 9.04144253800041 }, { "clip_ratio/high_max": 0.007582720601931214, "clip_ratio/high_mean": 0.0018956801504828036, "clip_ratio/low_mean": 0.0009191176504828036, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002814797800965607, "completions/clipped_ratio": 0.0, "completions/max_length": 1338.0, "completions/max_terminated_length": 1338.0, "completions/mean_length": 1215.859375, "completions/mean_terminated_length": 1215.859375, "completions/min_length": 1067.0, "completions/min_terminated_length": 1067.0, "entropy": 0.18976869899779558, "epoch": 0.00566, "frac_reward_zero_std": 0.0, "grad_norm": 0.6778466105461121, "kl": 0.8045855388045311, "learning_rate": 9.99988702124032e-05, "loss": 0.0435, "num_tokens": 15310402.0, "reward": 9.532302856445312, "reward_std": 13.447786331176758, "rewards/rollout_reward_func/mean": 9.532302856445312, "rewards/rollout_reward_func/std": 14.893537521362305, "sampling/importance_sampling_ratio/max": 1.5014206171035767, "sampling/importance_sampling_ratio/mean": 0.984979510307312, "sampling/importance_sampling_ratio/min": 0.5918754935264587, "sampling/sampling_logp_difference/max": 0.49157631397247314, "sampling/sampling_logp_difference/mean": 0.009644631296396255, "step": 283, "step_time": 37.75929147200077 }, { "clip_ratio/high_max": 0.04337724717333913, "clip_ratio/high_mean": 0.017022824671585113, "clip_ratio/low_mean": 0.019767412508372217, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.036790237470995635, "entropy": 0.19258240424096584, "epoch": 0.00568, "grad_norm": 0.23295848071575165, "kl": 0.7846251800656319, "learning_rate": 9.999886104585351e-05, "loss": 0.0377, "step": 284, "step_time": 9.718582901003174 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0008680555620230734, "clip_ratio/low_mean": 0.0036764706601388752, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004544526163954288, "completions/clipped_ratio": 0.0, "completions/max_length": 1352.0, "completions/max_terminated_length": 1352.0, "completions/mean_length": 1182.8125, "completions/mean_terminated_length": 1182.8125, "completions/min_length": 644.0, "completions/min_terminated_length": 644.0, "entropy": 0.21535112708806992, "epoch": 0.0057, "frac_reward_zero_std": 0.0, "grad_norm": 0.8615608215332031, "kl": 0.8677664548158646, "learning_rate": 9.999885184226802e-05, "loss": -0.0133, "num_tokens": 15437277.0, "reward": 4.226072788238525, "reward_std": 8.478089332580566, "rewards/rollout_reward_func/mean": 4.226072788238525, "rewards/rollout_reward_func/std": 9.509638786315918, "sampling/importance_sampling_ratio/max": 1.921204924583435, "sampling/importance_sampling_ratio/mean": 0.9768272638320923, "sampling/importance_sampling_ratio/min": 0.7244350910186768, "sampling/sampling_logp_difference/max": 0.4400520324707031, "sampling/sampling_logp_difference/mean": 0.010012689046561718, "step": 285, "step_time": 36.26275280400023 }, { "clip_ratio/high_max": 0.04353043343871832, "clip_ratio/high_mean": 0.01611264329403639, "clip_ratio/low_mean": 0.029692606767639518, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.045805250061675906, "entropy": 0.21773250121623278, "epoch": 0.00572, "grad_norm": 0.48840415477752686, "kl": 1.049767030403018, "learning_rate": 9.999884260164671e-05, "loss": -0.0254, "step": 286, "step_time": 10.79683871799898 }, { "clip_ratio/high_max": 0.01797385630197823, "clip_ratio/high_mean": 0.004493464075494558, "clip_ratio/low_mean": 0.003523284336552024, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008016748412046582, "completions/clipped_ratio": 0.0, "completions/max_length": 1349.0, "completions/max_terminated_length": 1349.0, "completions/mean_length": 1224.109375, "completions/mean_terminated_length": 1224.109375, "completions/min_length": 697.0, "completions/min_terminated_length": 697.0, "entropy": 0.22404625453054905, "epoch": 0.00574, "frac_reward_zero_std": 0.0, "grad_norm": 1.1797651052474976, "kl": 0.8139622360467911, "learning_rate": 9.999883332398962e-05, "loss": -0.0606, "num_tokens": 15566944.0, "reward": 5.630161762237549, "reward_std": 12.35897445678711, "rewards/rollout_reward_func/mean": 5.630161762237549, "rewards/rollout_reward_func/std": 13.792024612426758, "sampling/importance_sampling_ratio/max": 2.566322088241577, "sampling/importance_sampling_ratio/mean": 0.9811519384384155, "sampling/importance_sampling_ratio/min": 0.388390451669693, "sampling/sampling_logp_difference/max": 1.9641337394714355, "sampling/sampling_logp_difference/mean": 0.015188181772828102, "step": 287, "step_time": 36.44446017899918 }, { "clip_ratio/high_max": 0.053717320784926414, "clip_ratio/high_mean": 0.01695261470740661, "clip_ratio/low_mean": 0.04400914063444361, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.060961755632888526, "entropy": 0.2100704526528716, "epoch": 0.00576, "grad_norm": 1.2667500972747803, "kl": 2.074540827423334, "learning_rate": 9.999882400929674e-05, "loss": -0.057, "step": 288, "step_time": 8.99862431100064 }, { "clip_ratio/high_max": 0.010850694496184587, "clip_ratio/high_mean": 0.0035807291860692203, "clip_ratio/low_mean": 0.001787173212505877, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005367902398575097, "completions/clipped_ratio": 0.0, "completions/max_length": 1377.0, "completions/max_terminated_length": 1377.0, "completions/mean_length": 1206.5, "completions/mean_terminated_length": 1206.5, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.21379029098898172, "epoch": 0.00578, "frac_reward_zero_std": 0.0, "grad_norm": 0.9330686926841736, "kl": 0.6734739989042282, "learning_rate": 9.999881465756809e-05, "loss": -0.0075, "num_tokens": 15695392.0, "reward": 5.131735801696777, "reward_std": 13.59388256072998, "rewards/rollout_reward_func/mean": 5.1317362785339355, "rewards/rollout_reward_func/std": 15.563157081604004, "sampling/importance_sampling_ratio/max": 1.5120335817337036, "sampling/importance_sampling_ratio/mean": 0.9915132522583008, "sampling/importance_sampling_ratio/min": 0.7389032244682312, "sampling/sampling_logp_difference/max": 0.3448265790939331, "sampling/sampling_logp_difference/mean": 0.010639440268278122, "step": 289, "step_time": 38.13999567300107 }, { "clip_ratio/high_max": 0.052309082355350256, "clip_ratio/high_mean": 0.015732499363366514, "clip_ratio/low_mean": 0.04032860859297216, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.056061108596622944, "entropy": 0.20592329651117325, "epoch": 0.0058, "grad_norm": 1.7719804048538208, "kl": 2.317722400650382, "learning_rate": 9.999880526880367e-05, "loss": 0.0124, "step": 290, "step_time": 9.157000397000957 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0008680555620230734, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041666860692203, "completions/clipped_ratio": 0.015625, "completions/max_length": 1351.0, "completions/max_terminated_length": 1351.0, "completions/mean_length": 1227.28125, "completions/mean_terminated_length": 1227.2381591796875, "completions/min_length": 1085.0, "completions/min_terminated_length": 1085.0, "entropy": 0.2259034337475896, "epoch": 0.00582, "frac_reward_zero_std": 0.0, "grad_norm": 0.5913375616073608, "kl": 0.5888024400919676, "learning_rate": 9.999879584300349e-05, "loss": -0.0201, "num_tokens": 15825170.0, "reward": 4.390281677246094, "reward_std": 13.705522537231445, "rewards/rollout_reward_func/mean": 4.390281677246094, "rewards/rollout_reward_func/std": 13.848193168640137, "sampling/importance_sampling_ratio/max": 1.4006119966506958, "sampling/importance_sampling_ratio/mean": 0.9845026731491089, "sampling/importance_sampling_ratio/min": 0.57123863697052, "sampling/sampling_logp_difference/max": 0.4590674638748169, "sampling/sampling_logp_difference/mean": 0.010318214073777199, "step": 291, "step_time": 38.378031336002095 }, { "clip_ratio/high_max": 0.05575980432331562, "clip_ratio/high_mean": 0.018331290979404002, "clip_ratio/low_mean": 0.025366254791151732, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0436975461198017, "entropy": 0.25106900557875633, "epoch": 0.00584, "grad_norm": 0.3503086268901825, "kl": 0.6040437389165163, "learning_rate": 9.999878638016755e-05, "loss": -0.0261, "step": 292, "step_time": 10.2765881680034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1355.0, "completions/max_terminated_length": 1355.0, "completions/mean_length": 1203.28125, "completions/mean_terminated_length": 1203.28125, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.255804393440485, "epoch": 0.00586, "frac_reward_zero_std": 0.0, "grad_norm": 0.733392059803009, "kl": 0.7084890268743038, "learning_rate": 9.99987768802959e-05, "loss": -0.0295, "num_tokens": 15953483.0, "reward": 4.652621269226074, "reward_std": 13.876627922058105, "rewards/rollout_reward_func/mean": 4.652621269226074, "rewards/rollout_reward_func/std": 14.444734573364258, "sampling/importance_sampling_ratio/max": 1.5388283729553223, "sampling/importance_sampling_ratio/mean": 0.9943192005157471, "sampling/importance_sampling_ratio/min": 0.66633540391922, "sampling/sampling_logp_difference/max": 0.3228440284729004, "sampling/sampling_logp_difference/mean": 0.009999222122132778, "step": 293, "step_time": 36.11094247699839 }, { "clip_ratio/high_max": 0.07255117082968354, "clip_ratio/high_mean": 0.020741959451697767, "clip_ratio/low_mean": 0.026092729007359594, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0468346884008497, "entropy": 0.2808321360498667, "epoch": 0.00588, "grad_norm": 0.3306209444999695, "kl": 0.6640463471412659, "learning_rate": 9.99987673433885e-05, "loss": -0.0368, "step": 294, "step_time": 9.500305491999825 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0008680555620230734, "clip_ratio/low_mean": 0.0026041666860692203, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 1353.0, "completions/max_terminated_length": 1353.0, "completions/mean_length": 1203.546875, "completions/mean_terminated_length": 1203.546875, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "entropy": 0.29141946602612734, "epoch": 0.0059, "frac_reward_zero_std": 0.0, "grad_norm": 0.7943304777145386, "kl": 0.6770852543413639, "learning_rate": 9.999875776944538e-05, "loss": -0.0049, "num_tokens": 16081715.0, "reward": 2.4234745502471924, "reward_std": 10.446115493774414, "rewards/rollout_reward_func/mean": 2.4234743118286133, "rewards/rollout_reward_func/std": 11.454586029052734, "sampling/importance_sampling_ratio/max": 1.456477165222168, "sampling/importance_sampling_ratio/mean": 0.9954730272293091, "sampling/importance_sampling_ratio/min": 0.6373972296714783, "sampling/sampling_logp_difference/max": 0.3991684913635254, "sampling/sampling_logp_difference/mean": 0.010583357885479927, "step": 295, "step_time": 37.65815602800012 }, { "clip_ratio/high_max": 0.049019608180969954, "clip_ratio/high_mean": 0.022671569080557674, "clip_ratio/low_mean": 0.029513889458030462, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05218545877141878, "entropy": 0.2855409812182188, "epoch": 0.00592, "grad_norm": 0.3455994129180908, "kl": 0.6785521320998669, "learning_rate": 9.999874815846655e-05, "loss": -0.0152, "step": 296, "step_time": 8.978216180002164 }, { "clip_ratio/high_max": 0.010620915098115802, "clip_ratio/high_mean": 0.003523284336552024, "clip_ratio/low_mean": 0.0008680555620230734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004391339898575097, "completions/clipped_ratio": 0.0, "completions/max_length": 1353.0, "completions/max_terminated_length": 1353.0, "completions/mean_length": 1205.796875, "completions/mean_terminated_length": 1205.796875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.3199264472350478, "epoch": 0.00594, "frac_reward_zero_std": 0.0, "grad_norm": 0.5850762128829956, "kl": 0.8113113101571798, "learning_rate": 9.999873851045201e-05, "loss": -0.0035, "num_tokens": 16210207.0, "reward": 2.8288328647613525, "reward_std": 14.778526306152344, "rewards/rollout_reward_func/mean": 2.8288326263427734, "rewards/rollout_reward_func/std": 16.02610969543457, "sampling/importance_sampling_ratio/max": 1.4778478145599365, "sampling/importance_sampling_ratio/mean": 1.0175740718841553, "sampling/importance_sampling_ratio/min": 0.6004241108894348, "sampling/sampling_logp_difference/max": 0.35140562057495117, "sampling/sampling_logp_difference/mean": 0.012288028374314308, "step": 297, "step_time": 36.8206370079979 }, { "clip_ratio/high_max": 0.0490196084138006, "clip_ratio/high_mean": 0.015590063121635467, "clip_ratio/low_mean": 0.026909722771961242, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04249978606821969, "entropy": 0.3119704835116863, "epoch": 0.00596, "grad_norm": 0.6210339069366455, "kl": 0.8435764815658331, "learning_rate": 9.99987288254018e-05, "loss": -0.0125, "step": 298, "step_time": 9.949690105999252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0, "completions/max_length": 1320.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 1166.171875, "completions/mean_terminated_length": 1166.171875, "completions/min_length": 865.0, "completions/min_terminated_length": 865.0, "entropy": 0.3107016496360302, "epoch": 0.00598, "frac_reward_zero_std": 0.0, "grad_norm": 0.7741795778274536, "kl": 0.8151105176657438, "learning_rate": 9.99987191033159e-05, "loss": 0.013, "num_tokens": 16335952.0, "reward": 0.605268120765686, "reward_std": 9.44769287109375, "rewards/rollout_reward_func/mean": 0.6052679419517517, "rewards/rollout_reward_func/std": 10.618112564086914, "sampling/importance_sampling_ratio/max": 1.4987179040908813, "sampling/importance_sampling_ratio/mean": 1.0136826038360596, "sampling/importance_sampling_ratio/min": 0.7334418892860413, "sampling/sampling_logp_difference/max": 0.23920416831970215, "sampling/sampling_logp_difference/mean": 0.011820180341601372, "step": 299, "step_time": 35.96857580300002 }, { "clip_ratio/high_max": 0.07089971494860947, "clip_ratio/high_mean": 0.022029462968930602, "clip_ratio/low_mean": 0.03730450588045642, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05933396948967129, "entropy": 0.28032723255455494, "epoch": 0.006, "grad_norm": 0.37243181467056274, "kl": 0.8571038488298655, "learning_rate": 9.999870934419433e-05, "loss": -0.0014, "step": 300, "step_time": 9.851978641999267 }, { "clip_ratio/high_max": 0.014093137346208096, "clip_ratio/high_mean": 0.004391339898575097, "clip_ratio/low_mean": 0.0008680555620230734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005259395460598171, "completions/clipped_ratio": 0.0, "completions/max_length": 1347.0, "completions/max_terminated_length": 1347.0, "completions/mean_length": 1216.4375, "completions/mean_terminated_length": 1216.4375, "completions/min_length": 825.0, "completions/min_terminated_length": 825.0, "entropy": 0.2521855002269149, "epoch": 0.00602, "frac_reward_zero_std": 0.0, "grad_norm": 0.6381392478942871, "kl": 0.8058282844722271, "learning_rate": 9.999869954803708e-05, "loss": 0.0246, "num_tokens": 16465057.0, "reward": 4.353306293487549, "reward_std": 11.903841018676758, "rewards/rollout_reward_func/mean": 4.353306293487549, "rewards/rollout_reward_func/std": 13.071228981018066, "sampling/importance_sampling_ratio/max": 1.870285153388977, "sampling/importance_sampling_ratio/mean": 1.014232873916626, "sampling/importance_sampling_ratio/min": 0.6221296191215515, "sampling/sampling_logp_difference/max": 0.5893880128860474, "sampling/sampling_logp_difference/mean": 0.010516786947846413, "step": 301, "step_time": 37.21239350799988 }, { "clip_ratio/high_max": 0.0890522887930274, "clip_ratio/high_mean": 0.02920751681085676, "clip_ratio/low_mean": 0.026308735250495374, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05551625177031383, "entropy": 0.26322738360613585, "epoch": 0.00604, "grad_norm": 0.3497966527938843, "kl": 0.8271188456565142, "learning_rate": 9.999868971484418e-05, "loss": 0.0178, "step": 302, "step_time": 9.60695320800096 }, { "clip_ratio/high_max": 0.007504480192437768, "clip_ratio/high_mean": 0.0027441755519248545, "clip_ratio/low_mean": 0.004391339898575097, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007135515450499952, "completions/clipped_ratio": 0.015625, "completions/max_length": 1357.0, "completions/max_terminated_length": 1357.0, "completions/mean_length": 1233.578125, "completions/mean_terminated_length": 1232.1270751953125, "completions/min_length": 963.0, "completions/min_terminated_length": 963.0, "entropy": 0.3282460719347, "epoch": 0.00606, "frac_reward_zero_std": 0.0, "grad_norm": 0.8028053641319275, "kl": 1.0723739713430405, "learning_rate": 9.999867984461563e-05, "loss": -0.009, "num_tokens": 16595305.0, "reward": 4.948282718658447, "reward_std": 13.311994552612305, "rewards/rollout_reward_func/mean": 4.9482831954956055, "rewards/rollout_reward_func/std": 13.96406078338623, "sampling/importance_sampling_ratio/max": 1.4959264993667603, "sampling/importance_sampling_ratio/mean": 1.0020052194595337, "sampling/importance_sampling_ratio/min": 0.664537250995636, "sampling/sampling_logp_difference/max": 0.4060518741607666, "sampling/sampling_logp_difference/mean": 0.01389513909816742, "step": 303, "step_time": 37.319652419000704 }, { "clip_ratio/high_max": 0.10214776475913823, "clip_ratio/high_mean": 0.0419843090348877, "clip_ratio/low_mean": 0.032362769707106054, "clip_ratio/low_min": 0.0029761905316263437, "clip_ratio/region_mean": 0.07434707973152399, "entropy": 0.35452230647206306, "epoch": 0.00608, "grad_norm": 0.5315479040145874, "kl": 0.8896235972642899, "learning_rate": 9.999866993735147e-05, "loss": -0.0191, "step": 304, "step_time": 9.31032760600101 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.0026041666860692203, "clip_ratio/low_mean": 0.0013720877468585968, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003976254432927817, "completions/clipped_ratio": 0.0, "completions/max_length": 1349.0, "completions/max_terminated_length": 1349.0, "completions/mean_length": 1178.75, "completions/mean_terminated_length": 1178.75, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "entropy": 0.3360240999609232, "epoch": 0.0061, "frac_reward_zero_std": 0.0, "grad_norm": 0.8160866498947144, "kl": 0.7584020271897316, "learning_rate": 9.999865999305169e-05, "loss": 0.0343, "num_tokens": 16721992.0, "reward": 5.459122657775879, "reward_std": 12.891645431518555, "rewards/rollout_reward_func/mean": 5.459122657775879, "rewards/rollout_reward_func/std": 13.743046760559082, "sampling/importance_sampling_ratio/max": 1.5625214576721191, "sampling/importance_sampling_ratio/mean": 0.9862264394760132, "sampling/importance_sampling_ratio/min": 0.7355522513389587, "sampling/sampling_logp_difference/max": 0.3090386390686035, "sampling/sampling_logp_difference/mean": 0.012195384129881859, "step": 305, "step_time": 36.28136819499923 }, { "clip_ratio/high_max": 0.10116884484887123, "clip_ratio/high_mean": 0.03489725984400138, "clip_ratio/low_mean": 0.055271854158490896, "clip_ratio/low_min": 0.0069444444961845875, "clip_ratio/region_mean": 0.09016911429353058, "entropy": 0.3460291214287281, "epoch": 0.00612, "grad_norm": 0.4136711359024048, "kl": 0.7873252909630537, "learning_rate": 9.999865001171627e-05, "loss": 0.0177, "step": 306, "step_time": 10.277970246998848 }, { "clip_ratio/high_max": 0.017422385746613145, "clip_ratio/high_mean": 0.004355596436653286, "clip_ratio/low_mean": 0.0008680555620230734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00522365199867636, "completions/clipped_ratio": 0.0, "completions/max_length": 1335.0, "completions/max_terminated_length": 1335.0, "completions/mean_length": 1148.03125, "completions/mean_terminated_length": 1148.03125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.4305746052414179, "epoch": 0.00614, "frac_reward_zero_std": 0.0, "grad_norm": 0.9119190573692322, "kl": 0.862309418618679, "learning_rate": 9.999863999334527e-05, "loss": -0.039, "num_tokens": 16846641.0, "reward": 4.154011249542236, "reward_std": 13.017316818237305, "rewards/rollout_reward_func/mean": 4.1540117263793945, "rewards/rollout_reward_func/std": 12.931968688964844, "sampling/importance_sampling_ratio/max": 1.4475128650665283, "sampling/importance_sampling_ratio/mean": 0.9774882793426514, "sampling/importance_sampling_ratio/min": 9.214395739476355e-13, "sampling/sampling_logp_difference/max": 23.965322494506836, "sampling/sampling_logp_difference/mean": 0.03728090599179268, "step": 307, "step_time": 33.57406117500068 }, { "clip_ratio/high_max": 0.08014640025794506, "clip_ratio/high_mean": 0.026986419688910246, "clip_ratio/low_mean": 0.03807902126573026, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.06506544025614858, "entropy": 0.4615292586386204, "epoch": 0.00616, "grad_norm": 0.544769287109375, "kl": 0.8701771721243858, "learning_rate": 9.999862993793865e-05, "loss": -0.0498, "step": 308, "step_time": 9.750469571001304 }, { "clip_ratio/high_max": 0.0021551724057644606, "clip_ratio/high_mean": 0.0005387931014411151, "clip_ratio/low_mean": 0.004073183808941394, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004611976910382509, "completions/clipped_ratio": 0.0, "completions/max_length": 1337.0, "completions/max_terminated_length": 1337.0, "completions/mean_length": 1176.34375, "completions/mean_terminated_length": 1176.34375, "completions/min_length": 862.0, "completions/min_terminated_length": 862.0, "entropy": 0.4520879667252302, "epoch": 0.00618, "frac_reward_zero_std": 0.0, "grad_norm": 0.6592026352882385, "kl": 1.228204183280468, "learning_rate": 9.999861984549645e-05, "loss": 0.0146, "num_tokens": 16973130.0, "reward": 5.186724662780762, "reward_std": 12.892146110534668, "rewards/rollout_reward_func/mean": 5.186724662780762, "rewards/rollout_reward_func/std": 12.396245002746582, "sampling/importance_sampling_ratio/max": 1.4907543659210205, "sampling/importance_sampling_ratio/mean": 0.992376446723938, "sampling/importance_sampling_ratio/min": 0.6941927671432495, "sampling/sampling_logp_difference/max": 0.338625431060791, "sampling/sampling_logp_difference/mean": 0.014916637912392616, "step": 309, "step_time": 35.957562657998096 }, { "clip_ratio/high_max": 0.07991837477311492, "clip_ratio/high_mean": 0.02518792706541717, "clip_ratio/low_mean": 0.04024840978672728, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06543633691035211, "entropy": 0.44149017706513405, "epoch": 0.0062, "grad_norm": 0.5282915234565735, "kl": 1.2467477656900883, "learning_rate": 9.999860971601868e-05, "loss": -0.002, "step": 310, "step_time": 8.980034561999673 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0008680555620230734, "clip_ratio/low_mean": 0.0008223684271797538, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016904239892028272, "completions/clipped_ratio": 0.0, "completions/max_length": 1348.0, "completions/max_terminated_length": 1348.0, "completions/mean_length": 1184.96875, "completions/mean_terminated_length": 1184.96875, "completions/min_length": 917.0, "completions/min_terminated_length": 917.0, "entropy": 0.5129956435412169, "epoch": 0.00622, "frac_reward_zero_std": 0.0, "grad_norm": 0.7465701699256897, "kl": 1.0012187995016575, "learning_rate": 9.999859954950535e-05, "loss": 0.018, "num_tokens": 17100245.0, "reward": 4.89565372467041, "reward_std": 13.874456405639648, "rewards/rollout_reward_func/mean": 4.89565372467041, "rewards/rollout_reward_func/std": 14.702526092529297, "sampling/importance_sampling_ratio/max": 1.5052223205566406, "sampling/importance_sampling_ratio/mean": 1.027785062789917, "sampling/importance_sampling_ratio/min": 0.5468899607658386, "sampling/sampling_logp_difference/max": 0.4049875736236572, "sampling/sampling_logp_difference/mean": 0.017239127308130264, "step": 311, "step_time": 34.84647880300054 }, { "clip_ratio/high_max": 0.08282635360956192, "clip_ratio/high_mean": 0.031164190906565636, "clip_ratio/low_mean": 0.05087516509229317, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.08203935588244349, "entropy": 0.5392583776265383, "epoch": 0.00624, "grad_norm": 0.5834032297134399, "kl": 1.0580051615834236, "learning_rate": 9.999858934595648e-05, "loss": 0.0006, "step": 312, "step_time": 9.797768173001714 }, { "clip_ratio/high_max": 0.006076388992369175, "clip_ratio/high_mean": 0.0015190972480922937, "clip_ratio/low_mean": 0.0006793478387407959, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021984450868330896, "completions/clipped_ratio": 0.015625, "completions/max_length": 1312.0, "completions/max_terminated_length": 1312.0, "completions/mean_length": 1134.828125, "completions/mean_terminated_length": 1133.3968505859375, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "entropy": 0.5534908715635538, "epoch": 0.00626, "frac_reward_zero_std": 0.0, "grad_norm": 0.7070833444595337, "kl": 0.9324860982596874, "learning_rate": 9.999857910537204e-05, "loss": 0.0171, "num_tokens": 17224021.0, "reward": 2.2121267318725586, "reward_std": 12.934229850769043, "rewards/rollout_reward_func/mean": 2.2121264934539795, "rewards/rollout_reward_func/std": 13.348692893981934, "sampling/importance_sampling_ratio/max": 1.3357430696487427, "sampling/importance_sampling_ratio/mean": 0.9801706075668335, "sampling/importance_sampling_ratio/min": 0.6364750862121582, "sampling/sampling_logp_difference/max": 0.28098082542419434, "sampling/sampling_logp_difference/mean": 0.01595621556043625, "step": 313, "step_time": 34.49526453500039 }, { "clip_ratio/high_max": 0.09578519035130739, "clip_ratio/high_mean": 0.028240888088475913, "clip_ratio/low_mean": 0.043463885551318526, "clip_ratio/low_min": 0.003289473708719015, "clip_ratio/region_mean": 0.07170477387262508, "entropy": 0.5145694836974144, "epoch": 0.00628, "grad_norm": 8.991610527038574, "kl": 2.500880379229784, "learning_rate": 9.999856882775207e-05, "loss": 0.0362, "step": 314, "step_time": 9.686568430998705 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0008680555620230734, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008680555620230734, "completions/clipped_ratio": 0.0, "completions/max_length": 1341.0, "completions/max_terminated_length": 1341.0, "completions/mean_length": 1152.8125, "completions/mean_terminated_length": 1152.8125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.47399672865867615, "epoch": 0.0063, "frac_reward_zero_std": 0.0, "grad_norm": 0.9034998416900635, "kl": 0.8184376284480095, "learning_rate": 9.999855851309658e-05, "loss": 0.0293, "num_tokens": 17349042.0, "reward": 2.9872653484344482, "reward_std": 10.313895225524902, "rewards/rollout_reward_func/mean": 2.9872655868530273, "rewards/rollout_reward_func/std": 11.123116493225098, "sampling/importance_sampling_ratio/max": 1.5023268461227417, "sampling/importance_sampling_ratio/mean": 0.9912445545196533, "sampling/importance_sampling_ratio/min": 0.5293837189674377, "sampling/sampling_logp_difference/max": 0.49621057510375977, "sampling/sampling_logp_difference/mean": 0.01648723892867565, "step": 315, "step_time": 36.106211563002034 }, { "clip_ratio/high_max": 0.07373366155661643, "clip_ratio/high_mean": 0.02871302078710869, "clip_ratio/low_mean": 0.052897133806254715, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.08161015470977873, "entropy": 0.439556997269392, "epoch": 0.00632, "grad_norm": 1.1542975902557373, "kl": 0.8081017658114433, "learning_rate": 9.999854816140556e-05, "loss": 0.0112, "step": 316, "step_time": 9.529359940000177 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0008680555620230734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041666860692203, "completions/clipped_ratio": 0.015625, "completions/max_length": 1345.0, "completions/max_terminated_length": 1345.0, "completions/mean_length": 1153.015625, "completions/mean_terminated_length": 1152.2857666015625, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "entropy": 0.4202824104577303, "epoch": 0.00634, "frac_reward_zero_std": 0.0, "grad_norm": 0.8387591242790222, "kl": 0.7999376337975264, "learning_rate": 9.999853777267906e-05, "loss": -0.0113, "num_tokens": 17474080.0, "reward": 3.8928003311157227, "reward_std": 13.945871353149414, "rewards/rollout_reward_func/mean": 3.8928003311157227, "rewards/rollout_reward_func/std": 14.018685340881348, "sampling/importance_sampling_ratio/max": 1.3972409963607788, "sampling/importance_sampling_ratio/mean": 0.9933174252510071, "sampling/importance_sampling_ratio/min": 0.66861891746521, "sampling/sampling_logp_difference/max": 0.3364081382751465, "sampling/sampling_logp_difference/mean": 0.013292517513036728, "step": 317, "step_time": 35.11344056699909 }, { "clip_ratio/high_max": 0.062046968610957265, "clip_ratio/high_mean": 0.02252296026563272, "clip_ratio/low_mean": 0.06663749110884964, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.08916045201476663, "entropy": 0.3698996100574732, "epoch": 0.00636, "grad_norm": 0.5773271918296814, "kl": 0.9309169836342335, "learning_rate": 9.999852734691706e-05, "loss": -0.0303, "step": 318, "step_time": 9.084739140999773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0034635705524124205, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034635705524124205, "completions/clipped_ratio": 0.0, "completions/max_length": 1360.0, "completions/max_terminated_length": 1360.0, "completions/mean_length": 1189.828125, "completions/mean_terminated_length": 1189.828125, "completions/min_length": 1056.0, "completions/min_terminated_length": 1056.0, "entropy": 0.3289623577147722, "epoch": 0.00638, "frac_reward_zero_std": 0.0, "grad_norm": 0.7116008400917053, "kl": 0.9500053711235523, "learning_rate": 9.999851688411959e-05, "loss": 0.0123, "num_tokens": 17601410.0, "reward": 4.444620609283447, "reward_std": 12.232638359069824, "rewards/rollout_reward_func/mean": 4.444620609283447, "rewards/rollout_reward_func/std": 12.037857055664062, "sampling/importance_sampling_ratio/max": 1.8450855016708374, "sampling/importance_sampling_ratio/mean": 0.9873309135437012, "sampling/importance_sampling_ratio/min": 2.6370022485067146e-11, "sampling/sampling_logp_difference/max": 11.255170822143555, "sampling/sampling_logp_difference/mean": 0.033629726618528366, "step": 319, "step_time": 38.23992628900032 }, { "clip_ratio/high_max": 0.06827694294042885, "clip_ratio/high_mean": 0.024013680347707123, "clip_ratio/low_mean": 0.04197527136420831, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06598895112983882, "entropy": 0.3273693434894085, "epoch": 0.0064, "grad_norm": 0.5724111795425415, "kl": 1.0788306891918182, "learning_rate": 9.999850638428662e-05, "loss": 0.0049, "step": 320, "step_time": 10.348325264999403 }, { "clip_ratio/high_max": 0.009027777938172221, "clip_ratio/high_mean": 0.0022569444845430553, "clip_ratio/low_mean": 0.002170138934161514, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004427083418704569, "completions/clipped_ratio": 0.0, "completions/max_length": 1336.0, "completions/max_terminated_length": 1336.0, "completions/mean_length": 1203.65625, "completions/mean_terminated_length": 1203.65625, "completions/min_length": 898.0, "completions/min_terminated_length": 898.0, "entropy": 0.3083435148000717, "epoch": 0.00642, "frac_reward_zero_std": 0.0, "grad_norm": 0.8611142039299011, "kl": 0.9252029061317444, "learning_rate": 9.99984958474182e-05, "loss": 0.024, "num_tokens": 17729704.0, "reward": 2.3030171394348145, "reward_std": 10.394119262695312, "rewards/rollout_reward_func/mean": 2.3030171394348145, "rewards/rollout_reward_func/std": 11.775047302246094, "sampling/importance_sampling_ratio/max": 1.6587164402008057, "sampling/importance_sampling_ratio/mean": 1.0117642879486084, "sampling/importance_sampling_ratio/min": 0.4190000295639038, "sampling/sampling_logp_difference/max": 0.3578883409500122, "sampling/sampling_logp_difference/mean": 0.015003521926701069, "step": 321, "step_time": 35.224505068000326 }, { "clip_ratio/high_max": 0.07542938669212162, "clip_ratio/high_mean": 0.03128034179098904, "clip_ratio/low_mean": 0.03897239360958338, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0702527352841571, "entropy": 0.2796294568106532, "epoch": 0.00644, "grad_norm": 0.5799975395202637, "kl": 0.8807330075651407, "learning_rate": 9.999848527351433e-05, "loss": 0.0091, "step": 322, "step_time": 9.679342022999663 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 1294.0, "completions/max_terminated_length": 1294.0, "completions/mean_length": 1194.96875, "completions/mean_terminated_length": 1194.96875, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "entropy": 0.24392448458820581, "epoch": 0.00646, "frac_reward_zero_std": 0.0, "grad_norm": 0.7959789633750916, "kl": 0.779301343485713, "learning_rate": 9.9998474662575e-05, "loss": -0.0199, "num_tokens": 17857450.0, "reward": 4.581869125366211, "reward_std": 11.262429237365723, "rewards/rollout_reward_func/mean": 4.581869602203369, "rewards/rollout_reward_func/std": 12.287596702575684, "sampling/importance_sampling_ratio/max": 2.240818977355957, "sampling/importance_sampling_ratio/mean": 1.018520712852478, "sampling/importance_sampling_ratio/min": 0.3999040722846985, "sampling/sampling_logp_difference/max": 0.5928263664245605, "sampling/sampling_logp_difference/mean": 0.012124484404921532, "step": 323, "step_time": 37.114893339001355 }, { "clip_ratio/high_max": 0.0683479537256062, "clip_ratio/high_mean": 0.022346383950207382, "clip_ratio/low_mean": 0.03898888279218227, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06133526662597433, "entropy": 0.2431696206331253, "epoch": 0.00648, "grad_norm": 0.35803645849227905, "kl": 0.767679963260889, "learning_rate": 9.999846401460026e-05, "loss": -0.0339, "step": 324, "step_time": 8.890772448001371 }, { "clip_ratio/high_max": 0.015318243764340878, "clip_ratio/high_mean": 0.0038295609410852194, "clip_ratio/low_mean": 0.0023561508278362453, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006185711768921465, "completions/clipped_ratio": 0.0, "completions/max_length": 1359.0, "completions/max_terminated_length": 1359.0, "completions/mean_length": 1233.796875, "completions/mean_terminated_length": 1233.796875, "completions/min_length": 663.0, "completions/min_terminated_length": 663.0, "entropy": 0.24552472867071629, "epoch": 0.0065, "frac_reward_zero_std": 0.0, "grad_norm": 0.7995591163635254, "kl": 0.8211700264364481, "learning_rate": 9.99984533295901e-05, "loss": -0.0057, "num_tokens": 17987636.0, "reward": 2.361278533935547, "reward_std": 11.01347541809082, "rewards/rollout_reward_func/mean": 2.361278533935547, "rewards/rollout_reward_func/std": 11.316116333007812, "sampling/importance_sampling_ratio/max": 1.4373282194137573, "sampling/importance_sampling_ratio/mean": 0.9916459321975708, "sampling/importance_sampling_ratio/min": 0.7290171384811401, "sampling/sampling_logp_difference/max": 0.3705787658691406, "sampling/sampling_logp_difference/mean": 0.01046331413090229, "step": 325, "step_time": 38.97714790000191 }, { "clip_ratio/high_max": 0.05977182672359049, "clip_ratio/high_mean": 0.02233774628257379, "clip_ratio/low_mean": 0.027810412109829485, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05014815804315731, "entropy": 0.24062953237444162, "epoch": 0.00652, "grad_norm": 0.6732361316680908, "kl": 0.9134266618639231, "learning_rate": 9.999844260754451e-05, "loss": -0.011, "step": 326, "step_time": 9.702275648000068 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 1343.0, "completions/max_terminated_length": 1343.0, "completions/mean_length": 1266.625, "completions/mean_terminated_length": 1266.625, "completions/min_length": 1005.0, "completions/min_terminated_length": 1005.0, "entropy": 0.19475865550339222, "epoch": 0.00654, "frac_reward_zero_std": 0.0, "grad_norm": 0.7081814408302307, "kl": 0.7645703088492155, "learning_rate": 9.999843184846354e-05, "loss": 0.0194, "num_tokens": 18120014.0, "reward": 6.7441205978393555, "reward_std": 12.950173377990723, "rewards/rollout_reward_func/mean": 6.7441205978393555, "rewards/rollout_reward_func/std": 13.17819881439209, "sampling/importance_sampling_ratio/max": 2.733876943588257, "sampling/importance_sampling_ratio/mean": 1.017435908317566, "sampling/importance_sampling_ratio/min": 0.8077232837677002, "sampling/sampling_logp_difference/max": 1.0649070739746094, "sampling/sampling_logp_difference/mean": 0.008961044251918793, "step": 327, "step_time": 38.45977644300001 }, { "clip_ratio/high_max": 0.05813231039792299, "clip_ratio/high_mean": 0.0188276685657911, "clip_ratio/low_mean": 0.017785656382329762, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03661332529736683, "entropy": 0.18874722812324762, "epoch": 0.00656, "grad_norm": 0.29629969596862793, "kl": 0.7521160487085581, "learning_rate": 9.999842105234716e-05, "loss": 0.0089, "step": 328, "step_time": 9.240072366999811 }, { "clip_ratio/high_max": 0.01736111124046147, "clip_ratio/high_mean": 0.004340277810115367, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004340277810115367, "completions/clipped_ratio": 0.0, "completions/max_length": 1335.0, "completions/max_terminated_length": 1335.0, "completions/mean_length": 1223.046875, "completions/mean_terminated_length": 1223.046875, "completions/min_length": 1069.0, "completions/min_terminated_length": 1069.0, "entropy": 0.17775962874293327, "epoch": 0.00658, "frac_reward_zero_std": 0.0, "grad_norm": 0.5002116560935974, "kl": 0.5365802068263292, "learning_rate": 9.999841021919543e-05, "loss": -0.0003, "num_tokens": 18249422.0, "reward": 5.926024436950684, "reward_std": 10.913434028625488, "rewards/rollout_reward_func/mean": 5.926024436950684, "rewards/rollout_reward_func/std": 11.495051383972168, "sampling/importance_sampling_ratio/max": 1.340820550918579, "sampling/importance_sampling_ratio/mean": 0.9783110618591309, "sampling/importance_sampling_ratio/min": 0.5937914848327637, "sampling/sampling_logp_difference/max": 0.4624512195587158, "sampling/sampling_logp_difference/mean": 0.009367045015096664, "step": 329, "step_time": 40.06546166299813 }, { "clip_ratio/high_max": 0.049223856534808874, "clip_ratio/high_mean": 0.015778186498209834, "clip_ratio/low_mean": 0.02711397095117718, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04289215768221766, "entropy": 0.16470052115619183, "epoch": 0.0066, "grad_norm": 0.272549033164978, "kl": 0.5706925727427006, "learning_rate": 9.999839934900832e-05, "loss": -0.0098, "step": 330, "step_time": 9.584335595999619 }, { "clip_ratio/high_max": 0.006761695956811309, "clip_ratio/high_mean": 0.0016904239892028272, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016904239892028272, "completions/clipped_ratio": 0.0, "completions/max_length": 1346.0, "completions/max_terminated_length": 1346.0, "completions/mean_length": 1230.453125, "completions/mean_terminated_length": 1230.453125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.15301176952198148, "epoch": 0.00662, "frac_reward_zero_std": 0.0, "grad_norm": 0.5951566696166992, "kl": 0.6225019320845604, "learning_rate": 9.999838844178584e-05, "loss": -0.0415, "num_tokens": 18379457.0, "reward": 5.441021919250488, "reward_std": 11.596078872680664, "rewards/rollout_reward_func/mean": 5.441021919250488, "rewards/rollout_reward_func/std": 13.130385398864746, "sampling/importance_sampling_ratio/max": 1.2981815338134766, "sampling/importance_sampling_ratio/mean": 0.9712120294570923, "sampling/importance_sampling_ratio/min": 0.5313878655433655, "sampling/sampling_logp_difference/max": 0.4391303062438965, "sampling/sampling_logp_difference/mean": 0.008532309904694557, "step": 331, "step_time": 38.03544032799982 }, { "clip_ratio/high_max": 0.040491855004802346, "clip_ratio/high_mean": 0.015382359270006418, "clip_ratio/low_mean": 0.02635878958972171, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04174114967463538, "entropy": 0.13200736604630947, "epoch": 0.00664, "grad_norm": 0.4791216552257538, "kl": 0.6594886407256126, "learning_rate": 9.999837749752803e-05, "loss": -0.0494, "step": 332, "step_time": 9.623436052000216 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0008680555620230734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004340277810115367, "completions/clipped_ratio": 0.0, "completions/max_length": 1344.0, "completions/max_terminated_length": 1344.0, "completions/mean_length": 1219.734375, "completions/mean_terminated_length": 1219.734375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.11573670757934451, "epoch": 0.00666, "frac_reward_zero_std": 0.0, "grad_norm": 0.6563036441802979, "kl": 0.851641334593296, "learning_rate": 9.999836651623487e-05, "loss": -0.0048, "num_tokens": 18508741.0, "reward": 5.738734245300293, "reward_std": 12.408971786499023, "rewards/rollout_reward_func/mean": 5.738734245300293, "rewards/rollout_reward_func/std": 12.671599388122559, "sampling/importance_sampling_ratio/max": 1.3496527671813965, "sampling/importance_sampling_ratio/mean": 1.0077811479568481, "sampling/importance_sampling_ratio/min": 0.6974970102310181, "sampling/sampling_logp_difference/max": 0.3328993320465088, "sampling/sampling_logp_difference/mean": 0.00713011808693409, "step": 333, "step_time": 39.30893147200186 }, { "clip_ratio/high_max": 0.03472222248092294, "clip_ratio/high_mean": 0.011284722364507616, "clip_ratio/low_mean": 0.024913194763939828, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03619791695382446, "entropy": 0.11344034224748611, "epoch": 0.00668, "grad_norm": 0.4923565983772278, "kl": 0.6980615984648466, "learning_rate": 9.999835549790641e-05, "loss": -0.0079, "step": 334, "step_time": 10.117216201999327 }, { "clip_ratio/high_max": 0.0036764706019312143, "clip_ratio/high_mean": 0.0009191176504828036, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004391339898575097, "completions/clipped_ratio": 0.0, "completions/max_length": 1359.0, "completions/max_terminated_length": 1359.0, "completions/mean_length": 1238.984375, "completions/mean_terminated_length": 1238.984375, "completions/min_length": 635.0, "completions/min_terminated_length": 635.0, "entropy": 0.12856985442340374, "epoch": 0.0067, "frac_reward_zero_std": 0.0, "grad_norm": 0.8339588642120361, "kl": 0.6355916745960712, "learning_rate": 9.999834444254262e-05, "loss": -0.0042, "num_tokens": 18639311.0, "reward": 6.144355297088623, "reward_std": 13.870889663696289, "rewards/rollout_reward_func/mean": 6.144355297088623, "rewards/rollout_reward_func/std": 14.220029830932617, "sampling/importance_sampling_ratio/max": 1.3300856351852417, "sampling/importance_sampling_ratio/mean": 0.9924861788749695, "sampling/importance_sampling_ratio/min": 0.6479190587997437, "sampling/sampling_logp_difference/max": 0.2639361619949341, "sampling/sampling_logp_difference/mean": 0.006817285902798176, "step": 335, "step_time": 37.15376971599926 }, { "clip_ratio/high_max": 0.03817401989363134, "clip_ratio/high_mean": 0.012147671717684716, "clip_ratio/low_mean": 0.020067402394488454, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03221507422858849, "entropy": 0.12425063038244843, "epoch": 0.00672, "grad_norm": 0.32718658447265625, "kl": 0.7219895403832197, "learning_rate": 9.999833335014352e-05, "loss": -0.011, "step": 336, "step_time": 9.780628326000624 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0008680555620230734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041666860692203, "completions/clipped_ratio": 0.0, "completions/max_length": 1356.0, "completions/max_terminated_length": 1356.0, "completions/mean_length": 1266.78125, "completions/mean_terminated_length": 1266.78125, "completions/min_length": 794.0, "completions/min_terminated_length": 794.0, "entropy": 0.11600295826792717, "epoch": 0.00674, "frac_reward_zero_std": 0.0, "grad_norm": 0.7365610003471375, "kl": 0.5392901804298162, "learning_rate": 9.999832222070914e-05, "loss": 0.0023, "num_tokens": 18771742.0, "reward": 5.880302429199219, "reward_std": 12.320051193237305, "rewards/rollout_reward_func/mean": 5.880302429199219, "rewards/rollout_reward_func/std": 12.716879844665527, "sampling/importance_sampling_ratio/max": 1.3457348346710205, "sampling/importance_sampling_ratio/mean": 0.9991644620895386, "sampling/importance_sampling_ratio/min": 0.6999140381813049, "sampling/sampling_logp_difference/max": 0.3562436103820801, "sampling/sampling_logp_difference/mean": 0.005911126732826233, "step": 337, "step_time": 38.84034024799803 }, { "clip_ratio/high_max": 0.03513071918860078, "clip_ratio/high_mean": 0.011386846599634737, "clip_ratio/low_mean": 0.024994894862174988, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.036381741403602064, "entropy": 0.1102461889386177, "epoch": 0.00676, "grad_norm": 0.2775817811489105, "kl": 0.6620934028178453, "learning_rate": 9.999831105423947e-05, "loss": -0.006, "step": 338, "step_time": 9.023721004000436 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0, "completions/max_length": 1350.0, "completions/max_terminated_length": 1350.0, "completions/mean_length": 1238.953125, "completions/mean_terminated_length": 1238.953125, "completions/min_length": 1062.0, "completions/min_terminated_length": 1062.0, "entropy": 0.11055759433656931, "epoch": 0.00678, "frac_reward_zero_std": 0.0, "grad_norm": 0.5181728601455688, "kl": 0.462925398722291, "learning_rate": 9.999829985073453e-05, "loss": 0.0105, "num_tokens": 18902239.0, "reward": 7.53302526473999, "reward_std": 12.4171142578125, "rewards/rollout_reward_func/mean": 7.533025741577148, "rewards/rollout_reward_func/std": 13.036537170410156, "sampling/importance_sampling_ratio/max": 1.3853559494018555, "sampling/importance_sampling_ratio/mean": 1.000986933708191, "sampling/importance_sampling_ratio/min": 0.702711284160614, "sampling/sampling_logp_difference/max": 0.4794572591781616, "sampling/sampling_logp_difference/mean": 0.00590522913262248, "step": 339, "step_time": 39.08791527499943 }, { "clip_ratio/high_max": 0.039215686498209834, "clip_ratio/high_mean": 0.013276143989060074, "clip_ratio/low_mean": 0.02185995056061074, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03513609484070912, "entropy": 0.11709691304713488, "epoch": 0.0068, "grad_norm": 0.30310577154159546, "kl": 0.5310502368956804, "learning_rate": 9.999828861019435e-05, "loss": 0.006, "step": 340, "step_time": 9.792470953999327 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0009191176504828036, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004391339898575097, "completions/clipped_ratio": 0.015625, "completions/max_length": 1347.0, "completions/max_terminated_length": 1347.0, "completions/mean_length": 1241.765625, "completions/mean_terminated_length": 1240.635009765625, "completions/min_length": 1101.0, "completions/min_terminated_length": 1101.0, "entropy": 0.12758585345000029, "epoch": 0.00682, "frac_reward_zero_std": 0.0, "grad_norm": 0.6102613210678101, "kl": 0.6113391723483801, "learning_rate": 9.99982773326189e-05, "loss": 0.0158, "num_tokens": 19032925.0, "reward": 3.9826180934906006, "reward_std": 12.427906036376953, "rewards/rollout_reward_func/mean": 3.9826183319091797, "rewards/rollout_reward_func/std": 13.354879379272461, "sampling/importance_sampling_ratio/max": 1.1800951957702637, "sampling/importance_sampling_ratio/mean": 0.997043251991272, "sampling/importance_sampling_ratio/min": 0.7389504313468933, "sampling/sampling_logp_difference/max": 0.2936210632324219, "sampling/sampling_logp_difference/mean": 0.005317248869687319, "step": 341, "step_time": 39.12008871799935 }, { "clip_ratio/high_max": 0.021037581842392683, "clip_ratio/high_mean": 0.006995506584644318, "clip_ratio/low_mean": 0.016595179855357856, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023590686498209834, "entropy": 0.12726877955719829, "epoch": 0.00684, "grad_norm": 0.49857455492019653, "kl": 0.6323374789208174, "learning_rate": 9.999826601800824e-05, "loss": 0.0106, "step": 342, "step_time": 9.27907708300063 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0008680555620230734, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008680555620230734, "completions/clipped_ratio": 0.0, "completions/max_length": 1341.0, "completions/max_terminated_length": 1341.0, "completions/mean_length": 1219.203125, "completions/mean_terminated_length": 1219.203125, "completions/min_length": 735.0, "completions/min_terminated_length": 735.0, "entropy": 0.11009268835186958, "epoch": 0.00686, "frac_reward_zero_std": 0.0, "grad_norm": 0.6059837341308594, "kl": 0.7338532544672489, "learning_rate": 9.999825466636233e-05, "loss": -0.0167, "num_tokens": 19162127.0, "reward": 4.720416069030762, "reward_std": 10.753931999206543, "rewards/rollout_reward_func/mean": 4.720416069030762, "rewards/rollout_reward_func/std": 12.976871490478516, "sampling/importance_sampling_ratio/max": 1.5183767080307007, "sampling/importance_sampling_ratio/mean": 1.0038487911224365, "sampling/importance_sampling_ratio/min": 0.6935895681381226, "sampling/sampling_logp_difference/max": 0.4249706268310547, "sampling/sampling_logp_difference/mean": 0.004785279743373394, "step": 343, "step_time": 39.04033868700208 }, { "clip_ratio/high_max": 0.02798202633857727, "clip_ratio/high_mean": 0.012203840189613402, "clip_ratio/low_mean": 0.013071895577013493, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.025275735824834555, "entropy": 0.11364737106487155, "epoch": 0.00688, "grad_norm": 0.2909540832042694, "kl": 0.7444342169910669, "learning_rate": 9.999824327768122e-05, "loss": -0.0205, "step": 344, "step_time": 9.620514073000777 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0008680555620230734, "clip_ratio/low_mean": 0.0008680555620230734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0, "completions/max_length": 1359.0, "completions/max_terminated_length": 1359.0, "completions/mean_length": 1240.3125, "completions/mean_terminated_length": 1240.3125, "completions/min_length": 1088.0, "completions/min_terminated_length": 1088.0, "entropy": 0.13259067060425878, "epoch": 0.0069, "frac_reward_zero_std": 0.0, "grad_norm": 0.8596341609954834, "kl": 0.7895576078444719, "learning_rate": 9.99982318519649e-05, "loss": -0.0057, "num_tokens": 19292776.0, "reward": 2.67919659614563, "reward_std": 14.777613639831543, "rewards/rollout_reward_func/mean": 2.679196357727051, "rewards/rollout_reward_func/std": 15.276268005371094, "sampling/importance_sampling_ratio/max": 1.4407294988632202, "sampling/importance_sampling_ratio/mean": 0.9704160690307617, "sampling/importance_sampling_ratio/min": 0.6675639152526855, "sampling/sampling_logp_difference/max": 0.4319186210632324, "sampling/sampling_logp_difference/mean": 0.007419218309223652, "step": 345, "step_time": 38.83812410200153 }, { "clip_ratio/high_max": 0.03472222248092294, "clip_ratio/high_mean": 0.009548611182253808, "clip_ratio/low_mean": 0.03599877539090812, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.045547386282123625, "entropy": 0.1278433846309781, "epoch": 0.00692, "grad_norm": 0.6303772330284119, "kl": 1.1542848944664001, "learning_rate": 9.999822038921338e-05, "loss": -0.0049, "step": 346, "step_time": 9.405012558002454 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0025584796094335616, "clip_ratio/low_mean": 0.0026041666860692203, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005162646295502782, "completions/clipped_ratio": 0.0, "completions/max_length": 1353.0, "completions/max_terminated_length": 1353.0, "completions/mean_length": 1241.171875, "completions/mean_terminated_length": 1241.171875, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "entropy": 0.1106796741951257, "epoch": 0.00694, "frac_reward_zero_std": 0.0, "grad_norm": 0.4797411561012268, "kl": 0.6278085261583328, "learning_rate": 9.99982088894267e-05, "loss": 0.0106, "num_tokens": 19423491.0, "reward": 5.54637336730957, "reward_std": 12.041938781738281, "rewards/rollout_reward_func/mean": 5.54637336730957, "rewards/rollout_reward_func/std": 13.066041946411133, "sampling/importance_sampling_ratio/max": 1.482460618019104, "sampling/importance_sampling_ratio/mean": 0.9943655133247375, "sampling/importance_sampling_ratio/min": 0.6257169246673584, "sampling/sampling_logp_difference/max": 0.510839581489563, "sampling/sampling_logp_difference/mean": 0.006749512627720833, "step": 347, "step_time": 39.46550670700071 }, { "clip_ratio/high_max": 0.05993883335031569, "clip_ratio/high_mean": 0.015852763841394335, "clip_ratio/low_mean": 0.02315665892092511, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390094225294888, "entropy": 0.11026378069072962, "epoch": 0.00696, "grad_norm": 0.3181508183479309, "kl": 0.6370288580656052, "learning_rate": 9.999819735260483e-05, "loss": 0.0068, "step": 348, "step_time": 10.100482684999406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.002517361135687679, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002517361135687679, "completions/clipped_ratio": 0.0, "completions/max_length": 1332.0, "completions/max_terminated_length": 1332.0, "completions/mean_length": 1208.171875, "completions/mean_terminated_length": 1208.171875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.13927970174700022, "epoch": 0.00698, "frac_reward_zero_std": 0.0, "grad_norm": 0.5229349732398987, "kl": 0.5507344976067543, "learning_rate": 9.999818577874781e-05, "loss": 0.0234, "num_tokens": 19552011.0, "reward": 5.104192733764648, "reward_std": 11.615788459777832, "rewards/rollout_reward_func/mean": 5.104192733764648, "rewards/rollout_reward_func/std": 12.1382474899292, "sampling/importance_sampling_ratio/max": 1.415814995765686, "sampling/importance_sampling_ratio/mean": 1.0048539638519287, "sampling/importance_sampling_ratio/min": 1.834242700438695e-16, "sampling/sampling_logp_difference/max": 27.188508987426758, "sampling/sampling_logp_difference/mean": 0.039306361228227615, "step": 349, "step_time": 37.862728561997756 }, { "clip_ratio/high_max": 0.05868378118611872, "clip_ratio/high_mean": 0.017275112157221884, "clip_ratio/low_mean": 0.013766340038273484, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03104145231191069, "entropy": 0.14939681394025683, "epoch": 0.007, "grad_norm": 0.3000262379646301, "kl": 0.5239376667886972, "learning_rate": 9.999817416785565e-05, "loss": 0.0173, "step": 350, "step_time": 9.859687822999149 }, { "clip_ratio/high_max": 0.013706140452995896, "clip_ratio/high_mean": 0.003426535113248974, "clip_ratio/low_mean": 0.0026041666860692203, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006030701799318194, "completions/clipped_ratio": 0.0, "completions/max_length": 1342.0, "completions/max_terminated_length": 1342.0, "completions/mean_length": 1236.90625, "completions/mean_terminated_length": 1236.90625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.14154944382607937, "epoch": 0.00702, "frac_reward_zero_std": 0.0, "grad_norm": 0.47570475935935974, "kl": 0.5030098669230938, "learning_rate": 9.999816251992836e-05, "loss": -0.0158, "num_tokens": 19682494.0, "reward": 4.190635681152344, "reward_std": 14.216930389404297, "rewards/rollout_reward_func/mean": 4.190635681152344, "rewards/rollout_reward_func/std": 14.30445671081543, "sampling/importance_sampling_ratio/max": 1.5223360061645508, "sampling/importance_sampling_ratio/mean": 1.0101966857910156, "sampling/importance_sampling_ratio/min": 0.7218723297119141, "sampling/sampling_logp_difference/max": 0.302712082862854, "sampling/sampling_logp_difference/mean": 0.007096399553120136, "step": 351, "step_time": 39.30754639200131 }, { "clip_ratio/high_max": 0.03492647083476186, "clip_ratio/high_mean": 0.013026208442170173, "clip_ratio/low_mean": 0.018280228949151933, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031306437274906784, "entropy": 0.1433765795081854, "epoch": 0.00704, "grad_norm": 0.2731061577796936, "kl": 0.5208645444363356, "learning_rate": 9.999815083496594e-05, "loss": -0.0214, "step": 352, "step_time": 9.398018900999887 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.0, "completions/max_length": 1348.0, "completions/max_terminated_length": 1348.0, "completions/mean_length": 1237.3125, "completions/mean_terminated_length": 1237.3125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.1480951178818941, "epoch": 0.00706, "frac_reward_zero_std": 0.0, "grad_norm": 0.5071465373039246, "kl": 0.5567373130470514, "learning_rate": 9.99981391129684e-05, "loss": -0.008, "num_tokens": 19812942.0, "reward": 4.355041980743408, "reward_std": 13.132366180419922, "rewards/rollout_reward_func/mean": 4.355041980743408, "rewards/rollout_reward_func/std": 13.851308822631836, "sampling/importance_sampling_ratio/max": 1.4629567861557007, "sampling/importance_sampling_ratio/mean": 1.0300343036651611, "sampling/importance_sampling_ratio/min": 0.6640676856040955, "sampling/sampling_logp_difference/max": 0.5005507469177246, "sampling/sampling_logp_difference/mean": 0.007855242118239403, "step": 353, "step_time": 38.34223270599978 }, { "clip_ratio/high_max": 0.024305555736646056, "clip_ratio/high_mean": 0.007812500116415322, "clip_ratio/low_mean": 0.024994894512929022, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.032807394745759666, "entropy": 0.13850665464997292, "epoch": 0.00708, "grad_norm": 0.27875232696533203, "kl": 0.5900795683264732, "learning_rate": 9.999812735393576e-05, "loss": -0.0167, "step": 354, "step_time": 9.843489302002126 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.0026041666860692203, "clip_ratio/low_mean": 0.0008680555620230734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 1358.0, "completions/max_terminated_length": 1358.0, "completions/mean_length": 1228.9375, "completions/mean_terminated_length": 1228.9375, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "entropy": 0.13509350316599011, "epoch": 0.0071, "frac_reward_zero_std": 0.0, "grad_norm": 0.5068008899688721, "kl": 0.4818702656775713, "learning_rate": 9.999811555786804e-05, "loss": 0.0278, "num_tokens": 19942820.0, "reward": 5.804719924926758, "reward_std": 13.167655944824219, "rewards/rollout_reward_func/mean": 5.804719924926758, "rewards/rollout_reward_func/std": 13.18018913269043, "sampling/importance_sampling_ratio/max": 1.306014895439148, "sampling/importance_sampling_ratio/mean": 1.0091025829315186, "sampling/importance_sampling_ratio/min": 0.625792384147644, "sampling/sampling_logp_difference/max": 0.3519221544265747, "sampling/sampling_logp_difference/mean": 0.006864185445010662, "step": 355, "step_time": 39.10083819900228 }, { "clip_ratio/high_max": 0.031250000232830644, "clip_ratio/high_mean": 0.013888889225199819, "clip_ratio/low_mean": 0.026143791212234646, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.040032680495642126, "entropy": 0.1260006483644247, "epoch": 0.00712, "grad_norm": 0.2814493179321289, "kl": 0.5503856968134642, "learning_rate": 9.999810372476525e-05, "loss": 0.0244, "step": 356, "step_time": 9.433313735999036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1365.0, "completions/max_terminated_length": 1365.0, "completions/mean_length": 1269.96875, "completions/mean_terminated_length": 1269.96875, "completions/min_length": 1127.0, "completions/min_terminated_length": 1127.0, "entropy": 0.1161547633819282, "epoch": 0.00714, "frac_reward_zero_std": 0.0, "grad_norm": 0.4718828499317169, "kl": 0.9211016893386841, "learning_rate": 9.999809185462739e-05, "loss": 0.039, "num_tokens": 20075371.0, "reward": 3.961371898651123, "reward_std": 11.789936065673828, "rewards/rollout_reward_func/mean": 3.961371898651123, "rewards/rollout_reward_func/std": 12.59416675567627, "sampling/importance_sampling_ratio/max": 1.223926067352295, "sampling/importance_sampling_ratio/mean": 0.9972316026687622, "sampling/importance_sampling_ratio/min": 0.7068163156509399, "sampling/sampling_logp_difference/max": 0.21517443656921387, "sampling/sampling_logp_difference/mean": 0.0053621698170900345, "step": 357, "step_time": 38.67136614899937 }, { "clip_ratio/high_max": 0.049019608180969954, "clip_ratio/high_mean": 0.013991013227496296, "clip_ratio/low_mean": 0.01996527804294601, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03395629138685763, "entropy": 0.11733251390978694, "epoch": 0.00716, "grad_norm": 0.16502372920513153, "kl": 0.7801671754568815, "learning_rate": 9.999807994745449e-05, "loss": 0.0324, "step": 358, "step_time": 9.794801944999563 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0, "completions/max_length": 1343.0, "completions/max_terminated_length": 1343.0, "completions/mean_length": 1256.96875, "completions/mean_terminated_length": 1256.96875, "completions/min_length": 1011.0, "completions/min_terminated_length": 1011.0, "entropy": 0.13261962542310357, "epoch": 0.00718, "frac_reward_zero_std": 0.0, "grad_norm": 0.4223484992980957, "kl": 0.6454224642366171, "learning_rate": 9.999806800324652e-05, "loss": -0.0021, "num_tokens": 20207093.0, "reward": 4.6267523765563965, "reward_std": 13.086963653564453, "rewards/rollout_reward_func/mean": 4.626751899719238, "rewards/rollout_reward_func/std": 14.676898956298828, "sampling/importance_sampling_ratio/max": 1.336045265197754, "sampling/importance_sampling_ratio/mean": 0.9978616237640381, "sampling/importance_sampling_ratio/min": 0.6580431461334229, "sampling/sampling_logp_difference/max": 0.287054181098938, "sampling/sampling_logp_difference/mean": 0.0057748714461922646, "step": 359, "step_time": 38.74908411499746 }, { "clip_ratio/high_max": 0.058662281604483724, "clip_ratio/high_mean": 0.018137792707420886, "clip_ratio/low_mean": 0.018183479725848883, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03632127266610041, "entropy": 0.1370791387744248, "epoch": 0.0072, "grad_norm": 0.26030489802360535, "kl": 0.6394520290195942, "learning_rate": 9.999805602200354e-05, "loss": -0.0085, "step": 360, "step_time": 9.32019592799952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0008680555620230734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008680555620230734, "completions/clipped_ratio": 0.0, "completions/max_length": 1344.0, "completions/max_terminated_length": 1344.0, "completions/mean_length": 1241.734375, "completions/mean_terminated_length": 1241.734375, "completions/min_length": 1025.0, "completions/min_terminated_length": 1025.0, "entropy": 0.1345509896054864, "epoch": 0.00722, "frac_reward_zero_std": 0.0, "grad_norm": 0.5100898742675781, "kl": 0.8387140035629272, "learning_rate": 9.999804400372554e-05, "loss": 0.007, "num_tokens": 20337789.0, "reward": 9.449151039123535, "reward_std": 12.286431312561035, "rewards/rollout_reward_func/mean": 9.449151039123535, "rewards/rollout_reward_func/std": 13.57576847076416, "sampling/importance_sampling_ratio/max": 1.4115562438964844, "sampling/importance_sampling_ratio/mean": 0.990313708782196, "sampling/importance_sampling_ratio/min": 0.6950281858444214, "sampling/sampling_logp_difference/max": 0.3388124704360962, "sampling/sampling_logp_difference/mean": 0.00530852098017931, "step": 361, "step_time": 39.235169910002696 }, { "clip_ratio/high_max": 0.0349264710675925, "clip_ratio/high_mean": 0.010467728832736611, "clip_ratio/low_mean": 0.01741217344533652, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.027879902394488454, "entropy": 0.15170079609379172, "epoch": 0.00724, "grad_norm": 0.33363601565361023, "kl": 0.6247174255549908, "learning_rate": 9.999803194841253e-05, "loss": 0.0003, "step": 362, "step_time": 9.35076707999906 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0008680555620230734, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041666860692203, "completions/clipped_ratio": 0.0, "completions/max_length": 1359.0, "completions/max_terminated_length": 1359.0, "completions/mean_length": 1251.59375, "completions/mean_terminated_length": 1251.59375, "completions/min_length": 700.0, "completions/min_terminated_length": 700.0, "entropy": 0.1958311009220779, "epoch": 0.00726, "frac_reward_zero_std": 0.0, "grad_norm": 0.5317199230194092, "kl": 0.5815525501966476, "learning_rate": 9.999801985606452e-05, "loss": 0.0042, "num_tokens": 20469218.0, "reward": 3.8903188705444336, "reward_std": 13.076482772827148, "rewards/rollout_reward_func/mean": 3.8903186321258545, "rewards/rollout_reward_func/std": 13.372103691101074, "sampling/importance_sampling_ratio/max": 1.3623380661010742, "sampling/importance_sampling_ratio/mean": 1.0135592222213745, "sampling/importance_sampling_ratio/min": 0.7123748064041138, "sampling/sampling_logp_difference/max": 0.29522740840911865, "sampling/sampling_logp_difference/mean": 0.006953438278287649, "step": 363, "step_time": 39.74288477299888 }, { "clip_ratio/high_max": 0.06527777831070125, "clip_ratio/high_mean": 0.01979166700039059, "clip_ratio/low_mean": 0.021701389166992158, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04149305640021339, "entropy": 0.2018888248130679, "epoch": 0.00728, "grad_norm": 0.2644880712032318, "kl": 0.5717838387936354, "learning_rate": 9.999800772668153e-05, "loss": -0.0029, "step": 364, "step_time": 9.321073625997087 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0008680555620230734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041666860692203, "completions/clipped_ratio": 0.0, "completions/max_length": 1354.0, "completions/max_terminated_length": 1354.0, "completions/mean_length": 1249.453125, "completions/mean_terminated_length": 1249.453125, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "entropy": 0.18510928004980087, "epoch": 0.0073, "frac_reward_zero_std": 0.0, "grad_norm": 0.442364364862442, "kl": 0.4795332048088312, "learning_rate": 9.999799556026358e-05, "loss": -0.0238, "num_tokens": 20600462.0, "reward": 6.273903846740723, "reward_std": 12.39173698425293, "rewards/rollout_reward_func/mean": 6.273903846740723, "rewards/rollout_reward_func/std": 13.681985855102539, "sampling/importance_sampling_ratio/max": 1.3438879251480103, "sampling/importance_sampling_ratio/mean": 0.9609812498092651, "sampling/importance_sampling_ratio/min": 0.6316797733306885, "sampling/sampling_logp_difference/max": 0.33423590660095215, "sampling/sampling_logp_difference/mean": 0.007498072925955057, "step": 365, "step_time": 38.43022035099784 }, { "clip_ratio/high_max": 0.05171783687546849, "clip_ratio/high_mean": 0.013797514839097857, "clip_ratio/low_mean": 0.007766812981572002, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0215643277624622, "entropy": 0.18268039543181658, "epoch": 0.00732, "grad_norm": 0.2666545808315277, "kl": 0.47542588133364916, "learning_rate": 9.999798335681066e-05, "loss": -0.0309, "step": 366, "step_time": 9.454387761999897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0016904239892028272, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016904239892028272, "completions/clipped_ratio": 0.0, "completions/max_length": 1344.0, "completions/max_terminated_length": 1344.0, "completions/mean_length": 1222.125, "completions/mean_terminated_length": 1222.125, "completions/min_length": 999.0, "completions/min_terminated_length": 999.0, "entropy": 0.21282331459224224, "epoch": 0.00734, "frac_reward_zero_std": 0.0, "grad_norm": 0.8037834763526917, "kl": 0.6722489278763533, "learning_rate": 9.99979711163228e-05, "loss": 0.0148, "num_tokens": 20729886.0, "reward": 5.174856662750244, "reward_std": 11.355770111083984, "rewards/rollout_reward_func/mean": 5.174857139587402, "rewards/rollout_reward_func/std": 11.9678955078125, "sampling/importance_sampling_ratio/max": 1.8758124113082886, "sampling/importance_sampling_ratio/mean": 1.0103557109832764, "sampling/importance_sampling_ratio/min": 0.7285647392272949, "sampling/sampling_logp_difference/max": 0.3263084888458252, "sampling/sampling_logp_difference/mean": 0.008695240132510662, "step": 367, "step_time": 37.67410640100388 }, { "clip_ratio/high_max": 0.036011905409395695, "clip_ratio/high_mean": 0.01073908741818741, "clip_ratio/low_mean": 0.02561856439569965, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0363576520467177, "entropy": 0.21391737554222345, "epoch": 0.00736, "grad_norm": 0.4400097727775574, "kl": 0.749302851036191, "learning_rate": 9.999795883880001e-05, "loss": 0.005, "step": 368, "step_time": 9.906740201999128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003426535113248974, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003426535113248974, "completions/clipped_ratio": 0.0, "completions/max_length": 1348.0, "completions/max_terminated_length": 1348.0, "completions/mean_length": 1231.171875, "completions/mean_terminated_length": 1231.171875, "completions/min_length": 994.0, "completions/min_terminated_length": 994.0, "entropy": 0.2234082594513893, "epoch": 0.00738, "frac_reward_zero_std": 0.0, "grad_norm": 1.0522061586380005, "kl": 0.9143509455025196, "learning_rate": 9.999794652424228e-05, "loss": 0.0039, "num_tokens": 20859908.0, "reward": 8.738959312438965, "reward_std": 11.845466613769531, "rewards/rollout_reward_func/mean": 8.738959312438965, "rewards/rollout_reward_func/std": 12.123114585876465, "sampling/importance_sampling_ratio/max": 1.3250545263290405, "sampling/importance_sampling_ratio/mean": 1.020609974861145, "sampling/importance_sampling_ratio/min": 0.5885343551635742, "sampling/sampling_logp_difference/max": 0.4251088500022888, "sampling/sampling_logp_difference/mean": 0.008847212418913841, "step": 369, "step_time": 38.576368669004296 }, { "clip_ratio/high_max": 0.05455874605104327, "clip_ratio/high_mean": 0.016198166005779058, "clip_ratio/low_mean": 0.03593064745655283, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05212881352053955, "entropy": 0.22893889155238867, "epoch": 0.0074, "grad_norm": 0.39881831407546997, "kl": 1.0176227018237114, "learning_rate": 9.999793417264966e-05, "loss": -0.0017, "step": 370, "step_time": 8.888960126005259 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0008680555620230734, "clip_ratio/low_mean": 0.0008680555620230734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0, "completions/max_length": 1348.0, "completions/max_terminated_length": 1348.0, "completions/mean_length": 1242.28125, "completions/mean_terminated_length": 1242.28125, "completions/min_length": 913.0, "completions/min_terminated_length": 913.0, "entropy": 0.2308051260188222, "epoch": 0.00742, "frac_reward_zero_std": 0.0, "grad_norm": 0.5715855956077576, "kl": 0.87254199385643, "learning_rate": 9.999792178402214e-05, "loss": -0.0234, "num_tokens": 20990697.0, "reward": 6.123772621154785, "reward_std": 10.485084533691406, "rewards/rollout_reward_func/mean": 6.123772144317627, "rewards/rollout_reward_func/std": 11.30632209777832, "sampling/importance_sampling_ratio/max": 1.4539350271224976, "sampling/importance_sampling_ratio/mean": 1.0005998611450195, "sampling/importance_sampling_ratio/min": 0.5505736470222473, "sampling/sampling_logp_difference/max": 0.3510777950286865, "sampling/sampling_logp_difference/mean": 0.009203520603477955, "step": 371, "step_time": 38.65458783400027 }, { "clip_ratio/high_max": 0.045200163731351495, "clip_ratio/high_mean": 0.013036152173299342, "clip_ratio/low_mean": 0.023381332110147923, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03641748463269323, "entropy": 0.2447242382913828, "epoch": 0.00744, "grad_norm": 0.298229455947876, "kl": 0.8313354179263115, "learning_rate": 9.999790935835973e-05, "loss": -0.0303, "step": 372, "step_time": 9.79756171700501 }, { "clip_ratio/high_max": 0.010051169665530324, "clip_ratio/high_mean": 0.002512792416382581, "clip_ratio/low_mean": 0.0008680555620230734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0033808479784056544, "completions/clipped_ratio": 0.0, "completions/max_length": 1358.0, "completions/max_terminated_length": 1358.0, "completions/mean_length": 1223.75, "completions/mean_terminated_length": 1223.75, "completions/min_length": 908.0, "completions/min_terminated_length": 908.0, "entropy": 0.25220474135130644, "epoch": 0.00746, "frac_reward_zero_std": 0.0, "grad_norm": 0.5470872521400452, "kl": 0.7501334678381681, "learning_rate": 9.999789689566245e-05, "loss": 0.0016, "num_tokens": 21120250.0, "reward": 4.980414867401123, "reward_std": 13.811859130859375, "rewards/rollout_reward_func/mean": 4.980414867401123, "rewards/rollout_reward_func/std": 15.705443382263184, "sampling/importance_sampling_ratio/max": 1.4358227252960205, "sampling/importance_sampling_ratio/mean": 0.9660643339157104, "sampling/importance_sampling_ratio/min": 0.4660157859325409, "sampling/sampling_logp_difference/max": 0.5715584754943848, "sampling/sampling_logp_difference/mean": 0.011051887646317482, "step": 373, "step_time": 37.840678287995615 }, { "clip_ratio/high_max": 0.07236842135898769, "clip_ratio/high_mean": 0.021564327646046877, "clip_ratio/low_mean": 0.025087612215429544, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04665194044355303, "entropy": 0.2590667102485895, "epoch": 0.00748, "grad_norm": 0.33827197551727295, "kl": 0.7311984747648239, "learning_rate": 9.999788439593031e-05, "loss": -0.0111, "step": 374, "step_time": 8.941922394003996 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0008680555620230734, "clip_ratio/low_mean": 0.0008680555620230734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0, "completions/max_length": 1351.0, "completions/max_terminated_length": 1351.0, "completions/mean_length": 1216.34375, "completions/mean_terminated_length": 1216.34375, "completions/min_length": 993.0, "completions/min_terminated_length": 993.0, "entropy": 0.2550716269761324, "epoch": 0.0075, "frac_reward_zero_std": 0.0, "grad_norm": 0.8335476517677307, "kl": 0.9307033438235521, "learning_rate": 9.999787185916331e-05, "loss": 0.0294, "num_tokens": 21249311.0, "reward": 5.41689920425415, "reward_std": 12.388166427612305, "rewards/rollout_reward_func/mean": 5.416898727416992, "rewards/rollout_reward_func/std": 13.267603874206543, "sampling/importance_sampling_ratio/max": 1.4684741497039795, "sampling/importance_sampling_ratio/mean": 1.0054916143417358, "sampling/importance_sampling_ratio/min": 0.6331810355186462, "sampling/sampling_logp_difference/max": 0.2799299955368042, "sampling/sampling_logp_difference/mean": 0.01017037034034729, "step": 375, "step_time": 38.13824768100312 }, { "clip_ratio/high_max": 0.06950894417241216, "clip_ratio/high_mean": 0.021580452797934413, "clip_ratio/low_mean": 0.02794391370844096, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.049524366680998355, "entropy": 0.2559032328426838, "epoch": 0.00752, "grad_norm": 0.2704547047615051, "kl": 0.9575543515384197, "learning_rate": 9.999785928536148e-05, "loss": 0.0164, "step": 376, "step_time": 9.173922988000413 }, { "clip_ratio/high_max": 0.00657894741743803, "clip_ratio/high_mean": 0.0016447368543595076, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016447368543595076, "completions/clipped_ratio": 0.0, "completions/max_length": 1330.0, "completions/max_terminated_length": 1330.0, "completions/mean_length": 1201.6875, "completions/mean_terminated_length": 1201.6875, "completions/min_length": 1002.0, "completions/min_terminated_length": 1002.0, "entropy": 0.2529036393389106, "epoch": 0.00754, "frac_reward_zero_std": 0.0, "grad_norm": 0.5121884346008301, "kl": 0.7823121659457684, "learning_rate": 9.999784667452484e-05, "loss": -0.0058, "num_tokens": 21377388.0, "reward": 5.937844753265381, "reward_std": 10.75206184387207, "rewards/rollout_reward_func/mean": 5.937845230102539, "rewards/rollout_reward_func/std": 10.630824089050293, "sampling/importance_sampling_ratio/max": 1.2855048179626465, "sampling/importance_sampling_ratio/mean": 0.9824115037918091, "sampling/importance_sampling_ratio/min": 0.7048435807228088, "sampling/sampling_logp_difference/max": 0.3823585510253906, "sampling/sampling_logp_difference/mean": 0.010149901732802391, "step": 377, "step_time": 37.091166489997704 }, { "clip_ratio/high_max": 0.030701754614710808, "clip_ratio/high_mean": 0.011025112122297287, "clip_ratio/low_mean": 0.024379125621635467, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03540423803497106, "entropy": 0.24686269089579582, "epoch": 0.00756, "grad_norm": 0.308444082736969, "kl": 0.7720872350037098, "learning_rate": 9.999783402665338e-05, "loss": -0.0141, "step": 378, "step_time": 8.784612373994605 }, { "clip_ratio/high_max": 0.01686507952399552, "clip_ratio/high_mean": 0.00421626988099888, "clip_ratio/low_mean": 0.0035807291860692203, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0077969990670681, "completions/clipped_ratio": 0.0, "completions/max_length": 1343.0, "completions/max_terminated_length": 1343.0, "completions/mean_length": 1210.0625, "completions/mean_terminated_length": 1210.0625, "completions/min_length": 720.0, "completions/min_terminated_length": 720.0, "entropy": 0.2716317633166909, "epoch": 0.00758, "frac_reward_zero_std": 0.0, "grad_norm": 0.6121698021888733, "kl": 0.925221860408783, "learning_rate": 9.999782134174711e-05, "loss": -0.0013, "num_tokens": 21506045.0, "reward": 2.2015371322631836, "reward_std": 15.299311637878418, "rewards/rollout_reward_func/mean": 2.2015371322631836, "rewards/rollout_reward_func/std": 15.50017261505127, "sampling/importance_sampling_ratio/max": 1.400822639465332, "sampling/importance_sampling_ratio/mean": 0.9883875846862793, "sampling/importance_sampling_ratio/min": 0.625456690788269, "sampling/sampling_logp_difference/max": 0.3319031000137329, "sampling/sampling_logp_difference/mean": 0.011153844185173512, "step": 379, "step_time": 36.844649089001905 }, { "clip_ratio/high_max": 0.05834899842739105, "clip_ratio/high_mean": 0.017242478381376714, "clip_ratio/low_mean": 0.03576550219440833, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05300798005191609, "entropy": 0.2637836243957281, "epoch": 0.0076, "grad_norm": 0.4454600512981415, "kl": 0.9274842478334904, "learning_rate": 9.999780861980607e-05, "loss": -0.0126, "step": 380, "step_time": 9.811575109000842 }, { "clip_ratio/high_max": 0.010620915098115802, "clip_ratio/high_mean": 0.0026552287745289505, "clip_ratio/low_mean": 0.0009191176504828036, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003574346425011754, "completions/clipped_ratio": 0.0, "completions/max_length": 1341.0, "completions/max_terminated_length": 1341.0, "completions/mean_length": 1219.8125, "completions/mean_terminated_length": 1219.8125, "completions/min_length": 991.0, "completions/min_terminated_length": 991.0, "entropy": 0.2292822152376175, "epoch": 0.00762, "frac_reward_zero_std": 0.0, "grad_norm": 0.6078642010688782, "kl": 0.803357319906354, "learning_rate": 9.999779586083025e-05, "loss": 0.004, "num_tokens": 21635298.0, "reward": 5.408005714416504, "reward_std": 9.926593780517578, "rewards/rollout_reward_func/mean": 5.4080047607421875, "rewards/rollout_reward_func/std": 11.208430290222168, "sampling/importance_sampling_ratio/max": 1.2364863157272339, "sampling/importance_sampling_ratio/mean": 0.997908890247345, "sampling/importance_sampling_ratio/min": 0.6723216772079468, "sampling/sampling_logp_difference/max": 0.38260674476623535, "sampling/sampling_logp_difference/mean": 0.007029087748378515, "step": 381, "step_time": 37.91188854600114 }, { "clip_ratio/high_max": 0.05433114105835557, "clip_ratio/high_mean": 0.014450840826611966, "clip_ratio/low_mean": 0.02246758935507387, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03691843029810116, "entropy": 0.21105156652629375, "epoch": 0.00764, "grad_norm": 0.3075491189956665, "kl": 0.9027222413569689, "learning_rate": 9.999778306481968e-05, "loss": -0.0043, "step": 382, "step_time": 9.498284126000726 }, { "clip_ratio/high_max": 0.0036764706019312143, "clip_ratio/high_mean": 0.0009191176504828036, "clip_ratio/low_mean": 0.0008680555620230734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001787173212505877, "completions/clipped_ratio": 0.0, "completions/max_length": 1357.0, "completions/max_terminated_length": 1357.0, "completions/mean_length": 1213.765625, "completions/mean_terminated_length": 1213.765625, "completions/min_length": 957.0, "completions/min_terminated_length": 957.0, "entropy": 0.20513668935745955, "epoch": 0.00766, "frac_reward_zero_std": 0.0, "grad_norm": 0.6294713020324707, "kl": 0.7374063562601805, "learning_rate": 9.999777023177434e-05, "loss": 0.0252, "num_tokens": 21764144.0, "reward": 8.72990894317627, "reward_std": 11.312125205993652, "rewards/rollout_reward_func/mean": 8.729909896850586, "rewards/rollout_reward_func/std": 11.270212173461914, "sampling/importance_sampling_ratio/max": 1.667926549911499, "sampling/importance_sampling_ratio/mean": 1.0118814706802368, "sampling/importance_sampling_ratio/min": 0.7219305038452148, "sampling/sampling_logp_difference/max": 0.31063222885131836, "sampling/sampling_logp_difference/mean": 0.007431398145854473, "step": 383, "step_time": 37.184195774003456 }, { "clip_ratio/high_max": 0.024509804090484977, "clip_ratio/high_mean": 0.006995506584644318, "clip_ratio/low_mean": 0.034743722644634545, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.041739229462109506, "entropy": 0.19153737649321556, "epoch": 0.00768, "grad_norm": 0.37729939818382263, "kl": 1.0056524686515331, "learning_rate": 9.999775736169427e-05, "loss": 0.0245, "step": 384, "step_time": 8.749921898001048 }, { "clip_ratio/high_max": 0.0059523810632526875, "clip_ratio/high_mean": 0.0014880952658131719, "clip_ratio/low_mean": 0.004579809028655291, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0060679042944684625, "completions/clipped_ratio": 0.0, "completions/max_length": 1551.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 1410.3125, "completions/mean_terminated_length": 1410.3125, "completions/min_length": 765.0, "completions/min_terminated_length": 765.0, "entropy": 0.20708153676241636, "epoch": 0.0077, "frac_reward_zero_std": 0.0, "grad_norm": 0.738293468952179, "kl": 0.887890812009573, "learning_rate": 9.99977444545795e-05, "loss": -0.0628, "num_tokens": 21905662.0, "reward": 9.202791213989258, "reward_std": 15.181166648864746, "rewards/rollout_reward_func/mean": 9.202792167663574, "rewards/rollout_reward_func/std": 15.67770767211914, "sampling/importance_sampling_ratio/max": 1.5441806316375732, "sampling/importance_sampling_ratio/mean": 0.9917982816696167, "sampling/importance_sampling_ratio/min": 5.679499839178481e-13, "sampling/sampling_logp_difference/max": 22.66815948486328, "sampling/sampling_logp_difference/mean": 0.030492324382066727, "step": 385, "step_time": 39.20689014000345 }, { "clip_ratio/high_max": 0.023971861926838756, "clip_ratio/high_mean": 0.007481060747522861, "clip_ratio/low_mean": 0.03716492815874517, "clip_ratio/low_min": 0.0029761905316263437, "clip_ratio/region_mean": 0.04464598890626803, "entropy": 0.1879758802242577, "epoch": 0.00772, "grad_norm": 0.41567400097846985, "kl": 0.8819043859839439, "learning_rate": 9.999773151042999e-05, "loss": -0.0737, "step": 386, "step_time": 10.475744582005063 }, { "clip_ratio/high_max": 0.009424603311344981, "clip_ratio/high_mean": 0.0023561508278362453, "clip_ratio/low_mean": 0.0007440476329065859, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031001984607428312, "completions/clipped_ratio": 0.0, "completions/max_length": 1556.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 1433.34375, "completions/mean_terminated_length": 1433.34375, "completions/min_length": 1211.0, "completions/min_terminated_length": 1211.0, "entropy": 0.15365674067288637, "epoch": 0.00774, "frac_reward_zero_std": 0.0, "grad_norm": 0.5962560176849365, "kl": 0.6632435545325279, "learning_rate": 9.99977185292458e-05, "loss": 0.0216, "num_tokens": 22048591.0, "reward": 13.268250465393066, "reward_std": 13.775822639465332, "rewards/rollout_reward_func/mean": 13.268250465393066, "rewards/rollout_reward_func/std": 14.63206958770752, "sampling/importance_sampling_ratio/max": 1.2218079566955566, "sampling/importance_sampling_ratio/mean": 0.9793609380722046, "sampling/importance_sampling_ratio/min": 0.6325286626815796, "sampling/sampling_logp_difference/max": 0.38329482078552246, "sampling/sampling_logp_difference/mean": 0.0064071910455822945, "step": 387, "step_time": 41.232673029999205 }, { "clip_ratio/high_max": 0.05530754057690501, "clip_ratio/high_mean": 0.01680307579226792, "clip_ratio/low_mean": 0.014248512219637632, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031051588244736195, "entropy": 0.14256418915465474, "epoch": 0.00776, "grad_norm": 0.527172863483429, "kl": 0.646535612642765, "learning_rate": 9.999770551102692e-05, "loss": 0.0167, "step": 388, "step_time": 10.636822301992652 }, { "clip_ratio/high_max": 0.0031250000465661287, "clip_ratio/high_mean": 0.0007812500116415322, "clip_ratio/low_mean": 0.0007440476329065859, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015252976445481181, "completions/clipped_ratio": 0.0, "completions/max_length": 1545.0, "completions/max_terminated_length": 1545.0, "completions/mean_length": 1429.21875, "completions/mean_terminated_length": 1429.21875, "completions/min_length": 1226.0, "completions/min_terminated_length": 1226.0, "entropy": 0.14011064730584621, "epoch": 0.00778, "frac_reward_zero_std": 0.0, "grad_norm": 0.5465406775474548, "kl": 0.6449617743492126, "learning_rate": 9.999769245577337e-05, "loss": -0.0416, "num_tokens": 22191273.0, "reward": 10.615909576416016, "reward_std": 10.947202682495117, "rewards/rollout_reward_func/mean": 10.615909576416016, "rewards/rollout_reward_func/std": 12.735282897949219, "sampling/importance_sampling_ratio/max": 2.317744016647339, "sampling/importance_sampling_ratio/mean": 1.0246381759643555, "sampling/importance_sampling_ratio/min": 0.2836526930332184, "sampling/sampling_logp_difference/max": 1.3213729858398438, "sampling/sampling_logp_difference/mean": 0.008526146411895752, "step": 389, "step_time": 41.60411787599878 }, { "clip_ratio/high_max": 0.03645833395421505, "clip_ratio/high_mean": 0.01361607201397419, "clip_ratio/low_mean": 0.013582785322796553, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02719885722035542, "entropy": 0.15200490225106478, "epoch": 0.0078, "grad_norm": 0.4452749192714691, "kl": 0.5898908544331789, "learning_rate": 9.999767936348516e-05, "loss": -0.05, "step": 390, "step_time": 10.0632308130007 }, { "clip_ratio/high_max": 0.0029761905316263437, "clip_ratio/high_mean": 0.0007440476329065859, "clip_ratio/low_mean": 0.0014880952658131719, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002232142898719758, "completions/clipped_ratio": 0.0, "completions/max_length": 1550.0, "completions/max_terminated_length": 1550.0, "completions/mean_length": 1416.0625, "completions/mean_terminated_length": 1416.0625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.16107679810374975, "epoch": 0.00782, "frac_reward_zero_std": 0.0, "grad_norm": 0.5122284889221191, "kl": 0.5392248686403036, "learning_rate": 9.999766623416232e-05, "loss": -0.0577, "num_tokens": 22333164.0, "reward": 14.949935913085938, "reward_std": 16.67510414123535, "rewards/rollout_reward_func/mean": 14.949935913085938, "rewards/rollout_reward_func/std": 18.703474044799805, "sampling/importance_sampling_ratio/max": 1.4272053241729736, "sampling/importance_sampling_ratio/mean": 0.9347177743911743, "sampling/importance_sampling_ratio/min": 0.16998553276062012, "sampling/sampling_logp_difference/max": 1.3626210689544678, "sampling/sampling_logp_difference/mean": 0.009718427434563637, "step": 391, "step_time": 39.99297312899398 }, { "clip_ratio/high_max": 0.02976190554909408, "clip_ratio/high_mean": 0.008928571594879031, "clip_ratio/low_mean": 0.015298011188860983, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.024226582725532353, "entropy": 0.14390948927029967, "epoch": 0.00784, "grad_norm": 0.4105764627456665, "kl": 0.5607901010662317, "learning_rate": 9.999765306780482e-05, "loss": -0.0626, "step": 392, "step_time": 10.043341166003302 }, { "clip_ratio/high_max": 0.009077381109818816, "clip_ratio/high_mean": 0.002269345277454704, "clip_ratio/low_mean": 0.0007440476329065859, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00301339291036129, "completions/clipped_ratio": 0.0, "completions/max_length": 1554.0, "completions/max_terminated_length": 1554.0, "completions/mean_length": 1443.640625, "completions/mean_terminated_length": 1443.640625, "completions/min_length": 1069.0, "completions/min_terminated_length": 1069.0, "entropy": 0.12537508364766836, "epoch": 0.00786, "frac_reward_zero_std": 0.0, "grad_norm": 0.7089640498161316, "kl": 0.9093821812421083, "learning_rate": 9.99976398644127e-05, "loss": 0.0186, "num_tokens": 22476782.0, "reward": 11.492524147033691, "reward_std": 15.943157196044922, "rewards/rollout_reward_func/mean": 11.492524147033691, "rewards/rollout_reward_func/std": 16.71925163269043, "sampling/importance_sampling_ratio/max": 1.7644160985946655, "sampling/importance_sampling_ratio/mean": 0.9914994239807129, "sampling/importance_sampling_ratio/min": 0.7484045028686523, "sampling/sampling_logp_difference/max": 0.4207209348678589, "sampling/sampling_logp_difference/mean": 0.006036281120032072, "step": 393, "step_time": 40.15463091899983 }, { "clip_ratio/high_max": 0.01800595293752849, "clip_ratio/high_mean": 0.005245535809081048, "clip_ratio/low_mean": 0.018960813991725445, "clip_ratio/low_min": 0.0029761905316263437, "clip_ratio/region_mean": 0.024206349917221814, "entropy": 0.11524984752759337, "epoch": 0.00788, "grad_norm": 0.7076042890548706, "kl": 0.7414491530507803, "learning_rate": 9.9997626623986e-05, "loss": 0.0115, "step": 394, "step_time": 10.618017185999634 }, { "clip_ratio/high_max": 0.0029761905316263437, "clip_ratio/high_mean": 0.0007440476329065859, "clip_ratio/low_mean": 0.0007440476329065859, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014880952658131719, "completions/clipped_ratio": 0.0, "completions/max_length": 1563.0, "completions/max_terminated_length": 1563.0, "completions/mean_length": 1443.5, "completions/mean_terminated_length": 1443.5, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "entropy": 0.12252500653266907, "epoch": 0.0079, "frac_reward_zero_std": 0.0, "grad_norm": 0.7846469283103943, "kl": 0.7317866403609514, "learning_rate": 9.999761334652469e-05, "loss": 0.0075, "num_tokens": 22620477.0, "reward": 11.849661827087402, "reward_std": 16.187042236328125, "rewards/rollout_reward_func/mean": 11.849662780761719, "rewards/rollout_reward_func/std": 17.399803161621094, "sampling/importance_sampling_ratio/max": 1.4447773694992065, "sampling/importance_sampling_ratio/mean": 1.0075819492340088, "sampling/importance_sampling_ratio/min": 0.663360595703125, "sampling/sampling_logp_difference/max": 0.43144845962524414, "sampling/sampling_logp_difference/mean": 0.007304108701646328, "step": 395, "step_time": 40.574595107005734 }, { "clip_ratio/high_max": 0.033482143422588706, "clip_ratio/high_mean": 0.011425047181546688, "clip_ratio/low_mean": 0.01829117111628875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02971621841425076, "entropy": 0.12393791414797306, "epoch": 0.00792, "grad_norm": 0.38290056586265564, "kl": 0.7317893952131271, "learning_rate": 9.999760003202881e-05, "loss": 0.0033, "step": 396, "step_time": 10.742739806995814 }, { "clip_ratio/high_max": 0.01205357164144516, "clip_ratio/high_mean": 0.00301339291036129, "clip_ratio/low_mean": 0.0007440476329065859, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003757440543267876, "completions/clipped_ratio": 0.0, "completions/max_length": 1543.0, "completions/max_terminated_length": 1543.0, "completions/mean_length": 1444.578125, "completions/mean_terminated_length": 1444.578125, "completions/min_length": 1287.0, "completions/min_terminated_length": 1287.0, "entropy": 0.13034009747207165, "epoch": 0.00794, "frac_reward_zero_std": 0.0, "grad_norm": 0.7314363121986389, "kl": 0.6178351659327745, "learning_rate": 9.999758668049833e-05, "loss": -0.0157, "num_tokens": 22764146.0, "reward": 11.904011726379395, "reward_std": 15.453010559082031, "rewards/rollout_reward_func/mean": 11.904010772705078, "rewards/rollout_reward_func/std": 16.291580200195312, "sampling/importance_sampling_ratio/max": 1.2977502346038818, "sampling/importance_sampling_ratio/mean": 0.9704984426498413, "sampling/importance_sampling_ratio/min": 0.6586284637451172, "sampling/sampling_logp_difference/max": 0.34184467792510986, "sampling/sampling_logp_difference/mean": 0.006397986318916082, "step": 397, "step_time": 40.79752054799974 }, { "clip_ratio/high_max": 0.043154762824997306, "clip_ratio/high_mean": 0.014508928987197578, "clip_ratio/low_mean": 0.025279997498728335, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.039788926660548896, "entropy": 0.11147738387808204, "epoch": 0.00796, "grad_norm": 0.28320473432540894, "kl": 0.7334012817591429, "learning_rate": 9.999757329193333e-05, "loss": -0.021, "step": 398, "step_time": 9.331709539997973 }, { "clip_ratio/high_max": 0.0028409091755747795, "clip_ratio/high_mean": 0.0007102272938936949, "clip_ratio/low_mean": 0.0007440476329065859, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014542749268002808, "completions/clipped_ratio": 0.0, "completions/max_length": 1546.0, "completions/max_terminated_length": 1546.0, "completions/mean_length": 1455.65625, "completions/mean_terminated_length": 1455.65625, "completions/min_length": 1290.0, "completions/min_terminated_length": 1290.0, "entropy": 0.12526550004258752, "epoch": 0.00798, "frac_reward_zero_std": 0.0, "grad_norm": 0.7262594103813171, "kl": 0.6103415302932262, "learning_rate": 9.999755986633378e-05, "loss": -0.0318, "num_tokens": 22908577.0, "reward": 9.555159568786621, "reward_std": 12.746781349182129, "rewards/rollout_reward_func/mean": 9.555160522460938, "rewards/rollout_reward_func/std": 14.475045204162598, "sampling/importance_sampling_ratio/max": 1.3095201253890991, "sampling/importance_sampling_ratio/mean": 0.9779645204544067, "sampling/importance_sampling_ratio/min": 7.978658610397404e-16, "sampling/sampling_logp_difference/max": 27.056884765625, "sampling/sampling_logp_difference/mean": 0.034079719334840775, "step": 399, "step_time": 41.13124246700136 }, { "clip_ratio/high_max": 0.033107627648860216, "clip_ratio/high_mean": 0.010509049869142473, "clip_ratio/low_mean": 0.02362351247575134, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03413256263593212, "entropy": 0.1209962465800345, "epoch": 0.008, "grad_norm": 0.35216766595840454, "kl": 0.6524146590381861, "learning_rate": 9.99975464036997e-05, "loss": -0.044, "step": 400, "step_time": 10.55650918399806 }, { "clip_ratio/high_max": 0.0059523810632526875, "clip_ratio/high_mean": 0.0014880952658131719, "clip_ratio/low_mean": 0.0015252976445481181, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00301339291036129, "completions/clipped_ratio": 0.0, "completions/max_length": 1555.0, "completions/max_terminated_length": 1555.0, "completions/mean_length": 1432.03125, "completions/mean_terminated_length": 1432.03125, "completions/min_length": 1248.0, "completions/min_terminated_length": 1248.0, "entropy": 0.11135548166930676, "epoch": 0.00802, "frac_reward_zero_std": 0.0, "grad_norm": 1.133269190788269, "kl": 0.7685734387487173, "learning_rate": 9.99975329040311e-05, "loss": 0.0237, "num_tokens": 23051446.0, "reward": 10.270038604736328, "reward_std": 15.682093620300293, "rewards/rollout_reward_func/mean": 10.270038604736328, "rewards/rollout_reward_func/std": 16.255008697509766, "sampling/importance_sampling_ratio/max": 1.4514762163162231, "sampling/importance_sampling_ratio/mean": 1.0226449966430664, "sampling/importance_sampling_ratio/min": 0.7271938920021057, "sampling/sampling_logp_difference/max": 0.45901012420654297, "sampling/sampling_logp_difference/mean": 0.005241828970611095, "step": 401, "step_time": 41.26497857599861 }, { "clip_ratio/high_max": 0.03290043352171779, "clip_ratio/high_mean": 0.008225108380429447, "clip_ratio/low_mean": 0.015327381319366395, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023552489699795842, "entropy": 0.11777450842782855, "epoch": 0.00804, "grad_norm": 0.928424060344696, "kl": 1.0143736563622952, "learning_rate": 9.999751936732799e-05, "loss": 0.0269, "step": 402, "step_time": 10.720703897995918 }, { "clip_ratio/high_max": 0.008928571594879031, "clip_ratio/high_mean": 0.0029761905316263437, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029761905316263437, "completions/clipped_ratio": 0.0, "completions/max_length": 1558.0, "completions/max_terminated_length": 1558.0, "completions/mean_length": 1402.484375, "completions/mean_terminated_length": 1402.484375, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "entropy": 0.12977053970098495, "epoch": 0.00806, "frac_reward_zero_std": 0.0, "grad_norm": 0.5543701648712158, "kl": 0.6270178612321615, "learning_rate": 9.999750579359041e-05, "loss": 0.0183, "num_tokens": 23192365.0, "reward": 10.964115142822266, "reward_std": 14.9024658203125, "rewards/rollout_reward_func/mean": 10.964115142822266, "rewards/rollout_reward_func/std": 15.60954475402832, "sampling/importance_sampling_ratio/max": 1.254056453704834, "sampling/importance_sampling_ratio/mean": 0.9866700768470764, "sampling/importance_sampling_ratio/min": 0.661080539226532, "sampling/sampling_logp_difference/max": 0.2775760889053345, "sampling/sampling_logp_difference/mean": 0.005836261436343193, "step": 403, "step_time": 40.087825246997454 }, { "clip_ratio/high_max": 0.029910714831203222, "clip_ratio/high_mean": 0.009676001209300011, "clip_ratio/low_mean": 0.016021825780626386, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.025697826989926398, "entropy": 0.13238740153610706, "epoch": 0.00808, "grad_norm": 0.44548580050468445, "kl": 0.718162702396512, "learning_rate": 9.999749218281836e-05, "loss": 0.0147, "step": 404, "step_time": 9.659750243004964 }, { "clip_ratio/high_max": 0.0029761905316263437, "clip_ratio/high_mean": 0.0007440476329065859, "clip_ratio/low_mean": 0.0007440476329065859, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014880952658131719, "completions/clipped_ratio": 0.0, "completions/max_length": 1552.0, "completions/max_terminated_length": 1552.0, "completions/mean_length": 1440.421875, "completions/mean_terminated_length": 1440.421875, "completions/min_length": 1154.0, "completions/min_terminated_length": 1154.0, "entropy": 0.12935744831338525, "epoch": 0.0081, "frac_reward_zero_std": 0.0, "grad_norm": 0.5480543375015259, "kl": 0.6046669036149979, "learning_rate": 9.999747853501184e-05, "loss": 0.0137, "num_tokens": 23335798.0, "reward": 12.202452659606934, "reward_std": 18.661951065063477, "rewards/rollout_reward_func/mean": 12.20245361328125, "rewards/rollout_reward_func/std": 20.890966415405273, "sampling/importance_sampling_ratio/max": 1.5541430711746216, "sampling/importance_sampling_ratio/mean": 1.0242815017700195, "sampling/importance_sampling_ratio/min": 0.6801992058753967, "sampling/sampling_logp_difference/max": 0.38781797885894775, "sampling/sampling_logp_difference/mean": 0.006136234849691391, "step": 405, "step_time": 40.755317154002114 }, { "clip_ratio/high_max": 0.02380952425301075, "clip_ratio/high_mean": 0.00744047638727352, "clip_ratio/low_mean": 0.018129960633814335, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.025570437079295516, "entropy": 0.12149734795093536, "epoch": 0.00812, "grad_norm": 0.26514580845832825, "kl": 0.64109767973423, "learning_rate": 9.999746485017087e-05, "loss": 0.0087, "step": 406, "step_time": 10.136832774996947 }, { "clip_ratio/high_max": 0.0059523810632526875, "clip_ratio/high_mean": 0.0014880952658131719, "clip_ratio/low_mean": 0.002232142898719758, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0037202381645329297, "completions/clipped_ratio": 0.0, "completions/max_length": 1554.0, "completions/max_terminated_length": 1554.0, "completions/mean_length": 1405.703125, "completions/mean_terminated_length": 1405.703125, "completions/min_length": 666.0, "completions/min_terminated_length": 666.0, "entropy": 0.12104977620765567, "epoch": 0.00814, "frac_reward_zero_std": 0.0, "grad_norm": 0.35428720712661743, "kl": 0.6081782300025225, "learning_rate": 9.999745112829547e-05, "loss": 0.0047, "num_tokens": 23476941.0, "reward": 10.940488815307617, "reward_std": 14.940820693969727, "rewards/rollout_reward_func/mean": 10.940488815307617, "rewards/rollout_reward_func/std": 15.13664436340332, "sampling/importance_sampling_ratio/max": 1.254475712776184, "sampling/importance_sampling_ratio/mean": 0.9845165014266968, "sampling/importance_sampling_ratio/min": 0.6197980642318726, "sampling/sampling_logp_difference/max": 0.40376973152160645, "sampling/sampling_logp_difference/mean": 0.00637152511626482, "step": 407, "step_time": 40.52080072600438 }, { "clip_ratio/high_max": 0.02380952425301075, "clip_ratio/high_mean": 0.007440476329065859, "clip_ratio/low_mean": 0.026450893783476204, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033891370287165046, "entropy": 0.11284881783649325, "epoch": 0.00816, "grad_norm": 0.26076704263687134, "kl": 0.6312750466167927, "learning_rate": 9.999743736938565e-05, "loss": -0.0013, "step": 408, "step_time": 10.76028649699765 }, { "clip_ratio/high_max": 0.0028409091755747795, "clip_ratio/high_mean": 0.0007102272938936949, "clip_ratio/low_mean": 0.002232142898719758, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029423701926134527, "completions/clipped_ratio": 0.0, "completions/max_length": 1546.0, "completions/max_terminated_length": 1546.0, "completions/mean_length": 1414.90625, "completions/mean_terminated_length": 1414.90625, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "entropy": 0.11457140510901809, "epoch": 0.00818, "frac_reward_zero_std": 0.0, "grad_norm": 0.668945848941803, "kl": 0.5923841055482626, "learning_rate": 9.999742357344142e-05, "loss": 0.0624, "num_tokens": 23618723.0, "reward": 10.537452697753906, "reward_std": 15.241682052612305, "rewards/rollout_reward_func/mean": 10.537453651428223, "rewards/rollout_reward_func/std": 16.505765914916992, "sampling/importance_sampling_ratio/max": 1.2935158014297485, "sampling/importance_sampling_ratio/mean": 0.9813590049743652, "sampling/importance_sampling_ratio/min": 2.974116992179171e-14, "sampling/sampling_logp_difference/max": 25.953086853027344, "sampling/sampling_logp_difference/mean": 0.028037957847118378, "step": 409, "step_time": 40.691261925003346 }, { "clip_ratio/high_max": 0.04437229549512267, "clip_ratio/high_mean": 0.011837121448479593, "clip_ratio/low_mean": 0.016443452972453088, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028280574886593968, "entropy": 0.1118474374525249, "epoch": 0.0082, "grad_norm": 0.2630373537540436, "kl": 0.6832827776670456, "learning_rate": 9.999740974046282e-05, "loss": 0.0566, "step": 410, "step_time": 9.717429660999187 }, { "clip_ratio/high_max": 0.011904762126505375, "clip_ratio/high_mean": 0.004464285797439516, "clip_ratio/low_mean": 0.0007440476329065859, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334303461015, "completions/clipped_ratio": 0.0, "completions/max_length": 1543.0, "completions/max_terminated_length": 1543.0, "completions/mean_length": 1393.609375, "completions/mean_terminated_length": 1393.609375, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "entropy": 0.1146247279830277, "epoch": 0.00822, "frac_reward_zero_std": 0.0, "grad_norm": 0.47182416915893555, "kl": 0.5627781376242638, "learning_rate": 9.999739587044981e-05, "loss": -0.0341, "num_tokens": 23759122.0, "reward": 8.971721649169922, "reward_std": 14.443693161010742, "rewards/rollout_reward_func/mean": 8.971721649169922, "rewards/rollout_reward_func/std": 14.68343448638916, "sampling/importance_sampling_ratio/max": 1.243363857269287, "sampling/importance_sampling_ratio/mean": 0.9929588437080383, "sampling/importance_sampling_ratio/min": 0.7046716809272766, "sampling/sampling_logp_difference/max": 0.35747838020324707, "sampling/sampling_logp_difference/mean": 0.005684119649231434, "step": 411, "step_time": 39.962890398002855 }, { "clip_ratio/high_max": 0.035714286379516125, "clip_ratio/high_mean": 0.009672619227785617, "clip_ratio/low_mean": 0.01767113123787567, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02734375116415322, "entropy": 0.11477407393977046, "epoch": 0.00824, "grad_norm": 0.24663475155830383, "kl": 0.6022106558084488, "learning_rate": 9.999738196340245e-05, "loss": -0.0386, "step": 412, "step_time": 9.870993509000982 }, { "clip_ratio/high_max": 0.0029761905316263437, "clip_ratio/high_mean": 0.0007440476329065859, "clip_ratio/low_mean": 0.002232142898719758, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029761905316263437, "completions/clipped_ratio": 0.0, "completions/max_length": 1563.0, "completions/max_terminated_length": 1563.0, "completions/mean_length": 1434.671875, "completions/mean_terminated_length": 1434.671875, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.11288065044209361, "epoch": 0.00826, "frac_reward_zero_std": 0.0, "grad_norm": 0.38144180178642273, "kl": 0.7383872698992491, "learning_rate": 9.999736801932072e-05, "loss": 0.0133, "num_tokens": 23902181.0, "reward": 13.304646492004395, "reward_std": 20.157991409301758, "rewards/rollout_reward_func/mean": 13.304647445678711, "rewards/rollout_reward_func/std": 21.064607620239258, "sampling/importance_sampling_ratio/max": 1.3603137731552124, "sampling/importance_sampling_ratio/mean": 1.0158387422561646, "sampling/importance_sampling_ratio/min": 0.7469893097877502, "sampling/sampling_logp_difference/max": 0.2501299977302551, "sampling/sampling_logp_difference/mean": 0.004556077066808939, "step": 413, "step_time": 41.25988831600807 }, { "clip_ratio/high_max": 0.017857143422588706, "clip_ratio/high_mean": 0.0044642858556471765, "clip_ratio/low_mean": 0.012648809934034944, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01711309573147446, "entropy": 0.10936349909752607, "epoch": 0.00828, "grad_norm": 0.2556546628475189, "kl": 0.7252329587936401, "learning_rate": 9.999735403820466e-05, "loss": 0.0102, "step": 414, "step_time": 10.573283408997668 }, { "clip_ratio/high_max": 0.0059523810632526875, "clip_ratio/high_mean": 0.0014880952658131719, "clip_ratio/low_mean": 0.0007440476329065859, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002232142898719758, "completions/clipped_ratio": 0.0, "completions/max_length": 1562.0, "completions/max_terminated_length": 1562.0, "completions/mean_length": 1493.640625, "completions/mean_terminated_length": 1493.640625, "completions/min_length": 1359.0, "completions/min_terminated_length": 1359.0, "entropy": 0.11034470843151212, "epoch": 0.0083, "frac_reward_zero_std": 0.0, "grad_norm": 0.7242380380630493, "kl": 0.6212767362594604, "learning_rate": 9.999734002005428e-05, "loss": -0.0155, "num_tokens": 24049141.0, "reward": 9.928826332092285, "reward_std": 15.976888656616211, "rewards/rollout_reward_func/mean": 9.928826332092285, "rewards/rollout_reward_func/std": 16.414718627929688, "sampling/importance_sampling_ratio/max": 1.3260316848754883, "sampling/importance_sampling_ratio/mean": 1.0085797309875488, "sampling/importance_sampling_ratio/min": 0.5519727468490601, "sampling/sampling_logp_difference/max": 0.5959200859069824, "sampling/sampling_logp_difference/mean": 0.006388316862285137, "step": 415, "step_time": 40.88382640199961 }, { "clip_ratio/high_max": 0.014880952658131719, "clip_ratio/high_mean": 0.005952381121460348, "clip_ratio/low_mean": 0.019494048377964646, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.025446429615840316, "entropy": 0.09857920417562127, "epoch": 0.00832, "grad_norm": 0.39599546790122986, "kl": 0.7278024889528751, "learning_rate": 9.99973259648696e-05, "loss": -0.013, "step": 416, "step_time": 10.850284384998304 }, { "clip_ratio/high_max": 0.0059523810632526875, "clip_ratio/high_mean": 0.0014880952658131719, "clip_ratio/low_mean": 0.0007440476329065859, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002232142898719758, "completions/clipped_ratio": 0.0, "completions/max_length": 1541.0, "completions/max_terminated_length": 1541.0, "completions/mean_length": 1388.96875, "completions/mean_terminated_length": 1388.96875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.1026167522650212, "epoch": 0.00834, "frac_reward_zero_std": 0.0, "grad_norm": 0.4579165577888489, "kl": 0.8239834625273943, "learning_rate": 9.99973118726506e-05, "loss": -0.0484, "num_tokens": 24189178.0, "reward": 12.621437072753906, "reward_std": 16.67880630493164, "rewards/rollout_reward_func/mean": 12.621437072753906, "rewards/rollout_reward_func/std": 17.352924346923828, "sampling/importance_sampling_ratio/max": 1.2838397026062012, "sampling/importance_sampling_ratio/mean": 1.0156192779541016, "sampling/importance_sampling_ratio/min": 0.6750461459159851, "sampling/sampling_logp_difference/max": 0.2394113540649414, "sampling/sampling_logp_difference/mean": 0.004534607753157616, "step": 417, "step_time": 39.856104094997136 }, { "clip_ratio/high_max": 0.038690477376803756, "clip_ratio/high_mean": 0.011904762184713036, "clip_ratio/low_mean": 0.01116071455180645, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02306547696935013, "entropy": 0.11136638512834907, "epoch": 0.00836, "grad_norm": 0.2160414755344391, "kl": 0.6315647587180138, "learning_rate": 9.999729774339733e-05, "loss": -0.0554, "step": 418, "step_time": 9.950184918994637 }, { "clip_ratio/high_max": 0.0028409091755747795, "clip_ratio/high_mean": 0.0007102272938936949, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007102272938936949, "completions/clipped_ratio": 0.0, "completions/max_length": 1545.0, "completions/max_terminated_length": 1545.0, "completions/mean_length": 1415.34375, "completions/mean_terminated_length": 1415.34375, "completions/min_length": 741.0, "completions/min_terminated_length": 741.0, "entropy": 0.12300179339945316, "epoch": 0.00838, "frac_reward_zero_std": 0.0, "grad_norm": 0.35927513241767883, "kl": 0.563910448923707, "learning_rate": 9.999728357710979e-05, "loss": -0.0024, "num_tokens": 24330939.0, "reward": 10.211483001708984, "reward_std": 12.243392944335938, "rewards/rollout_reward_func/mean": 10.211483001708984, "rewards/rollout_reward_func/std": 12.923269271850586, "sampling/importance_sampling_ratio/max": 1.561508297920227, "sampling/importance_sampling_ratio/mean": 0.9852752089500427, "sampling/importance_sampling_ratio/min": 0.6525661945343018, "sampling/sampling_logp_difference/max": 0.421316921710968, "sampling/sampling_logp_difference/mean": 0.005881062708795071, "step": 419, "step_time": 40.82010957399871 }, { "clip_ratio/high_max": 0.023403680184856057, "clip_ratio/high_mean": 0.00801836303435266, "clip_ratio/low_mean": 0.005332341359462589, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01335070439381525, "entropy": 0.12413196917623281, "epoch": 0.0084, "grad_norm": 0.22808168828487396, "kl": 0.5641085561364889, "learning_rate": 9.999726937378799e-05, "loss": -0.0082, "step": 420, "step_time": 9.7226802879959 }, { "clip_ratio/high_max": 0.008928571594879031, "clip_ratio/high_mean": 0.002232142898719758, "clip_ratio/low_mean": 0.002232142898719758, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285797439516, "completions/clipped_ratio": 0.0, "completions/max_length": 1535.0, "completions/max_terminated_length": 1535.0, "completions/mean_length": 1441.640625, "completions/mean_terminated_length": 1441.640625, "completions/min_length": 864.0, "completions/min_terminated_length": 864.0, "entropy": 0.1284960494376719, "epoch": 0.00842, "frac_reward_zero_std": 0.0, "grad_norm": 0.532646656036377, "kl": 0.7414810676127672, "learning_rate": 9.999725513343196e-05, "loss": 0.0034, "num_tokens": 24474440.0, "reward": 15.56411361694336, "reward_std": 16.717456817626953, "rewards/rollout_reward_func/mean": 15.56411361694336, "rewards/rollout_reward_func/std": 16.81290626525879, "sampling/importance_sampling_ratio/max": 1.2900909185409546, "sampling/importance_sampling_ratio/mean": 1.0089163780212402, "sampling/importance_sampling_ratio/min": 0.6302499175071716, "sampling/sampling_logp_difference/max": 0.41839098930358887, "sampling/sampling_logp_difference/mean": 0.006366787478327751, "step": 421, "step_time": 41.64962637100143 }, { "clip_ratio/high_max": 0.020833333721384406, "clip_ratio/high_mean": 0.0052083334303461015, "clip_ratio/low_mean": 0.014136905199848115, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019345238688401878, "entropy": 0.12450070818886161, "epoch": 0.00844, "grad_norm": 0.3132474422454834, "kl": 0.7047660015523434, "learning_rate": 9.999724085604169e-05, "loss": -0.0014, "step": 422, "step_time": 10.727001868001025 }, { "clip_ratio/high_max": 0.0029761905316263437, "clip_ratio/high_mean": 0.0007440476329065859, "clip_ratio/low_mean": 0.002232142898719758, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029761905316263437, "completions/clipped_ratio": 0.0, "completions/max_length": 1557.0, "completions/max_terminated_length": 1557.0, "completions/mean_length": 1452.453125, "completions/mean_terminated_length": 1452.453125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.11882536578923464, "epoch": 0.00846, "frac_reward_zero_std": 0.0, "grad_norm": 0.7803420424461365, "kl": 0.8879449907690287, "learning_rate": 9.999722654161722e-05, "loss": -0.0437, "num_tokens": 24618707.0, "reward": 11.537307739257812, "reward_std": 16.87006187438965, "rewards/rollout_reward_func/mean": 11.537307739257812, "rewards/rollout_reward_func/std": 18.111291885375977, "sampling/importance_sampling_ratio/max": 2.1790900230407715, "sampling/importance_sampling_ratio/mean": 1.0079734325408936, "sampling/importance_sampling_ratio/min": 0.6660839319229126, "sampling/sampling_logp_difference/max": 1.0955865383148193, "sampling/sampling_logp_difference/mean": 0.0059229484759271145, "step": 423, "step_time": 39.62021958500554 }, { "clip_ratio/high_max": 0.02380952425301075, "clip_ratio/high_mean": 0.006696428696159273, "clip_ratio/low_mean": 0.015560741710942239, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02225717029068619, "entropy": 0.12485062563791871, "epoch": 0.00848, "grad_norm": 0.31361132860183716, "kl": 0.7454855944961309, "learning_rate": 9.999721219015854e-05, "loss": -0.0541, "step": 424, "step_time": 10.1194757400026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0014880952658131719, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014880952658131719, "completions/clipped_ratio": 0.0, "completions/max_length": 1537.0, "completions/max_terminated_length": 1537.0, "completions/mean_length": 1414.84375, "completions/mean_terminated_length": 1414.84375, "completions/min_length": 690.0, "completions/min_terminated_length": 690.0, "entropy": 0.1313102599233389, "epoch": 0.0085, "frac_reward_zero_std": 0.0, "grad_norm": 0.5788205862045288, "kl": 0.6796710211783648, "learning_rate": 9.999719780166567e-05, "loss": -0.0346, "num_tokens": 24760444.0, "reward": 10.583850860595703, "reward_std": 15.813437461853027, "rewards/rollout_reward_func/mean": 10.583850860595703, "rewards/rollout_reward_func/std": 15.782630920410156, "sampling/importance_sampling_ratio/max": 1.3160440921783447, "sampling/importance_sampling_ratio/mean": 0.9774030447006226, "sampling/importance_sampling_ratio/min": 0.7505905628204346, "sampling/sampling_logp_difference/max": 0.2754938304424286, "sampling/sampling_logp_difference/mean": 0.00678935507312417, "step": 425, "step_time": 41.97174792000442 }, { "clip_ratio/high_max": 0.020833333721384406, "clip_ratio/high_mean": 0.0052083334303461015, "clip_ratio/low_mean": 0.017931548063643277, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023139881726820022, "entropy": 0.13454774813726544, "epoch": 0.00852, "grad_norm": 0.24161121249198914, "kl": 0.6602058243006468, "learning_rate": 9.999718337613865e-05, "loss": -0.0446, "step": 426, "step_time": 9.663861974999236 }, { "clip_ratio/high_max": 0.008928571594879031, "clip_ratio/high_mean": 0.002232142898719758, "clip_ratio/low_mean": 0.0014880952658131719, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0037202381645329297, "completions/clipped_ratio": 0.0, "completions/max_length": 1540.0, "completions/max_terminated_length": 1540.0, "completions/mean_length": 1432.515625, "completions/mean_terminated_length": 1432.515625, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "entropy": 0.1410405244678259, "epoch": 0.00854, "frac_reward_zero_std": 0.0, "grad_norm": 0.5101809501647949, "kl": 0.6386174689978361, "learning_rate": 9.999716891357746e-05, "loss": 0.0369, "num_tokens": 24903364.0, "reward": 11.803701400756836, "reward_std": 16.973173141479492, "rewards/rollout_reward_func/mean": 11.803701400756836, "rewards/rollout_reward_func/std": 17.966468811035156, "sampling/importance_sampling_ratio/max": 1.7738006114959717, "sampling/importance_sampling_ratio/mean": 0.995194137096405, "sampling/importance_sampling_ratio/min": 0.6213434338569641, "sampling/sampling_logp_difference/max": 0.5084433555603027, "sampling/sampling_logp_difference/mean": 0.007640195079147816, "step": 427, "step_time": 42.486011768000026 }, { "clip_ratio/high_max": 0.02976190554909408, "clip_ratio/high_mean": 0.01116071455180645, "clip_ratio/low_mean": 0.012369791802484542, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023530506470706314, "entropy": 0.14208506979048252, "epoch": 0.00856, "grad_norm": 0.2106575071811676, "kl": 0.6240573097020388, "learning_rate": 9.999715441398214e-05, "loss": 0.0308, "step": 428, "step_time": 10.646923483993305 }, { "clip_ratio/high_max": 0.0029761905316263437, "clip_ratio/high_mean": 0.0007440476329065859, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007440476329065859, "completions/clipped_ratio": 0.0, "completions/max_length": 1573.0, "completions/max_terminated_length": 1573.0, "completions/mean_length": 1438.828125, "completions/mean_terminated_length": 1438.828125, "completions/min_length": 1255.0, "completions/min_terminated_length": 1255.0, "entropy": 0.14296143036335707, "epoch": 0.00858, "frac_reward_zero_std": 0.0, "grad_norm": 0.42578843235969543, "kl": 0.5468210577964783, "learning_rate": 9.999713987735269e-05, "loss": 0.0008, "num_tokens": 25046668.0, "reward": 12.157367706298828, "reward_std": 19.82905387878418, "rewards/rollout_reward_func/mean": 12.157367706298828, "rewards/rollout_reward_func/std": 20.11625862121582, "sampling/importance_sampling_ratio/max": 1.1917697191238403, "sampling/importance_sampling_ratio/mean": 0.988789439201355, "sampling/importance_sampling_ratio/min": 0.6782960295677185, "sampling/sampling_logp_difference/max": 0.32637321949005127, "sampling/sampling_logp_difference/mean": 0.006113000214099884, "step": 429, "step_time": 40.68629942800362 }, { "clip_ratio/high_max": 0.023809524485841393, "clip_ratio/high_mean": 0.007440476503688842, "clip_ratio/low_mean": 0.01045386923942715, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017894345801323652, "entropy": 0.14418638544157147, "epoch": 0.0086, "grad_norm": 0.268960177898407, "kl": 0.5387851055711508, "learning_rate": 9.999712530368912e-05, "loss": -0.0055, "step": 430, "step_time": 11.072718907002127 }, { "clip_ratio/high_max": 0.0029761905316263437, "clip_ratio/high_mean": 0.0007440476329065859, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007440476329065859, "completions/clipped_ratio": 0.0, "completions/max_length": 1550.0, "completions/max_terminated_length": 1550.0, "completions/mean_length": 1429.546875, "completions/mean_terminated_length": 1429.546875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.15179488621652126, "epoch": 0.00862, "frac_reward_zero_std": 0.0, "grad_norm": 0.7485275864601135, "kl": 0.5430763624608517, "learning_rate": 9.999711069299146e-05, "loss": -0.0808, "num_tokens": 25189448.0, "reward": 11.358131408691406, "reward_std": 17.856586456298828, "rewards/rollout_reward_func/mean": 11.358131408691406, "rewards/rollout_reward_func/std": 18.32318878173828, "sampling/importance_sampling_ratio/max": 1.3345392942428589, "sampling/importance_sampling_ratio/mean": 1.0265988111495972, "sampling/importance_sampling_ratio/min": 0.48013654351234436, "sampling/sampling_logp_difference/max": 0.7489854097366333, "sampling/sampling_logp_difference/mean": 0.008107547648251057, "step": 431, "step_time": 40.926112169998305 }, { "clip_ratio/high_max": 0.0654761919286102, "clip_ratio/high_mean": 0.02008928614668548, "clip_ratio/low_mean": 0.017782738606911153, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.037872024811804295, "entropy": 0.15683973440900445, "epoch": 0.00864, "grad_norm": 0.2219485342502594, "kl": 0.5109246261417866, "learning_rate": 9.99970960452597e-05, "loss": -0.0914, "step": 432, "step_time": 10.182908312996005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1552.0, "completions/max_terminated_length": 1552.0, "completions/mean_length": 1470.21875, "completions/mean_terminated_length": 1470.21875, "completions/min_length": 1344.0, "completions/min_terminated_length": 1344.0, "entropy": 0.14491091342642903, "epoch": 0.00866, "frac_reward_zero_std": 0.0, "grad_norm": 0.46287354826927185, "kl": 0.5186197776347399, "learning_rate": 9.999708136049389e-05, "loss": -0.0113, "num_tokens": 25334849.0, "reward": 10.764678955078125, "reward_std": 13.417325973510742, "rewards/rollout_reward_func/mean": 10.764678955078125, "rewards/rollout_reward_func/std": 14.159459114074707, "sampling/importance_sampling_ratio/max": 1.4547772407531738, "sampling/importance_sampling_ratio/mean": 1.0081079006195068, "sampling/importance_sampling_ratio/min": 0.7049920558929443, "sampling/sampling_logp_difference/max": 0.4709939956665039, "sampling/sampling_logp_difference/mean": 0.005667536519467831, "step": 433, "step_time": 42.1604533589998 }, { "clip_ratio/high_max": 0.0476190485060215, "clip_ratio/high_mean": 0.014136905199848115, "clip_ratio/low_mean": 0.01785714365541935, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03199404844781384, "entropy": 0.15482168877497315, "epoch": 0.00868, "grad_norm": 0.2332436740398407, "kl": 0.5066223796457052, "learning_rate": 9.9997066638694e-05, "loss": -0.0183, "step": 434, "step_time": 10.119833896998898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1575.0, "completions/max_terminated_length": 1575.0, "completions/mean_length": 1436.375, "completions/mean_terminated_length": 1436.375, "completions/min_length": 795.0, "completions/min_terminated_length": 795.0, "entropy": 0.16640883032232523, "epoch": 0.0087, "frac_reward_zero_std": 0.0, "grad_norm": 0.5225183367729187, "kl": 0.4759600590914488, "learning_rate": 9.999705187986009e-05, "loss": 0.0044, "num_tokens": 25478062.0, "reward": 11.862446784973145, "reward_std": 14.980566024780273, "rewards/rollout_reward_func/mean": 11.862445831298828, "rewards/rollout_reward_func/std": 15.403722763061523, "sampling/importance_sampling_ratio/max": 1.3265814781188965, "sampling/importance_sampling_ratio/mean": 1.0067017078399658, "sampling/importance_sampling_ratio/min": 0.6984032988548279, "sampling/sampling_logp_difference/max": 0.3158724308013916, "sampling/sampling_logp_difference/mean": 0.007522557862102985, "step": 435, "step_time": 40.53673548099687 }, { "clip_ratio/high_max": 0.08556547830812633, "clip_ratio/high_mean": 0.028087798331398517, "clip_ratio/low_mean": 0.025297619868069887, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05338541802484542, "entropy": 0.166918208822608, "epoch": 0.00872, "grad_norm": 0.5592331886291504, "kl": 0.46205065958201885, "learning_rate": 9.999703708399215e-05, "loss": -0.0001, "step": 436, "step_time": 10.790453629004332 }, { "clip_ratio/high_max": 0.017857143189758062, "clip_ratio/high_mean": 0.004464285797439516, "clip_ratio/low_mean": 0.0007440476329065859, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334303461015, "completions/clipped_ratio": 0.0, "completions/max_length": 1542.0, "completions/max_terminated_length": 1542.0, "completions/mean_length": 1430.21875, "completions/mean_terminated_length": 1430.21875, "completions/min_length": 1031.0, "completions/min_terminated_length": 1031.0, "entropy": 0.1512767318636179, "epoch": 0.00874, "frac_reward_zero_std": 0.0, "grad_norm": 0.5887247920036316, "kl": 0.47883218713104725, "learning_rate": 9.99970222510902e-05, "loss": 0.023, "num_tokens": 25620798.0, "reward": 10.20716667175293, "reward_std": 16.14691734313965, "rewards/rollout_reward_func/mean": 10.20716667175293, "rewards/rollout_reward_func/std": 17.900371551513672, "sampling/importance_sampling_ratio/max": 1.2416183948516846, "sampling/importance_sampling_ratio/mean": 0.9807419776916504, "sampling/importance_sampling_ratio/min": 0.542736291885376, "sampling/sampling_logp_difference/max": 0.36902284622192383, "sampling/sampling_logp_difference/mean": 0.0073195262812078, "step": 437, "step_time": 40.27251563699974 }, { "clip_ratio/high_max": 0.059523811331018806, "clip_ratio/high_mean": 0.02306547691114247, "clip_ratio/low_mean": 0.03698593232547864, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06005140976049006, "entropy": 0.11152059538289905, "epoch": 0.00876, "grad_norm": 0.34218233823776245, "kl": 0.599434606730938, "learning_rate": 9.999700738115424e-05, "loss": 0.0208, "step": 438, "step_time": 10.141322578992913 }, { "clip_ratio/high_max": 0.008928571594879031, "clip_ratio/high_mean": 0.002232142898719758, "clip_ratio/low_mean": 0.0007440476329065859, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029761905316263437, "completions/clipped_ratio": 0.0, "completions/max_length": 1551.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 1448.5625, "completions/mean_terminated_length": 1448.5625, "completions/min_length": 1357.0, "completions/min_terminated_length": 1357.0, "entropy": 0.09335534879937768, "epoch": 0.00878, "frac_reward_zero_std": 0.0, "grad_norm": 0.5081444382667542, "kl": 0.5331121180206537, "learning_rate": 9.999699247418432e-05, "loss": -0.0063, "num_tokens": 25764758.0, "reward": 9.246360778808594, "reward_std": 12.59730339050293, "rewards/rollout_reward_func/mean": 9.246360778808594, "rewards/rollout_reward_func/std": 14.430070877075195, "sampling/importance_sampling_ratio/max": 1.47153902053833, "sampling/importance_sampling_ratio/mean": 0.9984990358352661, "sampling/importance_sampling_ratio/min": 0.582763671875, "sampling/sampling_logp_difference/max": 0.4040945768356323, "sampling/sampling_logp_difference/mean": 0.0051497891545295715, "step": 439, "step_time": 41.78158714499841 }, { "clip_ratio/high_max": 0.020833333721384406, "clip_ratio/high_mean": 0.0052083334303461015, "clip_ratio/low_mean": 0.02083333401242271, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026041667442768812, "entropy": 0.07112342561595142, "epoch": 0.0088, "grad_norm": 0.41764187812805176, "kl": 0.8426203690469265, "learning_rate": 9.999697753018041e-05, "loss": -0.0085, "step": 440, "step_time": 10.17990355800066 }, { "clip_ratio/high_max": 0.008928571594879031, "clip_ratio/high_mean": 0.0029761905316263437, "clip_ratio/low_mean": 0.002232142898719758, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334303461015, "completions/clipped_ratio": 0.0, "completions/max_length": 1552.0, "completions/max_terminated_length": 1552.0, "completions/mean_length": 1425.234375, "completions/mean_terminated_length": 1425.234375, "completions/min_length": 690.0, "completions/min_terminated_length": 690.0, "entropy": 0.07502732030116022, "epoch": 0.00882, "frac_reward_zero_std": 0.0, "grad_norm": 0.4730430841445923, "kl": 0.598696194589138, "learning_rate": 9.999696254914256e-05, "loss": -0.0232, "num_tokens": 25907211.0, "reward": 12.0460205078125, "reward_std": 12.864827156066895, "rewards/rollout_reward_func/mean": 12.0460205078125, "rewards/rollout_reward_func/std": 13.124265670776367, "sampling/importance_sampling_ratio/max": 2.117748975753784, "sampling/importance_sampling_ratio/mean": 0.979032039642334, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2621982097625732, "sampling/sampling_logp_difference/mean": 0.006923416629433632, "step": 441, "step_time": 40.17172135200235 }, { "clip_ratio/high_max": 0.020833333721384406, "clip_ratio/high_mean": 0.006696428754366934, "clip_ratio/low_mean": 0.010491071618162096, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01718750043073669, "entropy": 0.07272043719422072, "epoch": 0.00884, "grad_norm": 0.2536933422088623, "kl": 0.6081040930002928, "learning_rate": 9.999694753107076e-05, "loss": -0.0288, "step": 442, "step_time": 10.609227344999454 }, { "clip_ratio/high_max": 0.0029761905316263437, "clip_ratio/high_mean": 0.0007440476329065859, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007440476329065859, "completions/clipped_ratio": 0.0, "completions/max_length": 1551.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 1433.484375, "completions/mean_terminated_length": 1433.484375, "completions/min_length": 637.0, "completions/min_terminated_length": 637.0, "entropy": 0.08062643301673234, "epoch": 0.00886, "frac_reward_zero_std": 0.0, "grad_norm": 0.8294975757598877, "kl": 0.5611858777701855, "learning_rate": 9.999693247596505e-05, "loss": 0.0316, "num_tokens": 26050176.0, "reward": 9.822164535522461, "reward_std": 14.750000953674316, "rewards/rollout_reward_func/mean": 9.822165489196777, "rewards/rollout_reward_func/std": 14.6282377243042, "sampling/importance_sampling_ratio/max": 1.5190024375915527, "sampling/importance_sampling_ratio/mean": 1.0036146640777588, "sampling/importance_sampling_ratio/min": 0.7604562640190125, "sampling/sampling_logp_difference/max": 0.3031894564628601, "sampling/sampling_logp_difference/mean": 0.004106580279767513, "step": 443, "step_time": 40.79559905699534 }, { "clip_ratio/high_max": 0.030257937032729387, "clip_ratio/high_mean": 0.007564484258182347, "clip_ratio/low_mean": 0.015591179952025414, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02315566421020776, "entropy": 0.0824263768736273, "epoch": 0.00888, "grad_norm": 0.9322162866592407, "kl": 0.7285797223448753, "learning_rate": 9.999691738382544e-05, "loss": 0.034, "step": 444, "step_time": 10.72277228299754 }, { "clip_ratio/high_max": 0.0029761905316263437, "clip_ratio/high_mean": 0.0007440476329065859, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007440476329065859, "completions/clipped_ratio": 0.0, "completions/max_length": 1574.0, "completions/max_terminated_length": 1574.0, "completions/mean_length": 1428.421875, "completions/mean_terminated_length": 1428.421875, "completions/min_length": 1166.0, "completions/min_terminated_length": 1166.0, "entropy": 0.07391244731843472, "epoch": 0.0089, "frac_reward_zero_std": 0.0, "grad_norm": 0.5232189893722534, "kl": 0.6245864983648062, "learning_rate": 9.999690225465193e-05, "loss": -0.0215, "num_tokens": 26192780.0, "reward": 11.33067512512207, "reward_std": 15.117729187011719, "rewards/rollout_reward_func/mean": 11.33067512512207, "rewards/rollout_reward_func/std": 16.229934692382812, "sampling/importance_sampling_ratio/max": 1.4000767469406128, "sampling/importance_sampling_ratio/mean": 1.0228557586669922, "sampling/importance_sampling_ratio/min": 0.8148965239524841, "sampling/sampling_logp_difference/max": 0.303769588470459, "sampling/sampling_logp_difference/mean": 0.003096876898780465, "step": 445, "step_time": 39.69306251100352 }, { "clip_ratio/high_max": 0.020833333721384406, "clip_ratio/high_mean": 0.0059523810632526875, "clip_ratio/low_mean": 0.011408730410039425, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017361111531499773, "entropy": 0.07312626042403281, "epoch": 0.00892, "grad_norm": 0.2677549719810486, "kl": 0.6443136036396027, "learning_rate": 9.999688708844453e-05, "loss": -0.0254, "step": 446, "step_time": 9.859747254999093 }, { "clip_ratio/high_max": 0.0064484127797186375, "clip_ratio/high_mean": 0.0023561508278362453, "clip_ratio/low_mean": 0.0014542749268002808, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003810425754636526, "completions/clipped_ratio": 0.0, "completions/max_length": 1503.0, "completions/max_terminated_length": 1503.0, "completions/mean_length": 1392.1875, "completions/mean_terminated_length": 1392.1875, "completions/min_length": 1207.0, "completions/min_terminated_length": 1207.0, "entropy": 0.10388755868189037, "epoch": 0.00894, "frac_reward_zero_std": 0.0, "grad_norm": 0.44962024688720703, "kl": 0.5700237862765789, "learning_rate": 9.999687188520327e-05, "loss": -0.0085, "num_tokens": 26333000.0, "reward": 10.396234512329102, "reward_std": 12.773336410522461, "rewards/rollout_reward_func/mean": 10.396234512329102, "rewards/rollout_reward_func/std": 13.91511058807373, "sampling/importance_sampling_ratio/max": 1.2538983821868896, "sampling/importance_sampling_ratio/mean": 1.0106072425842285, "sampling/importance_sampling_ratio/min": 0.8617662787437439, "sampling/sampling_logp_difference/max": 0.21103119850158691, "sampling/sampling_logp_difference/mean": 0.004161643795669079, "step": 447, "step_time": 40.40724292899722 }, { "clip_ratio/high_max": 0.017215219675563276, "clip_ratio/high_mean": 0.005047852551797405, "clip_ratio/low_mean": 0.011870941845700145, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01691879451391287, "entropy": 0.10565289529040456, "epoch": 0.00896, "grad_norm": 0.3113742470741272, "kl": 0.5588793251663446, "learning_rate": 9.999685664492817e-05, "loss": -0.011, "step": 448, "step_time": 9.88399511400712 }, { "clip_ratio/high_max": 0.0029761905316263437, "clip_ratio/high_mean": 0.0014880952658131719, "clip_ratio/low_mean": 0.0029761905316263437, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285797439516, "completions/clipped_ratio": 0.0, "completions/max_length": 1539.0, "completions/max_terminated_length": 1539.0, "completions/mean_length": 1425.671875, "completions/mean_terminated_length": 1425.671875, "completions/min_length": 1252.0, "completions/min_terminated_length": 1252.0, "entropy": 0.08549337997101247, "epoch": 0.00898, "frac_reward_zero_std": 0.0, "grad_norm": 0.43871885538101196, "kl": 0.5197541080415249, "learning_rate": 9.999684136761923e-05, "loss": 0.0424, "num_tokens": 26475423.0, "reward": 13.137186050415039, "reward_std": 18.040781021118164, "rewards/rollout_reward_func/mean": 13.137186050415039, "rewards/rollout_reward_func/std": 18.348669052124023, "sampling/importance_sampling_ratio/max": 2.0688071250915527, "sampling/importance_sampling_ratio/mean": 1.0355302095413208, "sampling/importance_sampling_ratio/min": 0.7141319513320923, "sampling/sampling_logp_difference/max": 0.7877845764160156, "sampling/sampling_logp_difference/mean": 0.004831024445593357, "step": 449, "step_time": 40.3097439919984 }, { "clip_ratio/high_max": 0.02380952425301075, "clip_ratio/high_mean": 0.00744047638727352, "clip_ratio/low_mean": 0.010349026299081743, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01778950251173228, "entropy": 0.09008124680258334, "epoch": 0.009, "grad_norm": 0.2819499969482422, "kl": 0.48992327228188515, "learning_rate": 9.999682605327648e-05, "loss": 0.0377, "step": 450, "step_time": 11.019723427001736 }, { "clip_ratio/high_max": 0.008928571594879031, "clip_ratio/high_mean": 0.002232142898719758, "clip_ratio/low_mean": 0.0007440476329065859, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029761905316263437, "completions/clipped_ratio": 0.0, "completions/max_length": 1561.0, "completions/max_terminated_length": 1561.0, "completions/mean_length": 1437.046875, "completions/mean_terminated_length": 1437.046875, "completions/min_length": 686.0, "completions/min_terminated_length": 686.0, "entropy": 0.09430601261556149, "epoch": 0.00902, "frac_reward_zero_std": 0.0, "grad_norm": 0.39097362756729126, "kl": 0.5122922882437706, "learning_rate": 9.99968107018999e-05, "loss": -0.0447, "num_tokens": 26618636.0, "reward": 10.664965629577637, "reward_std": 12.413619995117188, "rewards/rollout_reward_func/mean": 10.664965629577637, "rewards/rollout_reward_func/std": 12.955881118774414, "sampling/importance_sampling_ratio/max": 1.1989917755126953, "sampling/importance_sampling_ratio/mean": 0.9830008745193481, "sampling/importance_sampling_ratio/min": 0.5060357451438904, "sampling/sampling_logp_difference/max": 0.3329579830169678, "sampling/sampling_logp_difference/mean": 0.004485957324504852, "step": 451, "step_time": 39.55284725899946 }, { "clip_ratio/high_max": 0.017857143189758062, "clip_ratio/high_mean": 0.0052083334303461015, "clip_ratio/low_mean": 0.009709821664728224, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014918155211489648, "entropy": 0.09947029640898108, "epoch": 0.00904, "grad_norm": 0.2647772431373596, "kl": 0.501507306471467, "learning_rate": 9.999679531348955e-05, "loss": -0.0474, "step": 452, "step_time": 9.83529582600022 }, { "clip_ratio/high_max": 0.0029761905316263437, "clip_ratio/high_mean": 0.0007440476329065859, "clip_ratio/low_mean": 0.0007440476329065859, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014880952658131719, "completions/clipped_ratio": 0.0, "completions/max_length": 1576.0, "completions/max_terminated_length": 1576.0, "completions/mean_length": 1483.796875, "completions/mean_terminated_length": 1483.796875, "completions/min_length": 1354.0, "completions/min_terminated_length": 1354.0, "entropy": 0.08577556139789522, "epoch": 0.00906, "frac_reward_zero_std": 0.0, "grad_norm": 0.4616855978965759, "kl": 0.4984573759138584, "learning_rate": 9.999677988804543e-05, "loss": 0.0129, "num_tokens": 26764995.0, "reward": 12.713988304138184, "reward_std": 16.157230377197266, "rewards/rollout_reward_func/mean": 12.713988304138184, "rewards/rollout_reward_func/std": 17.417678833007812, "sampling/importance_sampling_ratio/max": 1.2561296224594116, "sampling/importance_sampling_ratio/mean": 1.0040102005004883, "sampling/importance_sampling_ratio/min": 0.5851351618766785, "sampling/sampling_logp_difference/max": 0.335345983505249, "sampling/sampling_logp_difference/mean": 0.004758521914482117, "step": 453, "step_time": 42.06172357400101 }, { "clip_ratio/high_max": 0.014880952658131719, "clip_ratio/high_mean": 0.004464285797439516, "clip_ratio/low_mean": 0.015625000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020089286321308464, "entropy": 0.0774516521487385, "epoch": 0.00908, "grad_norm": 0.13414981961250305, "kl": 0.5282110534608364, "learning_rate": 9.999676442556757e-05, "loss": 0.0065, "step": 454, "step_time": 10.263705699999264 }, { "clip_ratio/high_max": 0.018005952704697847, "clip_ratio/high_mean": 0.005245535809081048, "clip_ratio/low_mean": 0.002232142898719758, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007477678707800806, "completions/clipped_ratio": 0.0, "completions/max_length": 1557.0, "completions/max_terminated_length": 1557.0, "completions/mean_length": 1432.234375, "completions/mean_terminated_length": 1432.234375, "completions/min_length": 777.0, "completions/min_terminated_length": 777.0, "entropy": 0.0869816429913044, "epoch": 0.0091, "frac_reward_zero_std": 0.0, "grad_norm": 0.5139105319976807, "kl": 0.5019301455467939, "learning_rate": 9.999674892605595e-05, "loss": -0.0143, "num_tokens": 26907877.0, "reward": 14.470987319946289, "reward_std": 12.551952362060547, "rewards/rollout_reward_func/mean": 14.470987319946289, "rewards/rollout_reward_func/std": 13.231359481811523, "sampling/importance_sampling_ratio/max": 1.4351580142974854, "sampling/importance_sampling_ratio/mean": 0.9842495918273926, "sampling/importance_sampling_ratio/min": 0.7047746181488037, "sampling/sampling_logp_difference/max": 0.36011219024658203, "sampling/sampling_logp_difference/mean": 0.005461296532303095, "step": 455, "step_time": 41.37734428300246 }, { "clip_ratio/high_max": 0.036011905409395695, "clip_ratio/high_mean": 0.012723214633297175, "clip_ratio/low_mean": 0.01116071455180645, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023883929243311286, "entropy": 0.07720525958575308, "epoch": 0.00912, "grad_norm": 0.3397330045700073, "kl": 0.6114528980106115, "learning_rate": 9.99967333895106e-05, "loss": -0.0171, "step": 456, "step_time": 10.617095338997387 }, { "clip_ratio/high_max": 0.009077381109818816, "clip_ratio/high_mean": 0.002269345277454704, "clip_ratio/low_mean": 0.0037202381645329297, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005989583441987634, "completions/clipped_ratio": 0.0, "completions/max_length": 1556.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 1450.328125, "completions/mean_terminated_length": 1450.328125, "completions/min_length": 677.0, "completions/min_terminated_length": 677.0, "entropy": 0.08630289603024721, "epoch": 0.00914, "frac_reward_zero_std": 0.0, "grad_norm": 0.915941059589386, "kl": 0.5304882265627384, "learning_rate": 9.999671781593154e-05, "loss": -0.0128, "num_tokens": 27051977.0, "reward": 12.1441650390625, "reward_std": 13.508443832397461, "rewards/rollout_reward_func/mean": 12.1441650390625, "rewards/rollout_reward_func/std": 14.862476348876953, "sampling/importance_sampling_ratio/max": 1.8943538665771484, "sampling/importance_sampling_ratio/mean": 1.0383222103118896, "sampling/importance_sampling_ratio/min": 0.6029430031776428, "sampling/sampling_logp_difference/max": 0.5262751579284668, "sampling/sampling_logp_difference/mean": 0.0063569676131010056, "step": 457, "step_time": 40.44359572299618 }, { "clip_ratio/high_max": 0.026785714784637094, "clip_ratio/high_mean": 0.00889475119765848, "clip_ratio/low_mean": 0.015625000349245965, "clip_ratio/low_min": 0.0029761905316263437, "clip_ratio/region_mean": 0.024519751546904445, "entropy": 0.07588907447643578, "epoch": 0.00916, "grad_norm": 0.3780209422111511, "kl": 0.5850545484572649, "learning_rate": 9.999670220531878e-05, "loss": -0.0142, "step": 458, "step_time": 10.81988593099959 }, { "clip_ratio/high_max": 0.0059523810632526875, "clip_ratio/high_mean": 0.0014880952658131719, "clip_ratio/low_mean": 0.0007812500116415322, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002269345277454704, "completions/clipped_ratio": 0.0, "completions/max_length": 1548.0, "completions/max_terminated_length": 1548.0, "completions/mean_length": 1446.8125, "completions/mean_terminated_length": 1446.8125, "completions/min_length": 708.0, "completions/min_terminated_length": 708.0, "entropy": 0.06712023681029677, "epoch": 0.00918, "frac_reward_zero_std": 0.0, "grad_norm": 0.5037431716918945, "kl": 0.5365529656410217, "learning_rate": 9.999668655767235e-05, "loss": -0.0142, "num_tokens": 27195924.0, "reward": 12.623528480529785, "reward_std": 16.375185012817383, "rewards/rollout_reward_func/mean": 12.623528480529785, "rewards/rollout_reward_func/std": 17.157840728759766, "sampling/importance_sampling_ratio/max": 1.4218283891677856, "sampling/importance_sampling_ratio/mean": 1.0120244026184082, "sampling/importance_sampling_ratio/min": 0.7264562249183655, "sampling/sampling_logp_difference/max": 0.36830270290374756, "sampling/sampling_logp_difference/mean": 0.003537567099556327, "step": 459, "step_time": 39.789260978999664 }, { "clip_ratio/high_max": 0.020833333721384406, "clip_ratio/high_mean": 0.0052083334303461015, "clip_ratio/low_mean": 0.010230655025225133, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015438988397363573, "entropy": 0.061492747627198696, "epoch": 0.0092, "grad_norm": 0.27846819162368774, "kl": 0.6263625603169203, "learning_rate": 9.999667087299225e-05, "loss": -0.0179, "step": 460, "step_time": 10.157873148000363 }, { "clip_ratio/high_max": 0.011904762126505375, "clip_ratio/high_mean": 0.0037202381645329297, "clip_ratio/low_mean": 0.0007440476329065859, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285797439516, "completions/clipped_ratio": 0.0, "completions/max_length": 1553.0, "completions/max_terminated_length": 1553.0, "completions/mean_length": 1393.921875, "completions/mean_terminated_length": 1393.921875, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "entropy": 0.06535098806489259, "epoch": 0.00922, "frac_reward_zero_std": 0.0, "grad_norm": 0.6925691366195679, "kl": 0.5965993329882622, "learning_rate": 9.99966551512785e-05, "loss": -0.0133, "num_tokens": 27336352.0, "reward": 8.150674819946289, "reward_std": 15.653514862060547, "rewards/rollout_reward_func/mean": 8.150674819946289, "rewards/rollout_reward_func/std": 16.096240997314453, "sampling/importance_sampling_ratio/max": 1.3934406042099, "sampling/importance_sampling_ratio/mean": 0.9711774587631226, "sampling/importance_sampling_ratio/min": 0.3346167504787445, "sampling/sampling_logp_difference/max": 1.0496406555175781, "sampling/sampling_logp_difference/mean": 0.005856034811586142, "step": 461, "step_time": 41.696121679995485 }, { "clip_ratio/high_max": 0.023958333767950535, "clip_ratio/high_mean": 0.00673363107489422, "clip_ratio/low_mean": 0.015252976503688842, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021986607811413705, "entropy": 0.0650356519035995, "epoch": 0.00924, "grad_norm": 1.2314876317977905, "kl": 1.7299257963895798, "learning_rate": 9.999663939253112e-05, "loss": -0.0022, "step": 462, "step_time": 10.117755536000914 }, { "clip_ratio/high_max": 0.009077381109818816, "clip_ratio/high_mean": 0.002269345277454704, "clip_ratio/low_mean": 0.0014880952658131719, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003757440543267876, "completions/clipped_ratio": 0.0, "completions/max_length": 1543.0, "completions/max_terminated_length": 1543.0, "completions/mean_length": 1386.734375, "completions/mean_terminated_length": 1386.734375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.08255739836022258, "epoch": 0.00926, "frac_reward_zero_std": 0.0, "grad_norm": 0.4534408450126648, "kl": 0.5308241080492735, "learning_rate": 9.999662359675012e-05, "loss": -0.0123, "num_tokens": 27476234.0, "reward": 11.247259140014648, "reward_std": 14.853042602539062, "rewards/rollout_reward_func/mean": 11.247259140014648, "rewards/rollout_reward_func/std": 14.736608505249023, "sampling/importance_sampling_ratio/max": 1.3197416067123413, "sampling/importance_sampling_ratio/mean": 0.9946113228797913, "sampling/importance_sampling_ratio/min": 0.7106093764305115, "sampling/sampling_logp_difference/max": 0.3446381092071533, "sampling/sampling_logp_difference/mean": 0.00609009200707078, "step": 463, "step_time": 40.418589189997874 }, { "clip_ratio/high_max": 0.026934524532407522, "clip_ratio/high_mean": 0.009709821664728224, "clip_ratio/low_mean": 0.01640625053551048, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026116072200238705, "entropy": 0.09197275433689356, "epoch": 0.00928, "grad_norm": 0.46926549077033997, "kl": 0.5417319964617491, "learning_rate": 9.999660776393552e-05, "loss": -0.0111, "step": 464, "step_time": 10.364104637999844 }, { "clip_ratio/high_max": 0.0059523810632526875, "clip_ratio/high_mean": 0.0014880952658131719, "clip_ratio/low_mean": 0.0014880952658131719, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029761905316263437, "completions/clipped_ratio": 0.0, "completions/max_length": 1538.0, "completions/max_terminated_length": 1538.0, "completions/mean_length": 1408.109375, "completions/mean_terminated_length": 1408.109375, "completions/min_length": 788.0, "completions/min_terminated_length": 788.0, "entropy": 0.0891355937346816, "epoch": 0.0093, "frac_reward_zero_std": 0.0, "grad_norm": 0.9044552445411682, "kl": 0.6998987477272749, "learning_rate": 9.999659189408731e-05, "loss": -0.0085, "num_tokens": 27617505.0, "reward": 13.215154647827148, "reward_std": 11.782221794128418, "rewards/rollout_reward_func/mean": 13.215155601501465, "rewards/rollout_reward_func/std": 12.105838775634766, "sampling/importance_sampling_ratio/max": 1.657700777053833, "sampling/importance_sampling_ratio/mean": 1.0080327987670898, "sampling/importance_sampling_ratio/min": 0.5086445808410645, "sampling/sampling_logp_difference/max": 0.6424019932746887, "sampling/sampling_logp_difference/mean": 0.005240763537585735, "step": 465, "step_time": 39.89362095000433 } ], "logging_steps": 1.0, "max_steps": 100000, "num_input_tokens_seen": 27617505, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }