{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.078, "eval_steps": 500, "global_step": 975, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.00555555559694767, "clip_ratio/high_mean": 0.002777777798473835, "clip_ratio/low_mean": 0.01785130724310875, "clip_ratio/low_min": 0.00555555559694767, "clip_ratio/region_mean": 0.0206290852278471, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 301.8375, "completions/mean_terminated_length": 301.8375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.3019494503736496, "epoch": 0.0004, "frac_reward_zero_std": 0.425, "grad_norm": 0.12727217376232147, "kl": 0.04473987314850092, "learning_rate": 1.137216e-06, "loss": -1.5935138799250125e-05, "num_tokens": 68886.0, "reward": 0.3598749995231628, "reward_std": 0.2971616297960281, "rewards/env_game_reward/mean": 0.3598749995231628, "rewards/env_game_reward/std": 0.40898369550704955, "sampling/importance_sampling_ratio/max": 1.8200685739517213, "sampling/importance_sampling_ratio/mean": 0.9960397481918335, "sampling/importance_sampling_ratio/min": 0.2915335774421692, "sampling/sampling_logp_difference/max": 1.2875514030456543, "sampling/sampling_logp_difference/mean": 0.07988147884607315, "step": 5, "step_time": 2.435299045799911 }, { "clip_ratio/high_max": 0.02977941185235977, "clip_ratio/high_mean": 0.014889705926179886, "clip_ratio/low_mean": 0.019972265511751176, "clip_ratio/low_min": 0.00555555559694767, "clip_ratio/region_mean": 0.034861971624195576, "completions/clipped_ratio": 0.0, "completions/max_length": 365.6, "completions/max_terminated_length": 365.6, "completions/mean_length": 293.7875, "completions/mean_terminated_length": 293.7875, "completions/min_length": 219.2, "completions/min_terminated_length": 219.2, "entropy": 0.3496206432580948, "epoch": 0.0008, "frac_reward_zero_std": 0.45, "grad_norm": 0.2412712723016739, "kl": 0.021455740835517646, "learning_rate": 2.5587359999999995e-06, "loss": 0.0005658970680087805, "num_tokens": 136575.0, "reward": 0.38937501311302186, "reward_std": 0.25508877336978913, "rewards/env_game_reward/mean": 0.38937501311302186, "rewards/env_game_reward/std": 0.37691490650177, "sampling/importance_sampling_ratio/max": 1.777461338043213, "sampling/importance_sampling_ratio/mean": 0.9673051834106445, "sampling/importance_sampling_ratio/min": 0.18635750880930574, "sampling/sampling_logp_difference/max": 2.5953728675842287, "sampling/sampling_logp_difference/mean": 0.10190577432513237, "step": 10, "step_time": 2.0430383474005795 }, { "clip_ratio/high_max": 0.012132352963089944, "clip_ratio/high_mean": 0.006066176481544972, "clip_ratio/low_mean": 0.008350533433258533, "clip_ratio/low_min": 0.005263157933950424, "clip_ratio/region_mean": 0.01441670972853899, "completions/clipped_ratio": 0.0, "completions/max_length": 373.8, "completions/max_terminated_length": 373.8, "completions/mean_length": 282.05, "completions/mean_terminated_length": 282.05, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.36802791953086855, "epoch": 0.0012, "frac_reward_zero_std": 0.45, "grad_norm": 0.1376928836107254, "kl": 0.08403404112905263, "learning_rate": 3.9802559999999995e-06, "loss": 0.0002614784985780716, "num_tokens": 202248.0, "reward": 0.3897500097751617, "reward_std": 0.2549119979143143, "rewards/env_game_reward/mean": 0.3897500097751617, "rewards/env_game_reward/std": 0.414092218875885, "sampling/importance_sampling_ratio/max": 1.847982144355774, "sampling/importance_sampling_ratio/mean": 1.0205587506294251, "sampling/importance_sampling_ratio/min": 0.4687712244689465, "sampling/sampling_logp_difference/max": 1.0548641800880432, "sampling/sampling_logp_difference/mean": 0.0759758085012436, "step": 15, "step_time": 2.0221517859987217 }, { "clip_ratio/high_max": 0.023570261523127557, "clip_ratio/high_mean": 0.011785130761563779, "clip_ratio/low_mean": 0.03886523898690939, "clip_ratio/low_min": 0.01670106649398804, "clip_ratio/region_mean": 0.050650370121002194, "completions/clipped_ratio": 0.0, "completions/max_length": 365.6, "completions/max_terminated_length": 365.6, "completions/mean_length": 297.625, "completions/mean_terminated_length": 297.625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.3216476052999496, "epoch": 0.0016, "frac_reward_zero_std": 0.2, "grad_norm": 4.7970099449157715, "kl": 13.012010221928358, "learning_rate": 5.401775999999999e-06, "loss": 0.0002645790576934814, "num_tokens": 271508.0, "reward": 0.4497500121593475, "reward_std": 0.4034044086933136, "rewards/env_game_reward/mean": 0.4497500121593475, "rewards/env_game_reward/std": 0.42049501538276673, "sampling/importance_sampling_ratio/max": 1.9190717458724975, "sampling/importance_sampling_ratio/mean": 0.9109017491340637, "sampling/importance_sampling_ratio/min": 0.12080914080142975, "sampling/sampling_logp_difference/max": 1.8006493330001831, "sampling/sampling_logp_difference/mean": 0.11719217151403427, "step": 20, "step_time": 2.1648863314010667 }, { "clip_ratio/high_max": 0.01180555559694767, "clip_ratio/high_mean": 0.005902777798473835, "clip_ratio/low_mean": 0.026298435963690282, "clip_ratio/low_min": 0.010644257813692094, "clip_ratio/region_mean": 0.032201213762164116, "completions/clipped_ratio": 0.0, "completions/max_length": 365.6, "completions/max_terminated_length": 365.6, "completions/mean_length": 286.9375, "completions/mean_terminated_length": 286.9375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.3577309399843216, "epoch": 0.002, "frac_reward_zero_std": 0.425, "grad_norm": 0.12569169700145721, "kl": 0.5408080242574215, "learning_rate": 6.8232959999999994e-06, "loss": -0.0006784088443964719, "num_tokens": 338918.0, "reward": 0.48712502121925355, "reward_std": 0.2656953662633896, "rewards/env_game_reward/mean": 0.48712502121925355, "rewards/env_game_reward/std": 0.37898687124252317, "sampling/importance_sampling_ratio/max": 1.827208924293518, "sampling/importance_sampling_ratio/mean": 0.95396728515625, "sampling/importance_sampling_ratio/min": 0.2660827249288559, "sampling/sampling_logp_difference/max": 1.3808974146842956, "sampling/sampling_logp_difference/mean": 0.1048379361629486, "step": 25, "step_time": 2.043276316798438 }, { "clip_ratio/high_max": 0.011513157933950424, "clip_ratio/high_mean": 0.005756578966975212, "clip_ratio/low_mean": 0.017378311045467852, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023134890012443066, "completions/clipped_ratio": 0.0, "completions/max_length": 365.8, "completions/max_terminated_length": 365.8, "completions/mean_length": 278.425, "completions/mean_terminated_length": 278.425, "completions/min_length": 194.6, "completions/min_terminated_length": 194.6, "entropy": 0.5190394788980484, "epoch": 0.0024, "frac_reward_zero_std": 0.375, "grad_norm": 0.2680231034755707, "kl": 0.7589091017842293, "learning_rate": 8.244816e-06, "loss": -2.776859328150749e-05, "num_tokens": 404073.0, "reward": 0.4043750107288361, "reward_std": 0.25508877336978913, "rewards/env_game_reward/mean": 0.4043750107288361, "rewards/env_game_reward/std": 0.409240585565567, "sampling/importance_sampling_ratio/max": 1.9664097785949708, "sampling/importance_sampling_ratio/mean": 0.9075453758239747, "sampling/importance_sampling_ratio/min": 0.20434444732964038, "sampling/sampling_logp_difference/max": 1.4564614772796631, "sampling/sampling_logp_difference/mean": 0.14034032225608825, "step": 30, "step_time": 1.9978871219995198 }, { "clip_ratio/high_max": 0.02389705888926983, "clip_ratio/high_mean": 0.011948529444634915, "clip_ratio/low_mean": 0.011948529072105885, "clip_ratio/low_min": 0.00625, "clip_ratio/region_mean": 0.023897058516740798, "completions/clipped_ratio": 0.0, "completions/max_length": 357.4, "completions/max_terminated_length": 357.4, "completions/mean_length": 272.075, "completions/mean_terminated_length": 272.075, "completions/min_length": 166.4, "completions/min_terminated_length": 166.4, "entropy": 0.5090951889753341, "epoch": 0.0028, "frac_reward_zero_std": 0.425, "grad_norm": 0.07063741236925125, "kl": 0.7235948637127876, "learning_rate": 9.666336e-06, "loss": -0.0005834823474287987, "num_tokens": 468753.0, "reward": 0.487250018119812, "reward_std": 0.24430539906024934, "rewards/env_game_reward/mean": 0.487250018119812, "rewards/env_game_reward/std": 0.39645681977272035, "sampling/importance_sampling_ratio/max": 1.7759217023849487, "sampling/importance_sampling_ratio/mean": 0.8916664958000183, "sampling/importance_sampling_ratio/min": 0.1796780303120613, "sampling/sampling_logp_difference/max": 5.293339991569519, "sampling/sampling_logp_difference/mean": 0.22543202042579652, "step": 35, "step_time": 1.970122839200485 }, { "clip_ratio/high_max": 0.011764705926179887, "clip_ratio/high_mean": 0.005882352963089943, "clip_ratio/low_mean": 0.007465277798473835, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013347630761563778, "completions/clipped_ratio": 0.0, "completions/max_length": 367.8, "completions/max_terminated_length": 367.8, "completions/mean_length": 285.5, "completions/mean_terminated_length": 285.5, "completions/min_length": 218.8, "completions/min_terminated_length": 218.8, "entropy": 0.4019551068544388, "epoch": 0.0032, "frac_reward_zero_std": 0.425, "grad_norm": 0.05544561892747879, "kl": 0.5415760695934295, "learning_rate": 9.950639790096085e-06, "loss": -0.0005978990811854601, "num_tokens": 535867.0, "reward": 0.6225000143051147, "reward_std": 0.30759145617485045, "rewards/env_game_reward/mean": 0.6225000143051147, "rewards/env_game_reward/std": 0.42984657883644106, "sampling/importance_sampling_ratio/max": 1.527849555015564, "sampling/importance_sampling_ratio/mean": 0.8954729080200196, "sampling/importance_sampling_ratio/min": 0.24045688807964324, "sampling/sampling_logp_difference/max": 1.2608455896377564, "sampling/sampling_logp_difference/mean": 0.09565315097570419, "step": 40, "step_time": 2.158330711200688 }, { "clip_ratio/high_max": 0.011764705926179887, "clip_ratio/high_mean": 0.005882352963089943, "clip_ratio/low_mean": 0.0029411764815449716, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008823529444634914, "completions/clipped_ratio": 0.0, "completions/max_length": 374.4, "completions/max_terminated_length": 374.4, "completions/mean_length": 297.7125, "completions/mean_terminated_length": 297.7125, "completions/min_length": 206.8, "completions/min_terminated_length": 206.8, "entropy": 0.45966209173202516, "epoch": 0.0036, "frac_reward_zero_std": 0.5, "grad_norm": 0.10021962970495224, "kl": 4.332460595667362, "learning_rate": 9.950638937361476e-06, "loss": 0.00036982442252337934, "num_tokens": 604195.0, "reward": 0.5772500157356262, "reward_std": 0.22309219390153884, "rewards/env_game_reward/mean": 0.5772500157356262, "rewards/env_game_reward/std": 0.40507532358169557, "sampling/importance_sampling_ratio/max": 2.0866220235824584, "sampling/importance_sampling_ratio/mean": 0.9108154296875, "sampling/importance_sampling_ratio/min": 0.22963042184710503, "sampling/sampling_logp_difference/max": 1.6737257480621337, "sampling/sampling_logp_difference/mean": 0.13707162737846373, "step": 45, "step_time": 2.022085135200905 }, { "clip_ratio/high_max": 0.016993464156985282, "clip_ratio/high_mean": 0.008496732078492641, "clip_ratio/low_mean": 0.005882352963089943, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014379085041582584, "completions/clipped_ratio": 0.0, "completions/max_length": 374.2, "completions/max_terminated_length": 374.2, "completions/mean_length": 303.05, "completions/mean_terminated_length": 303.05, "completions/min_length": 234.2, "completions/min_terminated_length": 234.2, "entropy": 0.5295720070600509, "epoch": 0.004, "frac_reward_zero_std": 0.525, "grad_norm": 0.03144078329205513, "kl": 0.7867398172616958, "learning_rate": 9.950637428677324e-06, "loss": -0.0003503738436847925, "num_tokens": 673761.0, "reward": 0.5325000166893006, "reward_std": 0.26516503691673277, "rewards/env_game_reward/mean": 0.5325000166893006, "rewards/env_game_reward/std": 0.3944298326969147, "sampling/importance_sampling_ratio/max": 1.7604175567626954, "sampling/importance_sampling_ratio/mean": 1.0008994936943054, "sampling/importance_sampling_ratio/min": 0.17906015515327453, "sampling/sampling_logp_difference/max": 1.3992098808288573, "sampling/sampling_logp_difference/mean": 0.11947420984506607, "step": 50, "step_time": 2.0436965212007636 }, { "clip_ratio/high_max": 0.0125, "clip_ratio/high_mean": 0.00625, "clip_ratio/low_mean": 0.009375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 365.2, "completions/max_terminated_length": 365.2, "completions/mean_length": 266.475, "completions/mean_terminated_length": 266.475, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.48174993991851806, "epoch": 0.0044, "frac_reward_zero_std": 0.45, "grad_norm": 0.08803996443748474, "kl": 0.5854233294725418, "learning_rate": 9.950635264043898e-06, "loss": -0.0005686009302735328, "num_tokens": 737069.0, "reward": 0.6225000143051147, "reward_std": 0.26516503989696505, "rewards/env_game_reward/mean": 0.6225000143051147, "rewards/env_game_reward/std": 0.3427117049694061, "sampling/importance_sampling_ratio/max": 1.6551067352294921, "sampling/importance_sampling_ratio/mean": 0.8545376658439636, "sampling/importance_sampling_ratio/min": 0.25754888653755187, "sampling/sampling_logp_difference/max": 1.1337799072265624, "sampling/sampling_logp_difference/mean": 0.11593741178512573, "step": 55, "step_time": 1.9781528124010948 }, { "clip_ratio/high_max": 0.00625, "clip_ratio/high_mean": 0.003125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003125, "completions/clipped_ratio": 0.0, "completions/max_length": 357.2, "completions/max_terminated_length": 357.2, "completions/mean_length": 297.975, "completions/mean_terminated_length": 297.975, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.5426139056682586, "epoch": 0.0048, "frac_reward_zero_std": 0.5, "grad_norm": 0.05449790135025978, "kl": 0.9533621698617936, "learning_rate": 9.950632443461579e-06, "loss": -0.0006543617229908705, "num_tokens": 805277.0, "reward": 0.5618750214576721, "reward_std": 0.24448217451572418, "rewards/env_game_reward/mean": 0.5618750214576721, "rewards/env_game_reward/std": 0.3882100999355316, "sampling/importance_sampling_ratio/max": 1.7909126281738281, "sampling/importance_sampling_ratio/mean": 0.9404440999031067, "sampling/importance_sampling_ratio/min": 0.09320367276668548, "sampling/sampling_logp_difference/max": 1.6501066446304322, "sampling/sampling_logp_difference/mean": 0.14056455492973327, "step": 60, "step_time": 1.980496010000934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.014763931930065154, "clip_ratio/low_min": 0.005882352963089943, "clip_ratio/region_mean": 0.014763931930065154, "completions/clipped_ratio": 0.0, "completions/max_length": 365.8, "completions/max_terminated_length": 365.8, "completions/mean_length": 287.8875, "completions/mean_terminated_length": 287.8875, "completions/min_length": 208.4, "completions/min_terminated_length": 208.4, "entropy": 0.4251057833433151, "epoch": 0.0052, "frac_reward_zero_std": 0.425, "grad_norm": 0.2114567756652832, "kl": 1.2128720700740814, "learning_rate": 9.95062896693086e-06, "loss": -0.0011168548837304116, "num_tokens": 872623.0, "reward": 0.6370000123977662, "reward_std": 0.2870853543281555, "rewards/env_game_reward/mean": 0.6370000123977662, "rewards/env_game_reward/std": 0.4172030627727509, "sampling/importance_sampling_ratio/max": 2.156681776046753, "sampling/importance_sampling_ratio/mean": 0.9560775279998779, "sampling/importance_sampling_ratio/min": 0.20165933668613434, "sampling/sampling_logp_difference/max": 1.7227527379989624, "sampling/sampling_logp_difference/mean": 0.12174170911312103, "step": 65, "step_time": 2.1264499234013785 }, { "clip_ratio/high_max": 0.01666666679084301, "clip_ratio/high_mean": 0.008333333395421505, "clip_ratio/low_mean": 0.009027777798473835, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017361111007630825, "completions/clipped_ratio": 0.0, "completions/max_length": 357.2, "completions/max_terminated_length": 357.2, "completions/mean_length": 284.3625, "completions/mean_terminated_length": 284.3625, "completions/min_length": 201.6, "completions/min_terminated_length": 201.6, "entropy": 0.3628254383802414, "epoch": 0.0056, "frac_reward_zero_std": 0.3, "grad_norm": 0.21124207973480225, "kl": 0.6694609820842743, "learning_rate": 9.950624834452351e-06, "loss": 0.0007115081883966923, "num_tokens": 939127.0, "reward": 0.7271250009536743, "reward_std": 0.28690858632326127, "rewards/env_game_reward/mean": 0.7271250009536743, "rewards/env_game_reward/std": 0.3982576608657837, "sampling/importance_sampling_ratio/max": 2.1042640209198, "sampling/importance_sampling_ratio/mean": 1.001141333580017, "sampling/importance_sampling_ratio/min": 0.19583375304937362, "sampling/sampling_logp_difference/max": 1.3206945657730103, "sampling/sampling_logp_difference/mean": 0.11073778569698334, "step": 70, "step_time": 1.9332619735985646 }, { "clip_ratio/high_max": 0.00625, "clip_ratio/high_mean": 0.003125, "clip_ratio/low_mean": 0.011388305388391019, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014513305388391018, "completions/clipped_ratio": 0.0, "completions/max_length": 365.4, "completions/max_terminated_length": 365.4, "completions/mean_length": 276.2, "completions/mean_terminated_length": 276.2, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.26720436066389086, "epoch": 0.006, "frac_reward_zero_std": 0.45, "grad_norm": 0.09686678647994995, "kl": 0.8629033237695694, "learning_rate": 9.950620046026782e-06, "loss": -0.0006346371024847031, "num_tokens": 1004385.0, "reward": 0.8097500324249267, "reward_std": 0.25491200387477875, "rewards/env_game_reward/mean": 0.8097500324249267, "rewards/env_game_reward/std": 0.37631983757019044, "sampling/importance_sampling_ratio/max": 1.5446483612060546, "sampling/importance_sampling_ratio/mean": 0.8950366854667664, "sampling/importance_sampling_ratio/min": 0.13836232647299768, "sampling/sampling_logp_difference/max": 1.9488806009292603, "sampling/sampling_logp_difference/mean": 0.12367857545614243, "step": 75, "step_time": 1.9649908402003349 }, { "epoch": 0.006, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 360.3333333333333, "eval_completions/max_terminated_length": 360.3333333333333, "eval_completions/mean_length": 318.0416666666667, "eval_completions/mean_terminated_length": 318.0416666666667, "eval_completions/min_length": 249.33333333333334, "eval_completions/min_terminated_length": 249.33333333333334, "eval_entropy": 0.24031941095987955, "eval_frac_reward_zero_std": 0.3333333333333333, "eval_kl": 0.7372705638408661, "eval_loss": -0.0011363811790943146, "eval_num_tokens": 1004385.0, "eval_reward": 0.8250000476837158, "eval_reward_std": 0.3181980550289154, "eval_rewards/env_game_reward/mean": 0.8250000476837158, "eval_rewards/env_game_reward/std": 0.3558244506518046, "eval_runtime": 2.1212, "eval_samples_per_second": 4.714, "eval_sampling/importance_sampling_ratio/max": 1.3695820967356365, "eval_sampling/importance_sampling_ratio/mean": 0.9882725675900778, "eval_sampling/importance_sampling_ratio/min": 0.5392542531092962, "eval_sampling/sampling_logp_difference/max": 0.9286821683247884, "eval_sampling/sampling_logp_difference/mean": 0.05970543374617895, "eval_steps_per_second": 0.943, "step": 75 }, { "clip_ratio/high_max": 0.01702786386013031, "clip_ratio/high_mean": 0.008513931930065156, "clip_ratio/low_mean": 0.013888888992369175, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022402821108698845, "completions/clipped_ratio": 0.0, "completions/max_length": 374.2, "completions/max_terminated_length": 374.2, "completions/mean_length": 304.9, "completions/mean_terminated_length": 304.9, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.1878312572836876, "epoch": 0.0064, "frac_reward_zero_std": 0.35, "grad_norm": 0.06498023867607117, "kl": 0.9341357827186585, "learning_rate": 9.950614601654993e-06, "loss": -0.001060234196484089, "num_tokens": 1074730.0, "reward": 0.8025000095367432, "reward_std": 0.3075914442539215, "rewards/env_game_reward/mean": 0.8025000095367432, "rewards/env_game_reward/std": 0.3941896140575409, "sampling/importance_sampling_ratio/max": 1.4447970628738402, "sampling/importance_sampling_ratio/mean": 0.8254009127616883, "sampling/importance_sampling_ratio/min": 0.03923565149307251, "sampling/sampling_logp_difference/max": 2.3992530345916747, "sampling/sampling_logp_difference/mean": 0.11771672368049621, "step": 80, "step_time": 1.982058087000769 }, { "clip_ratio/high_max": 0.00625, "clip_ratio/high_mean": 0.003125, "clip_ratio/low_mean": 0.011638931930065155, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014763931930065154, "completions/clipped_ratio": 0.0, "completions/max_length": 374.6, "completions/max_terminated_length": 374.6, "completions/mean_length": 306.2875, "completions/mean_terminated_length": 306.2875, "completions/min_length": 219.2, "completions/min_terminated_length": 219.2, "entropy": 0.16379451900720596, "epoch": 0.0068, "frac_reward_zero_std": 0.3, "grad_norm": 0.05989319086074829, "kl": 1.268787795305252, "learning_rate": 9.950608501337942e-06, "loss": -0.0012372495606541634, "num_tokens": 1144439.0, "reward": 0.8847500324249268, "reward_std": 0.29733840227127073, "rewards/env_game_reward/mean": 0.8847500324249268, "rewards/env_game_reward/std": 0.33870106339454653, "sampling/importance_sampling_ratio/max": 1.8493404388427734, "sampling/importance_sampling_ratio/mean": 0.9314864993095398, "sampling/importance_sampling_ratio/min": 0.13991991989314556, "sampling/sampling_logp_difference/max": 1.9317576885223389, "sampling/sampling_logp_difference/mean": 0.10915455222129822, "step": 85, "step_time": 2.131227758199384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.002777777798473835, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002777777798473835, "completions/clipped_ratio": 0.0, "completions/max_length": 374.2, "completions/max_terminated_length": 374.2, "completions/mean_length": 289.9125, "completions/mean_terminated_length": 289.9125, "completions/min_length": 227.2, "completions/min_terminated_length": 227.2, "entropy": 0.12536104992032052, "epoch": 0.0072, "frac_reward_zero_std": 0.425, "grad_norm": 0.20048366487026215, "kl": 1.6958665192127227, "learning_rate": 9.9506017450767e-06, "loss": -0.0010345380753278733, "num_tokens": 1211601.0, "reward": 0.8850000381469727, "reward_std": 0.3181980520486832, "rewards/env_game_reward/mean": 0.8850000381469727, "rewards/env_game_reward/std": 0.39857959747314453, "sampling/importance_sampling_ratio/max": 1.9469106435775756, "sampling/importance_sampling_ratio/mean": 0.9089800357818604, "sampling/importance_sampling_ratio/min": 0.10673718531616032, "sampling/sampling_logp_difference/max": 3.3863435983657837, "sampling/sampling_logp_difference/mean": 0.1388186126947403, "step": 90, "step_time": 1.9693921780009986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.019384890049695968, "clip_ratio/low_min": 0.005000000074505806, "clip_ratio/region_mean": 0.019384890049695968, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 302.6125, "completions/mean_terminated_length": 302.6125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.1978470079600811, "epoch": 0.0076, "frac_reward_zero_std": 0.25, "grad_norm": 0.2384689301252365, "kl": 1.4977002620697022, "learning_rate": 9.950594332872455e-06, "loss": -0.0007448500022292137, "num_tokens": 1280837.0, "reward": 0.7945000290870666, "reward_std": 0.4037579774856567, "rewards/env_game_reward/mean": 0.7945000290870666, "rewards/env_game_reward/std": 0.4634205162525177, "sampling/importance_sampling_ratio/max": 1.9967133045196532, "sampling/importance_sampling_ratio/mean": 0.9287837624549866, "sampling/importance_sampling_ratio/min": 0.0468483492732048, "sampling/sampling_logp_difference/max": 2.3285855531692503, "sampling/sampling_logp_difference/mean": 0.13745660185813904, "step": 95, "step_time": 1.966521529597958 }, { "clip_ratio/high_max": 0.005882352963089943, "clip_ratio/high_mean": 0.0029411764815449716, "clip_ratio/low_mean": 0.020629085041582583, "clip_ratio/low_min": 0.00555555559694767, "clip_ratio/region_mean": 0.023570261523127557, "completions/clipped_ratio": 0.0, "completions/max_length": 365.6, "completions/max_terminated_length": 365.6, "completions/mean_length": 294.075, "completions/mean_terminated_length": 294.075, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.17654007077217101, "epoch": 0.008, "frac_reward_zero_std": 0.425, "grad_norm": 0.13096670806407928, "kl": 1.3037856757640838, "learning_rate": 9.950586264726511e-06, "loss": -0.0005890541709959507, "num_tokens": 1348490.0, "reward": 0.8925000071525574, "reward_std": 0.2863782405853271, "rewards/env_game_reward/mean": 0.8925000071525574, "rewards/env_game_reward/std": 0.37230696678161623, "sampling/importance_sampling_ratio/max": 1.8284294366836549, "sampling/importance_sampling_ratio/mean": 0.8878485441207886, "sampling/importance_sampling_ratio/min": 0.049935894832015035, "sampling/sampling_logp_difference/max": 2.5030375480651856, "sampling/sampling_logp_difference/mean": 0.14053128361701966, "step": 100, "step_time": 1.9532180378009798 }, { "clip_ratio/high_max": 0.005882352963089943, "clip_ratio/high_mean": 0.0029411764815449716, "clip_ratio/low_mean": 0.014454334415495396, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01739551089704037, "completions/clipped_ratio": 0.0, "completions/max_length": 373.2, "completions/max_terminated_length": 373.2, "completions/mean_length": 282.025, "completions/mean_terminated_length": 282.025, "completions/min_length": 206.8, "completions/min_terminated_length": 206.8, "entropy": 0.1877336472272873, "epoch": 0.0084, "frac_reward_zero_std": 0.5, "grad_norm": 0.13922861218452454, "kl": 1.016879215836525, "learning_rate": 9.950577540640286e-06, "loss": -0.0011088012717664242, "num_tokens": 1414612.0, "reward": 0.8173750281333924, "reward_std": 0.24412862062454224, "rewards/env_game_reward/mean": 0.8173750281333924, "rewards/env_game_reward/std": 0.39675052762031554, "sampling/importance_sampling_ratio/max": 2.0766624450683593, "sampling/importance_sampling_ratio/mean": 0.9333486676216125, "sampling/importance_sampling_ratio/min": 0.17753537595272065, "sampling/sampling_logp_difference/max": 1.8823954582214355, "sampling/sampling_logp_difference/mean": 0.13100463151931763, "step": 105, "step_time": 2.1976747798005816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01445433460175991, "clip_ratio/low_min": 0.005263157933950424, "clip_ratio/region_mean": 0.01445433460175991, "completions/clipped_ratio": 0.0, "completions/max_length": 365.8, "completions/max_terminated_length": 365.8, "completions/mean_length": 284.425, "completions/mean_terminated_length": 284.425, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.1360536314547062, "epoch": 0.0088, "frac_reward_zero_std": 0.525, "grad_norm": 0.03397221490740776, "kl": 1.4786522924900054, "learning_rate": 9.950568160615312e-06, "loss": -0.0008132285438477993, "num_tokens": 1481189.0, "reward": 0.9673750162124634, "reward_std": 0.22291541397571563, "rewards/env_game_reward/mean": 0.9673750162124634, "rewards/env_game_reward/std": 0.3230068266391754, "sampling/importance_sampling_ratio/max": 2.0664987325668336, "sampling/importance_sampling_ratio/mean": 0.9952835202217102, "sampling/importance_sampling_ratio/min": 0.21388899087905883, "sampling/sampling_logp_difference/max": 1.6557833194732665, "sampling/sampling_logp_difference/mean": 0.10078478828072548, "step": 110, "step_time": 1.9459437828008959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008099906705319881, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008099906705319881, "completions/clipped_ratio": 0.0, "completions/max_length": 365.6, "completions/max_terminated_length": 365.6, "completions/mean_length": 299.8625, "completions/mean_terminated_length": 299.8625, "completions/min_length": 249.6, "completions/min_terminated_length": 249.6, "entropy": 0.0922572823241353, "epoch": 0.0092, "frac_reward_zero_std": 0.575, "grad_norm": 0.02898675948381424, "kl": 1.3785052359104157, "learning_rate": 9.950558124653239e-06, "loss": -0.0016875043511390686, "num_tokens": 1549583.0, "reward": 0.9673750400543213, "reward_std": 0.20170221328735352, "rewards/env_game_reward/mean": 0.9673750400543213, "rewards/env_game_reward/std": 0.31243913173675536, "sampling/importance_sampling_ratio/max": 1.452053427696228, "sampling/importance_sampling_ratio/mean": 0.8291665196418763, "sampling/importance_sampling_ratio/min": 0.06903318837285041, "sampling/sampling_logp_difference/max": 2.5763232707977295, "sampling/sampling_logp_difference/mean": 0.12499624639749526, "step": 115, "step_time": 2.0123212985985446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.011785130575299263, "clip_ratio/low_min": 0.005882352963089943, "clip_ratio/region_mean": 0.011785130575299263, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 288.425, "completions/mean_terminated_length": 288.425, "completions/min_length": 227.4, "completions/min_terminated_length": 227.4, "entropy": 0.07345340847969055, "epoch": 0.0096, "frac_reward_zero_std": 0.575, "grad_norm": 0.06546451151371002, "kl": 1.1702515691518784, "learning_rate": 9.950547432755832e-06, "loss": -0.0007662988267838955, "num_tokens": 1616803.0, "reward": 1.0573750376701354, "reward_std": 0.18013544976711274, "rewards/env_game_reward/mean": 1.0573750376701354, "rewards/env_game_reward/std": 0.2634139180183411, "sampling/importance_sampling_ratio/max": 1.988950991630554, "sampling/importance_sampling_ratio/mean": 0.9787555813789368, "sampling/importance_sampling_ratio/min": 0.09628211334347725, "sampling/sampling_logp_difference/max": 2.1504470825195314, "sampling/sampling_logp_difference/mean": 0.08526455238461494, "step": 120, "step_time": 1.9904663407985936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015257352963089943, "clip_ratio/low_min": 0.00625, "clip_ratio/region_mean": 0.015257352963089943, "completions/clipped_ratio": 0.0, "completions/max_length": 349.2, "completions/max_terminated_length": 349.2, "completions/mean_length": 267.2875, "completions/mean_terminated_length": 267.2875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.07882922235876322, "epoch": 0.01, "frac_reward_zero_std": 0.65, "grad_norm": 0.45003291964530945, "kl": 4.5989465653896335, "learning_rate": 9.950536084924971e-06, "loss": -0.0006290416233241559, "num_tokens": 1679963.0, "reward": 1.0725000381469727, "reward_std": 0.1590990275144577, "rewards/env_game_reward/mean": 1.0725000381469727, "rewards/env_game_reward/std": 0.24540797472000123, "sampling/importance_sampling_ratio/max": 1.6149428606033325, "sampling/importance_sampling_ratio/mean": 0.9134042382240295, "sampling/importance_sampling_ratio/min": 0.06451723147183656, "sampling/sampling_logp_difference/max": 2.553125739097595, "sampling/sampling_logp_difference/mean": 0.11119775697588921, "step": 125, "step_time": 1.9727394509995064 }, { "clip_ratio/high_max": 0.00625, "clip_ratio/high_mean": 0.003125, "clip_ratio/low_mean": 0.002777777798473835, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005902777798473835, "completions/clipped_ratio": 0.0, "completions/max_length": 373.6, "completions/max_terminated_length": 373.6, "completions/mean_length": 279.75, "completions/mean_terminated_length": 279.75, "completions/min_length": 218.8, "completions/min_terminated_length": 218.8, "entropy": 0.11642275676131249, "epoch": 0.0104, "frac_reward_zero_std": 0.775, "grad_norm": 0.03156660497188568, "kl": 0.8787374496459961, "learning_rate": 9.950524081162648e-06, "loss": -0.00015998464077711106, "num_tokens": 1745793.0, "reward": 1.0725000381469727, "reward_std": 0.09545941650867462, "rewards/env_game_reward/mean": 1.0725000381469727, "rewards/env_game_reward/std": 0.27027024030685426, "sampling/importance_sampling_ratio/max": 1.5973956346511842, "sampling/importance_sampling_ratio/mean": 1.051470685005188, "sampling/importance_sampling_ratio/min": 0.41294346153736117, "sampling/sampling_logp_difference/max": 1.1177088975906373, "sampling/sampling_logp_difference/mean": 0.04818851538002491, "step": 130, "step_time": 2.081850044800376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.011513157933950424, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011513157933950424, "completions/clipped_ratio": 0.0, "completions/max_length": 365.8, "completions/max_terminated_length": 365.8, "completions/mean_length": 277.775, "completions/mean_terminated_length": 277.775, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.16472503766417504, "epoch": 0.0108, "frac_reward_zero_std": 0.375, "grad_norm": 0.04943538084626198, "kl": 0.9097718864679336, "learning_rate": 9.950511421470975e-06, "loss": -0.0013091729953885079, "num_tokens": 1811214.0, "reward": 0.937375009059906, "reward_std": 0.3077682375907898, "rewards/env_game_reward/mean": 0.937375009059906, "rewards/env_game_reward/std": 0.3747471272945404, "sampling/importance_sampling_ratio/max": 1.3492722034454345, "sampling/importance_sampling_ratio/mean": 0.8988288879394531, "sampling/importance_sampling_ratio/min": 0.22225932404398918, "sampling/sampling_logp_difference/max": 1.6521860837936402, "sampling/sampling_logp_difference/mean": 0.08873736709356309, "step": 135, "step_time": 1.9900708778011904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01472630724310875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01472630724310875, "completions/clipped_ratio": 0.0, "completions/max_length": 373.8, "completions/max_terminated_length": 373.8, "completions/mean_length": 284.375, "completions/mean_terminated_length": 284.375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.1683448150753975, "epoch": 0.0112, "frac_reward_zero_std": 0.475, "grad_norm": 0.055281057953834534, "kl": 0.8311902970075608, "learning_rate": 9.950498105852176e-06, "loss": -0.0005867025814950466, "num_tokens": 1877869.0, "reward": 0.9825000524520874, "reward_std": 0.2863782525062561, "rewards/env_game_reward/mean": 0.9825000524520874, "rewards/env_game_reward/std": 0.3745770752429962, "sampling/importance_sampling_ratio/max": 1.7486498355865479, "sampling/importance_sampling_ratio/mean": 1.0248434662818908, "sampling/importance_sampling_ratio/min": 0.2848562225699425, "sampling/sampling_logp_difference/max": 1.237773633003235, "sampling/sampling_logp_difference/mean": 0.06331658810377121, "step": 140, "step_time": 1.9763025989996095 }, { "clip_ratio/high_max": 0.005882352963089943, "clip_ratio/high_mean": 0.0029411764815449716, "clip_ratio/low_mean": 0.005718954280018807, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008660130761563778, "completions/clipped_ratio": 0.0, "completions/max_length": 373.6, "completions/max_terminated_length": 373.6, "completions/mean_length": 301.7375, "completions/mean_terminated_length": 301.7375, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.16710587963461876, "epoch": 0.0116, "frac_reward_zero_std": 0.575, "grad_norm": 0.04859064891934395, "kl": 0.8752860724925995, "learning_rate": 9.950484134308594e-06, "loss": -0.0005891127046197652, "num_tokens": 1947235.0, "reward": 0.9448750495910645, "reward_std": 0.2335220217704773, "rewards/env_game_reward/mean": 0.9448750495910645, "rewards/env_game_reward/std": 0.3758414626121521, "sampling/importance_sampling_ratio/max": 1.7692456722259522, "sampling/importance_sampling_ratio/mean": 0.964184021949768, "sampling/importance_sampling_ratio/min": 0.2673730432987213, "sampling/sampling_logp_difference/max": 1.0289928674697877, "sampling/sampling_logp_difference/mean": 0.07320068627595902, "step": 145, "step_time": 2.0029842743999327 }, { "clip_ratio/high_max": 0.005882352963089943, "clip_ratio/high_mean": 0.0029411764815449716, "clip_ratio/low_mean": 0.01180555559694767, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014746732078492641, "completions/clipped_ratio": 0.0, "completions/max_length": 365.6, "completions/max_terminated_length": 365.6, "completions/mean_length": 281.6375, "completions/mean_terminated_length": 281.6375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.11515891700983047, "epoch": 0.012, "frac_reward_zero_std": 0.5, "grad_norm": 0.029918797314167023, "kl": 2.9318640172481536, "learning_rate": 9.950469506842683e-06, "loss": -0.0018826086074113847, "num_tokens": 2012897.0, "reward": 0.9900000214576721, "reward_std": 0.23334523737430574, "rewards/env_game_reward/mean": 0.9900000214576721, "rewards/env_game_reward/std": 0.333386093378067, "sampling/importance_sampling_ratio/max": 1.7696414470672608, "sampling/importance_sampling_ratio/mean": 0.9344987154006958, "sampling/importance_sampling_ratio/min": 0.15813518241047858, "sampling/sampling_logp_difference/max": 1.9756666660308837, "sampling/sampling_logp_difference/mean": 0.10830660909414291, "step": 150, "step_time": 2.127201033801248 }, { "epoch": 0.012, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 360.3333333333333, "eval_completions/max_terminated_length": 360.3333333333333, "eval_completions/mean_length": 317.9583333333333, "eval_completions/mean_terminated_length": 317.9583333333333, "eval_completions/min_length": 249.33333333333334, "eval_completions/min_terminated_length": 249.33333333333334, "eval_entropy": 0.13987144331137338, "eval_frac_reward_zero_std": 0.5833333333333334, "eval_kl": 0.781950831413269, "eval_loss": -0.0002963352599181235, "eval_num_tokens": 2012897.0, "eval_reward": 1.00000003973643, "eval_reward_std": 0.2121320366859436, "eval_rewards/env_game_reward/mean": 1.00000003973643, "eval_rewards/env_game_reward/std": 0.34489662448565167, "eval_runtime": 2.1427, "eval_samples_per_second": 4.667, "eval_sampling/importance_sampling_ratio/max": 1.6415756543477376, "eval_sampling/importance_sampling_ratio/mean": 0.9771247307459513, "eval_sampling/importance_sampling_ratio/min": 0.4607134858767192, "eval_sampling/sampling_logp_difference/max": 1.1493730743726094, "eval_sampling/sampling_logp_difference/mean": 0.07872161269187927, "eval_steps_per_second": 0.933, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0029411764815449716, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029411764815449716, "completions/clipped_ratio": 0.0, "completions/max_length": 365.2, "completions/max_terminated_length": 365.2, "completions/mean_length": 280.625, "completions/mean_terminated_length": 280.625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.13695140182971954, "epoch": 0.0124, "frac_reward_zero_std": 0.7, "grad_norm": 0.02835434302687645, "kl": 0.8980278909206391, "learning_rate": 9.950454223457017e-06, "loss": -0.0009985696524381638, "num_tokens": 2078097.0, "reward": 1.0575000286102294, "reward_std": 0.13788582384586334, "rewards/env_game_reward/mean": 1.0575000286102294, "rewards/env_game_reward/std": 0.2769304394721985, "sampling/importance_sampling_ratio/max": 1.9645647525787353, "sampling/importance_sampling_ratio/mean": 1.053748869895935, "sampling/importance_sampling_ratio/min": 0.1966949909925461, "sampling/sampling_logp_difference/max": 1.4146570682525634, "sampling/sampling_logp_difference/mean": 0.0750476747751236, "step": 155, "step_time": 1.9448262721984064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0029411764815449716, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029411764815449716, "completions/clipped_ratio": 0.0, "completions/max_length": 373.6, "completions/max_terminated_length": 373.6, "completions/mean_length": 287.6125, "completions/mean_terminated_length": 287.6125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.09548097178339958, "epoch": 0.0128, "frac_reward_zero_std": 0.8, "grad_norm": 0.016875844448804855, "kl": 0.9205106794834137, "learning_rate": 9.95043828415428e-06, "loss": -0.0003811331000179052, "num_tokens": 2145572.0, "reward": 1.1175000429153443, "reward_std": 0.09545941650867462, "rewards/env_game_reward/mean": 1.1175000429153443, "rewards/env_game_reward/std": 0.21992212533950806, "sampling/importance_sampling_ratio/max": 1.873227596282959, "sampling/importance_sampling_ratio/mean": 1.035505485534668, "sampling/importance_sampling_ratio/min": 0.2760059699416161, "sampling/sampling_logp_difference/max": 1.6244861602783203, "sampling/sampling_logp_difference/mean": 0.055689787119627, "step": 160, "step_time": 1.9473863302002428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008204334415495396, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008204334415495396, "completions/clipped_ratio": 0.0, "completions/max_length": 365.6, "completions/max_terminated_length": 365.6, "completions/mean_length": 293.95, "completions/mean_terminated_length": 293.95, "completions/min_length": 218.8, "completions/min_terminated_length": 218.8, "entropy": 0.09301564693450928, "epoch": 0.0132, "frac_reward_zero_std": 0.6, "grad_norm": 0.03346140682697296, "kl": 1.1773792251944541, "learning_rate": 9.950421688937273e-06, "loss": -0.0007360159419476986, "num_tokens": 2213963.0, "reward": 1.0425000429153441, "reward_std": 0.20152542889118194, "rewards/env_game_reward/mean": 1.0425000429153441, "rewards/env_game_reward/std": 0.3081572115421295, "sampling/importance_sampling_ratio/max": 1.4281560897827148, "sampling/importance_sampling_ratio/mean": 0.9599952220916748, "sampling/importance_sampling_ratio/min": 0.2669469267129898, "sampling/sampling_logp_difference/max": 1.390862488746643, "sampling/sampling_logp_difference/mean": 0.055699700862169264, "step": 165, "step_time": 1.9470213165986934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009027777798473835, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009027777798473835, "completions/clipped_ratio": 0.0, "completions/max_length": 373.6, "completions/max_terminated_length": 373.6, "completions/mean_length": 284.475, "completions/mean_terminated_length": 284.475, "completions/min_length": 218.8, "completions/min_terminated_length": 218.8, "entropy": 0.07658235654234886, "epoch": 0.0136, "frac_reward_zero_std": 0.675, "grad_norm": 0.03937648609280586, "kl": 1.81198593378067, "learning_rate": 9.950404437808918e-06, "loss": -0.0011821023188531398, "num_tokens": 2279225.0, "reward": 1.0500000238418579, "reward_std": 0.19091882705688476, "rewards/env_game_reward/mean": 1.0500000238418579, "rewards/env_game_reward/std": 0.3320326149463654, "sampling/importance_sampling_ratio/max": 1.477155590057373, "sampling/importance_sampling_ratio/mean": 0.9458814024925232, "sampling/importance_sampling_ratio/min": 0.270341220498085, "sampling/sampling_logp_difference/max": 1.6849958896636963, "sampling/sampling_logp_difference/mean": 0.08588936105370522, "step": 170, "step_time": 2.1512292531995625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.4, "completions/max_terminated_length": 365.4, "completions/mean_length": 303.2, "completions/mean_terminated_length": 303.2, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.09666889570653439, "epoch": 0.014, "frac_reward_zero_std": 0.75, "grad_norm": 0.04141394793987274, "kl": 2.4968443870544434, "learning_rate": 9.950386530772241e-06, "loss": -0.00040261512622237207, "num_tokens": 2348233.0, "reward": 1.1023750305175781, "reward_std": 0.11684939563274384, "rewards/env_game_reward/mean": 1.1023750305175781, "rewards/env_game_reward/std": 0.21312530040740968, "sampling/importance_sampling_ratio/max": 1.6463705539703368, "sampling/importance_sampling_ratio/mean": 1.0285760641098023, "sampling/importance_sampling_ratio/min": 0.4818965196609497, "sampling/sampling_logp_difference/max": 0.8949426889419556, "sampling/sampling_logp_difference/mean": 0.04998132698237896, "step": 175, "step_time": 2.019259786400653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.2, "completions/max_terminated_length": 366.2, "completions/mean_length": 292.625, "completions/mean_terminated_length": 292.625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.11000263169407845, "epoch": 0.0144, "frac_reward_zero_std": 0.625, "grad_norm": 0.014517554081976414, "kl": 3.3274164140224456, "learning_rate": 9.950367967830395e-06, "loss": -0.0005131676327437163, "num_tokens": 2415877.0, "reward": 1.0423750162124634, "reward_std": 0.22291540503501892, "rewards/env_game_reward/mean": 1.0423750162124634, "rewards/env_game_reward/std": 0.3507106065750122, "sampling/importance_sampling_ratio/max": 1.5298500537872315, "sampling/importance_sampling_ratio/mean": 0.9435723781585693, "sampling/importance_sampling_ratio/min": 0.327379421889782, "sampling/sampling_logp_difference/max": 1.2668209314346313, "sampling/sampling_logp_difference/mean": 0.06219653338193894, "step": 180, "step_time": 1.9583896137977717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.002631578966975212, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002631578966975212, "completions/clipped_ratio": 0.0, "completions/max_length": 374.2, "completions/max_terminated_length": 374.2, "completions/mean_length": 281.1875, "completions/mean_terminated_length": 281.1875, "completions/min_length": 199.6, "completions/min_terminated_length": 199.6, "entropy": 0.12678153738379477, "epoch": 0.0148, "frac_reward_zero_std": 0.675, "grad_norm": 0.016833849251270294, "kl": 1.2115282833576202, "learning_rate": 9.950348748986642e-06, "loss": -0.000427194032818079, "num_tokens": 2481693.0, "reward": 1.0798749923706055, "reward_std": 0.16988240480422973, "rewards/env_game_reward/mean": 1.0798749923706055, "rewards/env_game_reward/std": 0.2858531653881073, "sampling/importance_sampling_ratio/max": 1.3460319757461547, "sampling/importance_sampling_ratio/mean": 0.9750611186027527, "sampling/importance_sampling_ratio/min": 0.32110539078712463, "sampling/sampling_logp_difference/max": 1.2185750484466553, "sampling/sampling_logp_difference/mean": 0.04203767627477646, "step": 185, "step_time": 2.0135145276013646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009007352963089943, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009007352963089943, "completions/clipped_ratio": 0.0, "completions/max_length": 373.2, "completions/max_terminated_length": 373.2, "completions/mean_length": 284.75, "completions/mean_terminated_length": 284.75, "completions/min_length": 219.2, "completions/min_terminated_length": 219.2, "entropy": 0.17685981318354607, "epoch": 0.0152, "frac_reward_zero_std": 0.625, "grad_norm": 0.02496095560491085, "kl": 1.321517664194107, "learning_rate": 9.950328874244359e-06, "loss": -0.0008681080304086209, "num_tokens": 2548402.0, "reward": 1.0200000524520874, "reward_std": 0.23334523439407348, "rewards/env_game_reward/mean": 1.0200000524520874, "rewards/env_game_reward/std": 0.350832587480545, "sampling/importance_sampling_ratio/max": 1.5548951625823975, "sampling/importance_sampling_ratio/mean": 0.9757178544998169, "sampling/importance_sampling_ratio/min": 0.40361145734786985, "sampling/sampling_logp_difference/max": 0.9451999545097352, "sampling/sampling_logp_difference/mean": 0.04799589850008488, "step": 190, "step_time": 2.1438089131981544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.002777777798473835, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002777777798473835, "completions/clipped_ratio": 0.0, "completions/max_length": 357.6, "completions/max_terminated_length": 357.6, "completions/mean_length": 278.6, "completions/mean_terminated_length": 278.6, "completions/min_length": 206.8, "completions/min_terminated_length": 206.8, "entropy": 0.16943402960896492, "epoch": 0.0156, "frac_reward_zero_std": 0.775, "grad_norm": 0.04945311322808266, "kl": 0.8556254506111145, "learning_rate": 9.950308343607042e-06, "loss": -0.00024995713029056786, "num_tokens": 2614104.0, "reward": 1.124875020980835, "reward_std": 0.1062427967786789, "rewards/env_game_reward/mean": 1.124875020980835, "rewards/env_game_reward/std": 0.2150747537612915, "sampling/importance_sampling_ratio/max": 1.4159629583358764, "sampling/importance_sampling_ratio/mean": 1.027950894832611, "sampling/importance_sampling_ratio/min": 0.4427237957715988, "sampling/sampling_logp_difference/max": 1.0141066074371339, "sampling/sampling_logp_difference/mean": 0.034100130572915076, "step": 195, "step_time": 1.9601369298005011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0029411764815449716, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029411764815449716, "completions/clipped_ratio": 0.0, "completions/max_length": 373.8, "completions/max_terminated_length": 373.8, "completions/mean_length": 292.3, "completions/mean_terminated_length": 292.3, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.12501999139785766, "epoch": 0.016, "frac_reward_zero_std": 0.825, "grad_norm": 0.035976510494947433, "kl": 1.643832701444626, "learning_rate": 9.950287157078299e-06, "loss": -3.578556934371591e-05, "num_tokens": 2681768.0, "reward": 1.1175000429153443, "reward_std": 0.09545941650867462, "rewards/env_game_reward/mean": 1.1175000429153443, "rewards/env_game_reward/std": 0.1825194239616394, "sampling/importance_sampling_ratio/max": 1.2619178771972657, "sampling/importance_sampling_ratio/mean": 0.9969248056411744, "sampling/importance_sampling_ratio/min": 0.6032956957817077, "sampling/sampling_logp_difference/max": 0.722657385468483, "sampling/sampling_logp_difference/mean": 0.025445305183529852, "step": 200, "step_time": 2.0157902724015004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003125, "completions/clipped_ratio": 0.0, "completions/max_length": 365.8, "completions/max_terminated_length": 365.8, "completions/mean_length": 287.55, "completions/mean_terminated_length": 287.55, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.10517201647162437, "epoch": 0.0164, "frac_reward_zero_std": 0.75, "grad_norm": 8.645147318020463e-05, "kl": 0.9572436332702636, "learning_rate": 9.950265314661852e-06, "loss": -0.0006879229098558426, "num_tokens": 2748656.0, "reward": 1.0875000596046447, "reward_std": 0.11667262017726898, "rewards/env_game_reward/mean": 1.0875000596046447, "rewards/env_game_reward/std": 0.22443267107009887, "sampling/importance_sampling_ratio/max": 1.411534857749939, "sampling/importance_sampling_ratio/mean": 0.97565096616745, "sampling/importance_sampling_ratio/min": 0.28964664936065676, "sampling/sampling_logp_difference/max": 0.8600339412689209, "sampling/sampling_logp_difference/mean": 0.03780459091067314, "step": 205, "step_time": 1.9924305702013954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.002777777798473835, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002777777798473835, "completions/clipped_ratio": 0.0, "completions/max_length": 365.6, "completions/max_terminated_length": 365.6, "completions/mean_length": 289.425, "completions/mean_terminated_length": 289.425, "completions/min_length": 225.8, "completions/min_terminated_length": 225.8, "entropy": 0.06622385941445827, "epoch": 0.0168, "frac_reward_zero_std": 0.9, "grad_norm": 3.81359423045069e-05, "kl": 1.2422339260578155, "learning_rate": 9.950242816361542e-06, "loss": -4.801744944415987e-06, "num_tokens": 2816147.0, "reward": 1.1625000476837157, "reward_std": 0.0530330091714859, "rewards/env_game_reward/mean": 1.1625000476837157, "rewards/env_game_reward/std": 0.12526867985725404, "sampling/importance_sampling_ratio/max": 1.1708146333694458, "sampling/importance_sampling_ratio/mean": 1.0134225606918335, "sampling/importance_sampling_ratio/min": 0.794332218170166, "sampling/sampling_logp_difference/max": 0.3431601271033287, "sampling/sampling_logp_difference/mean": 0.01179146794602275, "step": 210, "step_time": 2.072417707799468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.002631578966975212, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002631578966975212, "completions/clipped_ratio": 0.0, "completions/max_length": 357.6, "completions/max_terminated_length": 357.6, "completions/mean_length": 279.8375, "completions/mean_terminated_length": 279.8375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.09221492633223534, "epoch": 0.0172, "frac_reward_zero_std": 0.725, "grad_norm": 0.019924040883779526, "kl": 6.042136391997337, "learning_rate": 9.950219662181327e-06, "loss": 0.0002026589121669531, "num_tokens": 2882162.0, "reward": 1.0723750352859498, "reward_std": 0.13806260228157044, "rewards/env_game_reward/mean": 1.0723750352859498, "rewards/env_game_reward/std": 0.2875085473060608, "sampling/importance_sampling_ratio/max": 1.1406650304794312, "sampling/importance_sampling_ratio/mean": 0.9631157875061035, "sampling/importance_sampling_ratio/min": 0.5338585242629051, "sampling/sampling_logp_difference/max": 0.9422056198120117, "sampling/sampling_logp_difference/mean": 0.038619527220726015, "step": 215, "step_time": 2.0965528106011333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008333333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008333333395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 373.6, "completions/max_terminated_length": 373.6, "completions/mean_length": 296.625, "completions/mean_terminated_length": 296.625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.209561850130558, "epoch": 0.0176, "frac_reward_zero_std": 0.6, "grad_norm": 0.025828201323747635, "kl": 1.7181314051151275, "learning_rate": 9.950195852125273e-06, "loss": -0.00037424759939312934, "num_tokens": 2950140.0, "reward": 1.0123750209808349, "reward_std": 0.20170221328735352, "rewards/env_game_reward/mean": 1.0123750209808349, "rewards/env_game_reward/std": 0.35673150420188904, "sampling/importance_sampling_ratio/max": 1.5680360555648805, "sampling/importance_sampling_ratio/mean": 0.978592324256897, "sampling/importance_sampling_ratio/min": 0.29607744626700877, "sampling/sampling_logp_difference/max": 1.106997287273407, "sampling/sampling_logp_difference/mean": 0.05600374937057495, "step": 220, "step_time": 2.013816408000275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.8, "completions/max_terminated_length": 365.8, "completions/mean_length": 283.4125, "completions/mean_terminated_length": 283.4125, "completions/min_length": 219.2, "completions/min_terminated_length": 219.2, "entropy": 0.19863322675228118, "epoch": 0.018, "frac_reward_zero_std": 0.55, "grad_norm": 0.0762995183467865, "kl": 1.3156237244606017, "learning_rate": 9.95017138619757e-06, "loss": 9.536017896607518e-05, "num_tokens": 3015791.0, "reward": 1.0125000596046447, "reward_std": 0.22273863554000856, "rewards/env_game_reward/mean": 1.0125000596046447, "rewards/env_game_reward/std": 0.3335274219512939, "sampling/importance_sampling_ratio/max": 1.5515583753585815, "sampling/importance_sampling_ratio/mean": 1.00895494222641, "sampling/importance_sampling_ratio/min": 0.5095189124345779, "sampling/sampling_logp_difference/max": 0.8763931155204773, "sampling/sampling_logp_difference/mean": 0.040667933598160746, "step": 225, "step_time": 2.018751011400309 }, { "epoch": 0.018, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 360.0, "eval_completions/max_terminated_length": 360.0, "eval_completions/mean_length": 317.75, "eval_completions/mean_terminated_length": 317.75, "eval_completions/min_length": 249.33333333333334, "eval_completions/min_terminated_length": 249.33333333333334, "eval_entropy": 0.1765607347091039, "eval_frac_reward_zero_std": 0.4166666666666667, "eval_kl": 1.3525162537892659, "eval_loss": -0.0003587036917451769, "eval_num_tokens": 3015791.0, "eval_reward": 0.9500000476837158, "eval_reward_std": 0.3535533944765727, "eval_rewards/env_game_reward/mean": 0.9500000476837158, "eval_rewards/env_game_reward/std": 0.39218372106552124, "eval_runtime": 2.0879, "eval_samples_per_second": 4.789, "eval_sampling/importance_sampling_ratio/max": 1.2104533513387044, "eval_sampling/importance_sampling_ratio/mean": 0.9633649388949076, "eval_sampling/importance_sampling_ratio/min": 0.3959500590960185, "eval_sampling/sampling_logp_difference/max": 1.0513638655344646, "eval_sampling/sampling_logp_difference/mean": 0.04657792175809542, "eval_steps_per_second": 0.958, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.002631578966975212, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002631578966975212, "completions/clipped_ratio": 0.0, "completions/max_length": 374.4, "completions/max_terminated_length": 374.4, "completions/mean_length": 303.125, "completions/mean_terminated_length": 303.125, "completions/min_length": 234.4, "completions/min_terminated_length": 234.4, "entropy": 0.1315464749932289, "epoch": 0.0184, "frac_reward_zero_std": 0.625, "grad_norm": 0.017125917598605156, "kl": 1.0924064695835114, "learning_rate": 9.950146264402513e-06, "loss": -0.0007199865765869617, "num_tokens": 3085648.0, "reward": 1.0575000405311585, "reward_std": 0.2015254318714142, "rewards/env_game_reward/mean": 1.0575000405311585, "rewards/env_game_reward/std": 0.2932907700538635, "sampling/importance_sampling_ratio/max": 1.2461576223373414, "sampling/importance_sampling_ratio/mean": 0.9759754657745361, "sampling/importance_sampling_ratio/min": 0.4818070411682129, "sampling/sampling_logp_difference/max": 0.7828710556030274, "sampling/sampling_logp_difference/mean": 0.033535952866077426, "step": 230, "step_time": 1.964708850999159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003125, "completions/clipped_ratio": 0.0, "completions/max_length": 357.6, "completions/max_terminated_length": 357.6, "completions/mean_length": 277.425, "completions/mean_terminated_length": 277.425, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.0799025647342205, "epoch": 0.0188, "frac_reward_zero_std": 0.775, "grad_norm": 0.10008827596902847, "kl": 2.224274104833603, "learning_rate": 9.950120486744523e-06, "loss": 8.720792829990387e-05, "num_tokens": 3151112.0, "reward": 1.0950000286102295, "reward_std": 0.1484924226999283, "rewards/env_game_reward/mean": 1.0950000286102295, "rewards/env_game_reward/std": 0.2522334337234497, "sampling/importance_sampling_ratio/max": 1.1456147193908692, "sampling/importance_sampling_ratio/mean": 0.9904715538024902, "sampling/importance_sampling_ratio/min": 0.647669506072998, "sampling/sampling_logp_difference/max": 0.4812516301870346, "sampling/sampling_logp_difference/mean": 0.01942737139761448, "step": 235, "step_time": 2.084714898400125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005882352963089943, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005882352963089943, "completions/clipped_ratio": 0.0, "completions/max_length": 373.4, "completions/max_terminated_length": 373.4, "completions/mean_length": 293.65, "completions/mean_terminated_length": 293.65, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.06182103715837002, "epoch": 0.0192, "frac_reward_zero_std": 0.85, "grad_norm": 0.014402960427105427, "kl": 1.220539104938507, "learning_rate": 9.950094053228128e-06, "loss": -0.000241970201022923, "num_tokens": 3218802.0, "reward": 1.140000057220459, "reward_std": 0.08485281467437744, "rewards/env_game_reward/mean": 1.140000057220459, "rewards/env_game_reward/std": 0.16455072164535522, "sampling/importance_sampling_ratio/max": 1.1483745574951172, "sampling/importance_sampling_ratio/mean": 1.0016549229621887, "sampling/importance_sampling_ratio/min": 0.740897786617279, "sampling/sampling_logp_difference/max": 0.4138609737157822, "sampling/sampling_logp_difference/mean": 0.012947980128228665, "step": 240, "step_time": 1.9686758019997797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.2, "completions/max_terminated_length": 374.2, "completions/mean_length": 295.5375, "completions/mean_terminated_length": 295.5375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.06896210089325905, "epoch": 0.0196, "frac_reward_zero_std": 0.85, "grad_norm": 3.613123408285901e-05, "kl": 1.0657689243555069, "learning_rate": 9.950066963857978e-06, "loss": 5.23922499269247e-06, "num_tokens": 3286788.0, "reward": 1.139875054359436, "reward_std": 0.08502959161996841, "rewards/env_game_reward/mean": 1.139875054359436, "rewards/env_game_reward/std": 0.1967564880847931, "sampling/importance_sampling_ratio/max": 1.1366309642791748, "sampling/importance_sampling_ratio/mean": 1.0147639751434325, "sampling/importance_sampling_ratio/min": 0.8659112334251404, "sampling/sampling_logp_difference/max": 0.27902010679244993, "sampling/sampling_logp_difference/mean": 0.011635956121608615, "step": 245, "step_time": 1.9748669714004792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.4, "completions/max_terminated_length": 365.4, "completions/mean_length": 289.75, "completions/mean_terminated_length": 289.75, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.05834418162703514, "epoch": 0.02, "frac_reward_zero_std": 0.825, "grad_norm": 0.10833176970481873, "kl": 1.9543183267116546, "learning_rate": 9.950039218638832e-06, "loss": 7.4031762778759e-05, "num_tokens": 3353652.0, "reward": 1.1325000286102296, "reward_std": 0.09545941650867462, "rewards/env_game_reward/mean": 1.1325000286102296, "rewards/env_game_reward/std": 0.22625648975372314, "sampling/importance_sampling_ratio/max": 1.149832510948181, "sampling/importance_sampling_ratio/mean": 0.9904601097106933, "sampling/importance_sampling_ratio/min": 0.6639677166938782, "sampling/sampling_logp_difference/max": 0.6797393321990967, "sampling/sampling_logp_difference/mean": 0.017544577736407517, "step": 250, "step_time": 1.967866663800669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001923076994717121, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001923076994717121, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 447.3875, "completions/mean_terminated_length": 447.3875, "completions/min_length": 344.8, "completions/min_terminated_length": 344.8, "entropy": 0.05960344485938549, "epoch": 0.0204, "frac_reward_zero_std": 0.725, "grad_norm": 0.007863137871026993, "kl": 1.9300155520439148, "learning_rate": 9.95001081757557e-06, "loss": -0.0004777685273438692, "num_tokens": 3433435.0, "reward": 1.1585000038146973, "reward_std": 0.1282220296561718, "rewards/env_game_reward/mean": 1.1585000038146973, "rewards/env_game_reward/std": 0.2555761218070984, "sampling/importance_sampling_ratio/max": 1.133116865158081, "sampling/importance_sampling_ratio/mean": 0.9558210611343384, "sampling/importance_sampling_ratio/min": 0.4158107668161392, "sampling/sampling_logp_difference/max": 1.0225807905197144, "sampling/sampling_logp_difference/mean": 0.02713311556726694, "step": 255, "step_time": 2.663749648000521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0018518518656492234, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018518518656492234, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 577.2875, "completions/mean_terminated_length": 577.2875, "completions/min_length": 421.6, "completions/min_terminated_length": 421.6, "entropy": 0.10376667603850365, "epoch": 0.0208, "frac_reward_zero_std": 0.65, "grad_norm": 0.03462546318769455, "kl": 1.5407811641693114, "learning_rate": 9.94998176067318e-06, "loss": 0.0002572691533714533, "num_tokens": 3523855.0, "reward": 1.0450416326522827, "reward_std": 0.17671776488423346, "rewards/env_game_reward/mean": 1.0450416326522827, "rewards/env_game_reward/std": 0.45010764002799986, "sampling/importance_sampling_ratio/max": 1.5040478706359863, "sampling/importance_sampling_ratio/mean": 0.9873509287834168, "sampling/importance_sampling_ratio/min": 0.370639518648386, "sampling/sampling_logp_difference/max": 1.1807260036468505, "sampling/sampling_logp_difference/mean": 0.04055641330778599, "step": 260, "step_time": 2.858658364200528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001923076994717121, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001923076994717121, "completions/clipped_ratio": 0.0, "completions/max_length": 711.8, "completions/max_terminated_length": 711.8, "completions/mean_length": 536.9, "completions/mean_terminated_length": 536.9, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "entropy": 0.08738742098212242, "epoch": 0.0212, "frac_reward_zero_std": 0.6, "grad_norm": 0.020184623077511787, "kl": 1.8179149568080901, "learning_rate": 9.949952047936776e-06, "loss": -0.00012561121257022024, "num_tokens": 3610314.0, "reward": 1.1425416231155396, "reward_std": 0.16104355901479722, "rewards/env_game_reward/mean": 1.1425416231155396, "rewards/env_game_reward/std": 0.3060436934232712, "sampling/importance_sampling_ratio/max": 1.5381874561309814, "sampling/importance_sampling_ratio/mean": 0.9935122966766358, "sampling/importance_sampling_ratio/min": 0.4440208673477173, "sampling/sampling_logp_difference/max": 0.8807984113693237, "sampling/sampling_logp_difference/mean": 0.026784875988960268, "step": 265, "step_time": 2.800866130801296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.6, "completions/max_terminated_length": 714.6, "completions/mean_length": 565.0875, "completions/mean_terminated_length": 565.0875, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "entropy": 0.11295722424983978, "epoch": 0.0216, "frac_reward_zero_std": 0.675, "grad_norm": 0.03279825299978256, "kl": 0.9358411014080048, "learning_rate": 9.949921679371578e-06, "loss": -0.0005372571758925915, "num_tokens": 3699396.0, "reward": 1.15362491607666, "reward_std": 0.14572291895747186, "rewards/env_game_reward/mean": 1.15362491607666, "rewards/env_game_reward/std": 0.2874349907040596, "sampling/importance_sampling_ratio/max": 1.4842311143875122, "sampling/importance_sampling_ratio/mean": 1.0119757771492004, "sampling/importance_sampling_ratio/min": 0.4642439320683479, "sampling/sampling_logp_difference/max": 0.8794727861881256, "sampling/sampling_logp_difference/mean": 0.02908453196287155, "step": 270, "step_time": 2.7460473307997746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003708791360259056, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003708791360259056, "completions/clipped_ratio": 0.0, "completions/max_length": 729.6, "completions/max_terminated_length": 729.6, "completions/mean_length": 578.45, "completions/mean_terminated_length": 578.45, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "entropy": 0.10119448900222779, "epoch": 0.022, "frac_reward_zero_std": 0.525, "grad_norm": 0.040497273206710815, "kl": 1.5820525407791137, "learning_rate": 9.949890654982923e-06, "loss": -0.00091603584587574, "num_tokens": 3790479.0, "reward": 1.088499939441681, "reward_std": 0.26846486926078794, "rewards/env_game_reward/mean": 1.088499939441681, "rewards/env_game_reward/std": 0.408444818854332, "sampling/importance_sampling_ratio/max": 1.219689965248108, "sampling/importance_sampling_ratio/mean": 0.9480740189552307, "sampling/importance_sampling_ratio/min": 0.3397506684064865, "sampling/sampling_logp_difference/max": 1.1102912425994873, "sampling/sampling_logp_difference/mean": 0.030945492908358574, "step": 275, "step_time": 2.881785232200491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0037749288603663445, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0037749288603663445, "completions/clipped_ratio": 0.0, "completions/max_length": 715.2, "completions/max_terminated_length": 715.2, "completions/mean_length": 589.2375, "completions/mean_terminated_length": 589.2375, "completions/min_length": 422.4, "completions/min_terminated_length": 422.4, "entropy": 0.08195799887180329, "epoch": 0.0224, "frac_reward_zero_std": 0.625, "grad_norm": 0.023325594142079353, "kl": 1.135708212852478, "learning_rate": 9.949858974776267e-06, "loss": -0.000301552121527493, "num_tokens": 3882655.0, "reward": 1.191166639328003, "reward_std": 0.15391356945037843, "rewards/env_game_reward/mean": 1.191166639328003, "rewards/env_game_reward/std": 0.2715784251689911, "sampling/importance_sampling_ratio/max": 1.4824723482131958, "sampling/importance_sampling_ratio/mean": 1.0203513145446776, "sampling/importance_sampling_ratio/min": 0.5855113506317139, "sampling/sampling_logp_difference/max": 0.7591158509254455, "sampling/sampling_logp_difference/mean": 0.025018543750047684, "step": 280, "step_time": 2.7125778502006144 }, { "clip_ratio/high_max": 0.003333333507180214, "clip_ratio/high_mean": 0.001666666753590107, "clip_ratio/low_mean": 0.005631868354976177, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0072985351085662845, "completions/clipped_ratio": 0.0, "completions/max_length": 731.2, "completions/max_terminated_length": 731.2, "completions/mean_length": 586.3, "completions/mean_terminated_length": 586.3, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "entropy": 0.08055115789175034, "epoch": 0.0228, "frac_reward_zero_std": 0.6, "grad_norm": 0.028693703934550285, "kl": 1.138446581363678, "learning_rate": 9.949826638757177e-06, "loss": -0.0006620377767831087, "num_tokens": 3973740.0, "reward": 1.0940416216850282, "reward_std": 0.19934517294168472, "rewards/env_game_reward/mean": 1.0940416216850282, "rewards/env_game_reward/std": 0.37726728320121766, "sampling/importance_sampling_ratio/max": 1.7476803302764892, "sampling/importance_sampling_ratio/mean": 1.0127242803573608, "sampling/importance_sampling_ratio/min": 0.5199615359306335, "sampling/sampling_logp_difference/max": 0.7218784093856812, "sampling/sampling_logp_difference/mean": 0.02444089874625206, "step": 285, "step_time": 2.7483612370000627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.6, "completions/max_terminated_length": 714.6, "completions/mean_length": 537.3, "completions/mean_terminated_length": 537.3, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "entropy": 0.049627304822206494, "epoch": 0.0232, "frac_reward_zero_std": 0.85, "grad_norm": 0.03257312625646591, "kl": 0.7880214869976043, "learning_rate": 9.949793646931339e-06, "loss": -4.999106749892235e-06, "num_tokens": 4059661.0, "reward": 1.2024999141693116, "reward_std": 0.09192387312650681, "rewards/env_game_reward/mean": 1.2024999141693116, "rewards/env_game_reward/std": 0.23650072515010834, "sampling/importance_sampling_ratio/max": 1.458734655380249, "sampling/importance_sampling_ratio/mean": 1.007603394985199, "sampling/importance_sampling_ratio/min": 0.7188777476549149, "sampling/sampling_logp_difference/max": 0.6383673250675201, "sampling/sampling_logp_difference/mean": 0.020518184872344136, "step": 290, "step_time": 2.739068496599066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0017241379246115685, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017241379246115685, "completions/clipped_ratio": 0.0, "completions/max_length": 711.6, "completions/max_terminated_length": 711.6, "completions/mean_length": 546.625, "completions/mean_terminated_length": 546.625, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "entropy": 0.06519524082541465, "epoch": 0.0236, "frac_reward_zero_std": 0.625, "grad_norm": 0.07401054352521896, "kl": 1.4018354892730713, "learning_rate": 9.949759999304552e-06, "loss": -0.00022121451329439878, "num_tokens": 4146522.0, "reward": 1.1591666221618653, "reward_std": 0.168527103215456, "rewards/env_game_reward/mean": 1.1591666221618653, "rewards/env_game_reward/std": 0.28247022032737734, "sampling/importance_sampling_ratio/max": 1.5240054845809936, "sampling/importance_sampling_ratio/mean": 1.0128975629806518, "sampling/importance_sampling_ratio/min": 0.6516130924224853, "sampling/sampling_logp_difference/max": 0.5109567165374755, "sampling/sampling_logp_difference/mean": 0.01806719144806266, "step": 295, "step_time": 2.7780750279984203 }, { "clip_ratio/high_max": 0.003846153989434242, "clip_ratio/high_mean": 0.001923076994717121, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001923076994717121, "completions/clipped_ratio": 0.0, "completions/max_length": 696.4, "completions/max_terminated_length": 696.4, "completions/mean_length": 552.8625, "completions/mean_terminated_length": 552.8625, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "entropy": 0.05754914581775665, "epoch": 0.024, "frac_reward_zero_std": 0.8, "grad_norm": 0.004387096501886845, "kl": 0.762263560295105, "learning_rate": 9.949725695882732e-06, "loss": 0.00016510837012901903, "num_tokens": 4234609.0, "reward": 1.234499979019165, "reward_std": 0.06163613530807197, "rewards/env_game_reward/mean": 1.234499979019165, "rewards/env_game_reward/std": 0.15678054541349412, "sampling/importance_sampling_ratio/max": 1.2199280977249145, "sampling/importance_sampling_ratio/mean": 1.0034643888473511, "sampling/importance_sampling_ratio/min": 0.7295505046844483, "sampling/sampling_logp_difference/max": 0.5000777631998062, "sampling/sampling_logp_difference/mean": 0.012467277515679599, "step": 300, "step_time": 2.7151460476001374 }, { "epoch": 0.024, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 703.3333333333334, "eval_completions/max_terminated_length": 703.3333333333334, "eval_completions/mean_length": 617.3333333333334, "eval_completions/mean_terminated_length": 617.3333333333334, "eval_completions/min_length": 481.6666666666667, "eval_completions/min_terminated_length": 481.6666666666667, "eval_entropy": 0.0684248556693395, "eval_frac_reward_zero_std": 0.75, "eval_kl": 1.430545687675476, "eval_loss": 0.0002169054205296561, "eval_num_tokens": 4234609.0, "eval_reward": 1.2093055248260498, "eval_reward_std": 0.07719248533248901, "eval_rewards/env_game_reward/mean": 1.2093055248260498, "eval_rewards/env_game_reward/std": 0.15293016036351523, "eval_runtime": 2.8392, "eval_samples_per_second": 3.522, "eval_sampling/importance_sampling_ratio/max": 1.2929344177246094, "eval_sampling/importance_sampling_ratio/mean": 1.0424925486246746, "eval_sampling/importance_sampling_ratio/min": 0.8097424507141113, "eval_sampling/sampling_logp_difference/max": 0.27991748849550885, "eval_sampling/sampling_logp_difference/mean": 0.015183204164107641, "eval_steps_per_second": 0.704, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 528.225, "completions/mean_terminated_length": 528.225, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "entropy": 0.0414741288870573, "epoch": 0.0244, "frac_reward_zero_std": 0.825, "grad_norm": 0.00968001689761877, "kl": 1.175690919160843, "learning_rate": 9.949690736671905e-06, "loss": -1.1410797014832497e-05, "num_tokens": 4319539.0, "reward": 1.2401665925979615, "reward_std": 0.06929646017961204, "rewards/env_game_reward/mean": 1.2401665925979615, "rewards/env_game_reward/std": 0.18133917301893235, "sampling/importance_sampling_ratio/max": 1.2353633403778077, "sampling/importance_sampling_ratio/mean": 0.9993547916412353, "sampling/importance_sampling_ratio/min": 0.5523549929261208, "sampling/sampling_logp_difference/max": 1.0161285698413849, "sampling/sampling_logp_difference/mean": 0.01637693466618657, "step": 305, "step_time": 2.655280681199656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001785714365541935, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001785714365541935, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 581.925, "completions/mean_terminated_length": 581.925, "completions/min_length": 452.4, "completions/min_terminated_length": 452.4, "entropy": 0.05037630945444107, "epoch": 0.0248, "frac_reward_zero_std": 0.8, "grad_norm": 0.01921161077916622, "kl": 1.3029194891452789, "learning_rate": 9.949655121678223e-06, "loss": -0.00036651836708188055, "num_tokens": 4411139.0, "reward": 1.234999942779541, "reward_std": 0.09192387759685516, "rewards/env_game_reward/mean": 1.234999942779541, "rewards/env_game_reward/std": 0.16591952741146088, "sampling/importance_sampling_ratio/max": 1.1560017108917235, "sampling/importance_sampling_ratio/mean": 0.9992516279220581, "sampling/importance_sampling_ratio/min": 0.6994159758090973, "sampling/sampling_logp_difference/max": 0.527313782274723, "sampling/sampling_logp_difference/mean": 0.012010546866804361, "step": 310, "step_time": 2.6350250776005852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.8, "completions/max_terminated_length": 713.8, "completions/mean_length": 542.4375, "completions/mean_terminated_length": 542.4375, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "entropy": 0.054155930504202844, "epoch": 0.0252, "frac_reward_zero_std": 0.75, "grad_norm": 0.020733093842864037, "kl": 1.5725973546504974, "learning_rate": 9.949618850907941e-06, "loss": 0.00024796247016638516, "num_tokens": 4498257.0, "reward": 1.2346249580383302, "reward_std": 0.09210064932703972, "rewards/env_game_reward/mean": 1.2346249580383302, "rewards/env_game_reward/std": 0.16266353130340577, "sampling/importance_sampling_ratio/max": 1.3359755039215089, "sampling/importance_sampling_ratio/mean": 1.0149800062179566, "sampling/importance_sampling_ratio/min": 0.7080724120140076, "sampling/sampling_logp_difference/max": 0.4858335077762604, "sampling/sampling_logp_difference/mean": 0.011928107030689717, "step": 315, "step_time": 2.831428350000351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.002083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002083333395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 579.0625, "completions/mean_terminated_length": 579.0625, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "entropy": 0.07051767148077488, "epoch": 0.0256, "frac_reward_zero_std": 0.7, "grad_norm": 0.01131022721529007, "kl": 0.9661010444164276, "learning_rate": 9.949581924367437e-06, "loss": -0.0006595293991267682, "num_tokens": 4588831.0, "reward": 1.2023749113082887, "reward_std": 0.13806259408593177, "rewards/env_game_reward/mean": 1.2023749113082887, "rewards/env_game_reward/std": 0.2536344364285469, "sampling/importance_sampling_ratio/max": 1.3653532981872558, "sampling/importance_sampling_ratio/mean": 1.0079243421554565, "sampling/importance_sampling_ratio/min": 0.5111093282699585, "sampling/sampling_logp_difference/max": 0.7481902837753296, "sampling/sampling_logp_difference/mean": 0.020597192458808422, "step": 320, "step_time": 2.711340737000137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.4, "completions/max_terminated_length": 714.4, "completions/mean_length": 556.4625, "completions/mean_terminated_length": 556.4625, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "entropy": 0.059214940294623375, "epoch": 0.026, "frac_reward_zero_std": 0.7, "grad_norm": 0.03349935635924339, "kl": 1.0856446743011474, "learning_rate": 9.9495443420632e-06, "loss": -0.0004986363928765059, "num_tokens": 4676780.0, "reward": 1.191541600227356, "reward_std": 0.12274194019846618, "rewards/env_game_reward/mean": 1.191541600227356, "rewards/env_game_reward/std": 0.2498372197151184, "sampling/importance_sampling_ratio/max": 1.12011821269989, "sampling/importance_sampling_ratio/mean": 0.9541650176048279, "sampling/importance_sampling_ratio/min": 0.45518134236335756, "sampling/sampling_logp_difference/max": 1.0134361952543258, "sampling/sampling_logp_difference/mean": 0.02279095593839884, "step": 325, "step_time": 2.610014723199856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.2, "completions/max_terminated_length": 698.2, "completions/mean_length": 564.2, "completions/mean_terminated_length": 564.2, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "entropy": 0.06921758279204368, "epoch": 0.0264, "frac_reward_zero_std": 0.55, "grad_norm": 0.04242366924881935, "kl": 1.510494002699852, "learning_rate": 9.949506104001843e-06, "loss": -0.000468868063762784, "num_tokens": 4765601.0, "reward": 1.1211249828338623, "reward_std": 0.20700550079345703, "rewards/env_game_reward/mean": 1.1211249828338623, "rewards/env_game_reward/std": 0.34373362362384796, "sampling/importance_sampling_ratio/max": 1.384414553642273, "sampling/importance_sampling_ratio/mean": 0.9697236299514771, "sampling/importance_sampling_ratio/min": 0.4015564054250717, "sampling/sampling_logp_difference/max": 1.096415627002716, "sampling/sampling_logp_difference/mean": 0.023835126869380475, "step": 330, "step_time": 2.618563677399652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0038518518209457397, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0038518518209457397, "completions/clipped_ratio": 0.0, "completions/max_length": 731.8, "completions/max_terminated_length": 731.8, "completions/mean_length": 577.3, "completions/mean_terminated_length": 577.3, "completions/min_length": 396.8, "completions/min_terminated_length": 396.8, "entropy": 0.07292215041816234, "epoch": 0.0268, "frac_reward_zero_std": 0.8, "grad_norm": 0.010667292401194572, "kl": 1.164840006828308, "learning_rate": 9.94946721019008e-06, "loss": -0.0007914766669273376, "num_tokens": 4855821.0, "reward": 1.2240416049957275, "reward_std": 0.09210064932703972, "rewards/env_game_reward/mean": 1.2240416049957275, "rewards/env_game_reward/std": 0.18175842016935348, "sampling/importance_sampling_ratio/max": 1.1197868585586548, "sampling/importance_sampling_ratio/mean": 0.9610379815101624, "sampling/importance_sampling_ratio/min": 0.43021807074546814, "sampling/sampling_logp_difference/max": 6.813688334822655, "sampling/sampling_logp_difference/mean": 0.10667178835719823, "step": 335, "step_time": 2.7418013950002207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0018518518656492234, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018518518656492234, "completions/clipped_ratio": 0.0, "completions/max_length": 730.2, "completions/max_terminated_length": 730.2, "completions/mean_length": 560.125, "completions/mean_terminated_length": 560.125, "completions/min_length": 420.8, "completions/min_terminated_length": 420.8, "entropy": 0.05461762771010399, "epoch": 0.0272, "frac_reward_zero_std": 0.75, "grad_norm": 0.008303332142531872, "kl": 1.1738501071929932, "learning_rate": 9.949427660634754e-06, "loss": -0.00020156898535788058, "num_tokens": 4944599.0, "reward": 1.2348749399185182, "reward_std": 0.09210065379738808, "rewards/env_game_reward/mean": 1.2348749399185182, "rewards/env_game_reward/std": 0.19004902094602585, "sampling/importance_sampling_ratio/max": 1.2192065000534058, "sampling/importance_sampling_ratio/mean": 0.993685245513916, "sampling/importance_sampling_ratio/min": 0.5723252117633819, "sampling/sampling_logp_difference/max": 0.6896804094314575, "sampling/sampling_logp_difference/mean": 0.01418588999658823, "step": 340, "step_time": 2.7300288166021347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001923076994717121, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001923076994717121, "completions/clipped_ratio": 0.0, "completions/max_length": 715.4, "completions/max_terminated_length": 715.4, "completions/mean_length": 569.075, "completions/mean_terminated_length": 569.075, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "entropy": 0.06558555997908115, "epoch": 0.0276, "frac_reward_zero_std": 0.7, "grad_norm": 0.049474235624074936, "kl": 1.3128838062286377, "learning_rate": 9.949387455342814e-06, "loss": -0.00022792629897594452, "num_tokens": 5033871.0, "reward": 1.2022499322891236, "reward_std": 0.13823936092667283, "rewards/env_game_reward/mean": 1.2022499322891236, "rewards/env_game_reward/std": 0.24752375930547715, "sampling/importance_sampling_ratio/max": 1.2796708583831786, "sampling/importance_sampling_ratio/mean": 0.9890965223312378, "sampling/importance_sampling_ratio/min": 0.5188629031181335, "sampling/sampling_logp_difference/max": 0.6587271898984909, "sampling/sampling_logp_difference/mean": 0.015436801221221685, "step": 345, "step_time": 2.8809383545994933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.002083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002083333395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 729.8, "completions/max_terminated_length": 729.8, "completions/mean_length": 572.375, "completions/mean_terminated_length": 572.375, "completions/min_length": 421.6, "completions/min_terminated_length": 421.6, "entropy": 0.06578464694321155, "epoch": 0.028, "frac_reward_zero_std": 0.825, "grad_norm": 0.00010816263238666579, "kl": 1.1110517114400864, "learning_rate": 9.949346594321329e-06, "loss": -0.0002046652138233185, "num_tokens": 5123373.0, "reward": 1.2458332538604737, "reward_std": 0.07660322934389115, "rewards/env_game_reward/mean": 1.2458332538604737, "rewards/env_game_reward/std": 0.16050206124782562, "sampling/importance_sampling_ratio/max": 1.1853637456893922, "sampling/importance_sampling_ratio/mean": 0.9942168712615966, "sampling/importance_sampling_ratio/min": 0.770496129989624, "sampling/sampling_logp_difference/max": 0.3065337672829628, "sampling/sampling_logp_difference/mean": 0.010001219715923071, "step": 350, "step_time": 2.7594579180004075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 590.825, "completions/mean_terminated_length": 590.825, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "entropy": 0.0713444285094738, "epoch": 0.0284, "frac_reward_zero_std": 0.775, "grad_norm": 0.02447124384343624, "kl": 0.9440391480922699, "learning_rate": 9.949305077577481e-06, "loss": -0.0007917589507997036, "num_tokens": 5214490.0, "reward": 1.2240416049957275, "reward_std": 0.07678000405430793, "rewards/env_game_reward/mean": 1.2240416049957275, "rewards/env_game_reward/std": 0.20674404948949815, "sampling/importance_sampling_ratio/max": 1.48435320854187, "sampling/importance_sampling_ratio/mean": 1.0448272943496704, "sampling/importance_sampling_ratio/min": 0.6205300271511078, "sampling/sampling_logp_difference/max": 0.699562880396843, "sampling/sampling_logp_difference/mean": 0.019524367339909077, "step": 355, "step_time": 2.7076195863992325 }, { "clip_ratio/high_max": 0.003846153989434242, "clip_ratio/high_mean": 0.001923076994717121, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001923076994717121, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 547.5, "completions/mean_terminated_length": 547.5, "completions/min_length": 408.2, "completions/min_terminated_length": 408.2, "entropy": 0.07489714697003365, "epoch": 0.0288, "frac_reward_zero_std": 0.7, "grad_norm": 0.018365204334259033, "kl": 1.3538099706172944, "learning_rate": 9.949262905118568e-06, "loss": -0.0006200538948178292, "num_tokens": 5301101.0, "reward": 1.2171249389648438, "reward_std": 0.10188229638151824, "rewards/env_game_reward/mean": 1.2171249389648438, "rewards/env_game_reward/std": 0.18957828879356384, "sampling/importance_sampling_ratio/max": 1.3922180652618408, "sampling/importance_sampling_ratio/mean": 0.9885348677635193, "sampling/importance_sampling_ratio/min": 0.6369173884391784, "sampling/sampling_logp_difference/max": 0.6018468141555786, "sampling/sampling_logp_difference/mean": 0.018113284837454557, "step": 360, "step_time": 2.585118543999852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001923076994717121, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001923076994717121, "completions/clipped_ratio": 0.0, "completions/max_length": 715.6, "completions/max_terminated_length": 715.6, "completions/mean_length": 547.125, "completions/mean_terminated_length": 547.125, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "entropy": 0.061598078906536104, "epoch": 0.0292, "frac_reward_zero_std": 0.775, "grad_norm": 0.02872176468372345, "kl": 1.7623548924922943, "learning_rate": 9.949220076952004e-06, "loss": 0.00013176617212593556, "num_tokens": 5387242.0, "reward": 1.229458260536194, "reward_std": 0.09976096972823142, "rewards/env_game_reward/mean": 1.229458260536194, "rewards/env_game_reward/std": 0.18861736208200455, "sampling/importance_sampling_ratio/max": 1.1277338027954102, "sampling/importance_sampling_ratio/mean": 0.9845755219459533, "sampling/importance_sampling_ratio/min": 0.6804550766944886, "sampling/sampling_logp_difference/max": 0.5396802634000778, "sampling/sampling_logp_difference/mean": 0.012317924201488495, "step": 365, "step_time": 2.8329780505999222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.6, "completions/max_terminated_length": 714.6, "completions/mean_length": 546.5375, "completions/mean_terminated_length": 546.5375, "completions/min_length": 421.6, "completions/min_terminated_length": 421.6, "entropy": 0.06278647035360337, "epoch": 0.0296, "frac_reward_zero_std": 0.8, "grad_norm": 0.008219737559556961, "kl": 1.525150203704834, "learning_rate": 9.949176593085315e-06, "loss": 0.00031437948346138, "num_tokens": 5474001.0, "reward": 1.191666555404663, "reward_std": 0.09192387461662292, "rewards/env_game_reward/mean": 1.191666555404663, "rewards/env_game_reward/std": 0.244113752245903, "sampling/importance_sampling_ratio/max": 1.2466303348541259, "sampling/importance_sampling_ratio/mean": 0.99187251329422, "sampling/importance_sampling_ratio/min": 0.6674005150794983, "sampling/sampling_logp_difference/max": 0.5849884033203125, "sampling/sampling_logp_difference/mean": 0.013392899371683597, "step": 370, "step_time": 2.6906030645986903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 558.05, "completions/mean_terminated_length": 558.05, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "entropy": 0.06691324710845947, "epoch": 0.03, "frac_reward_zero_std": 0.775, "grad_norm": 0.016666624695062637, "kl": 0.9236783385276794, "learning_rate": 9.94913245352615e-06, "loss": 7.541242521256209e-05, "num_tokens": 5561757.0, "reward": 1.2079166412353515, "reward_std": 0.11490484774112701, "rewards/env_game_reward/mean": 1.2079166412353515, "rewards/env_game_reward/std": 0.2623890072107315, "sampling/importance_sampling_ratio/max": 1.1636013031005858, "sampling/importance_sampling_ratio/mean": 0.9869522213935852, "sampling/importance_sampling_ratio/min": 0.5432810723781586, "sampling/sampling_logp_difference/max": 0.6354977548122406, "sampling/sampling_logp_difference/mean": 0.011450758669525385, "step": 375, "step_time": 2.7167678525976955 }, { "epoch": 0.03, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 702.3333333333334, "eval_completions/max_terminated_length": 702.3333333333334, "eval_completions/mean_length": 618.25, "eval_completions/mean_terminated_length": 618.25, "eval_completions/min_length": 482.3333333333333, "eval_completions/min_terminated_length": 482.3333333333333, "eval_entropy": 0.08387033144632976, "eval_frac_reward_zero_std": 0.5, "eval_kl": 0.9737871090571085, "eval_loss": -0.0007849211106076837, "eval_num_tokens": 5561757.0, "eval_reward": 1.1370832522710164, "eval_reward_std": 0.23039893666282296, "eval_rewards/env_game_reward/mean": 1.1370832522710164, "eval_rewards/env_game_reward/std": 0.30880257207900286, "eval_runtime": 2.8647, "eval_samples_per_second": 3.491, "eval_sampling/importance_sampling_ratio/max": 1.4039711157480876, "eval_sampling/importance_sampling_ratio/mean": 1.006952742735545, "eval_sampling/importance_sampling_ratio/min": 0.7436001698176066, "eval_sampling/sampling_logp_difference/max": 0.5522954861323038, "eval_sampling/sampling_logp_difference/mean": 0.019840245756010216, "eval_steps_per_second": 0.698, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0014705882407724858, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014705882407724858, "completions/clipped_ratio": 0.0, "completions/max_length": 937.8, "completions/max_terminated_length": 937.8, "completions/mean_length": 735.0, "completions/mean_terminated_length": 735.0, "completions/min_length": 517.2, "completions/min_terminated_length": 517.2, "entropy": 0.0695138342678547, "epoch": 0.0304, "frac_reward_zero_std": 0.675, "grad_norm": 0.023199956864118576, "kl": 1.087385755777359, "learning_rate": 9.949087658282265e-06, "loss": 0.00038635083474218844, "num_tokens": 5665353.0, "reward": 1.2418749570846557, "reward_std": 0.12957732044160367, "rewards/env_game_reward/mean": 1.2418749570846557, "rewards/env_game_reward/std": 0.24777153581380845, "sampling/importance_sampling_ratio/max": 1.760440754890442, "sampling/importance_sampling_ratio/mean": 1.0248130679130554, "sampling/importance_sampling_ratio/min": 0.5629997849464417, "sampling/sampling_logp_difference/max": 0.7708295106887817, "sampling/sampling_logp_difference/mean": 0.016029300354421138, "step": 380, "step_time": 3.315412589598418 }, { "clip_ratio/high_max": 0.002857142873108387, "clip_ratio/high_mean": 0.0014285714365541934, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014285714365541934, "completions/clipped_ratio": 0.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 839.425, "completions/mean_terminated_length": 839.425, "completions/min_length": 611.2, "completions/min_terminated_length": 611.2, "entropy": 0.08930239342153072, "epoch": 0.0308, "frac_reward_zero_std": 0.4, "grad_norm": 0.07401212304830551, "kl": 1.702320432662964, "learning_rate": 9.949042207361537e-06, "loss": -0.0006852170452475547, "num_tokens": 5776827.0, "reward": 1.1849999666213988, "reward_std": 0.20470741838216783, "rewards/env_game_reward/mean": 1.1849999666213988, "rewards/env_game_reward/std": 0.30558564364910124, "sampling/importance_sampling_ratio/max": 1.5303932189941407, "sampling/importance_sampling_ratio/mean": 0.9755434513092041, "sampling/importance_sampling_ratio/min": 0.5129167795181274, "sampling/sampling_logp_difference/max": 0.8233615517616272, "sampling/sampling_logp_difference/mean": 0.02174628246575594, "step": 385, "step_time": 3.589076187599858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0029910714365541935, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029910714365541935, "completions/clipped_ratio": 0.0, "completions/max_length": 1050.8, "completions/max_terminated_length": 1050.8, "completions/mean_length": 794.475, "completions/mean_terminated_length": 794.475, "completions/min_length": 590.0, "completions/min_terminated_length": 590.0, "entropy": 0.09299433380365371, "epoch": 0.0312, "frac_reward_zero_std": 0.5, "grad_norm": 0.042726535350084305, "kl": 0.9145794808864594, "learning_rate": 9.948996100771952e-06, "loss": 0.0006041613407433033, "num_tokens": 5884075.0, "reward": 1.1981249570846557, "reward_std": 0.18649942204356193, "rewards/env_game_reward/mean": 1.1981249570846557, "rewards/env_game_reward/std": 0.30469191670417783, "sampling/importance_sampling_ratio/max": 1.5661879777908325, "sampling/importance_sampling_ratio/mean": 1.0027704000473023, "sampling/importance_sampling_ratio/min": 0.5443926572799682, "sampling/sampling_logp_difference/max": 0.6991547107696533, "sampling/sampling_logp_difference/mean": 0.02191640790551901, "step": 390, "step_time": 3.5801626390006276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.002985739801079035, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002985739801079035, "completions/clipped_ratio": 0.0, "completions/max_length": 1047.8, "completions/max_terminated_length": 1047.8, "completions/mean_length": 831.9625, "completions/mean_terminated_length": 831.9625, "completions/min_length": 632.0, "completions/min_terminated_length": 632.0, "entropy": 0.10950318947434426, "epoch": 0.0316, "frac_reward_zero_std": 0.525, "grad_norm": 0.024459250271320343, "kl": 1.3264178693294526, "learning_rate": 9.948949338521616e-06, "loss": -0.00011393972672522069, "num_tokens": 5994509.0, "reward": 1.2248749732971191, "reward_std": 0.16104357689619064, "rewards/env_game_reward/mean": 1.2248749732971191, "rewards/env_game_reward/std": 0.2880355179309845, "sampling/importance_sampling_ratio/max": 1.3253472089767455, "sampling/importance_sampling_ratio/mean": 1.016199254989624, "sampling/importance_sampling_ratio/min": 0.6328357100486756, "sampling/sampling_logp_difference/max": 0.5666046380996704, "sampling/sampling_logp_difference/mean": 0.016961843892931937, "step": 395, "step_time": 3.593138352200185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0027046783827245234, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027046783827245234, "completions/clipped_ratio": 0.0, "completions/max_length": 1048.0, "completions/max_terminated_length": 1048.0, "completions/mean_length": 865.2375, "completions/mean_terminated_length": 865.2375, "completions/min_length": 651.6, "completions/min_terminated_length": 651.6, "entropy": 0.12832557484507562, "epoch": 0.032, "frac_reward_zero_std": 0.45, "grad_norm": 0.0840858444571495, "kl": 1.16541907787323, "learning_rate": 9.948901920618752e-06, "loss": -0.0007970237173140049, "num_tokens": 6108518.0, "reward": 1.171999979019165, "reward_std": 0.2605688437819481, "rewards/env_game_reward/mean": 1.171999979019165, "rewards/env_game_reward/std": 0.36585823595523836, "sampling/importance_sampling_ratio/max": 1.6069575786590575, "sampling/importance_sampling_ratio/mean": 0.9940932035446167, "sampling/importance_sampling_ratio/min": 0.49497389793395996, "sampling/sampling_logp_difference/max": 0.7419742822647095, "sampling/sampling_logp_difference/mean": 0.023178784735500814, "step": 400, "step_time": 3.5148092628012817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.002878289483487606, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002878289483487606, "completions/clipped_ratio": 0.0, "completions/max_length": 1044.4, "completions/max_terminated_length": 1044.4, "completions/mean_length": 828.5875, "completions/mean_terminated_length": 828.5875, "completions/min_length": 610.4, "completions/min_terminated_length": 610.4, "entropy": 0.11367225870490075, "epoch": 0.0324, "frac_reward_zero_std": 0.375, "grad_norm": 0.07428044080734253, "kl": 1.0149361670017243, "learning_rate": 9.948853847071691e-06, "loss": 0.00038471841253340244, "num_tokens": 6218492.0, "reward": 1.1938749551773071, "reward_std": 0.25438166558742525, "rewards/env_game_reward/mean": 1.1938749551773071, "rewards/env_game_reward/std": 0.321139919757843, "sampling/importance_sampling_ratio/max": 1.361912488937378, "sampling/importance_sampling_ratio/mean": 1.0066336393356323, "sampling/importance_sampling_ratio/min": 0.7280618786811829, "sampling/sampling_logp_difference/max": 0.47970104217529297, "sampling/sampling_logp_difference/mean": 0.013811025116592646, "step": 405, "step_time": 3.7229729885992127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0029910714365541935, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029910714365541935, "completions/clipped_ratio": 0.0, "completions/max_length": 1047.4, "completions/max_terminated_length": 1047.4, "completions/mean_length": 831.125, "completions/mean_terminated_length": 831.125, "completions/min_length": 590.0, "completions/min_terminated_length": 590.0, "entropy": 0.0979391686618328, "epoch": 0.0328, "frac_reward_zero_std": 0.45, "grad_norm": 0.057129692286252975, "kl": 1.3072730243206023, "learning_rate": 9.948805117888886e-06, "loss": 4.2584206676110624e-05, "num_tokens": 6328847.0, "reward": 1.194124960899353, "reward_std": 0.21690500825643538, "rewards/env_game_reward/mean": 1.194124960899353, "rewards/env_game_reward/std": 0.3049228638410568, "sampling/importance_sampling_ratio/max": 1.4106521368026734, "sampling/importance_sampling_ratio/mean": 0.9964896202087402, "sampling/importance_sampling_ratio/min": 0.6699582457542419, "sampling/sampling_logp_difference/max": 0.5707800269126893, "sampling/sampling_logp_difference/mean": 0.016702639311552046, "step": 410, "step_time": 3.5735276011997485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.002857142873108387, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002857142873108387, "completions/clipped_ratio": 0.0, "completions/max_length": 1072.6, "completions/max_terminated_length": 1072.6, "completions/mean_length": 851.675, "completions/mean_terminated_length": 851.675, "completions/min_length": 632.0, "completions/min_terminated_length": 632.0, "entropy": 0.09468646496534347, "epoch": 0.0332, "frac_reward_zero_std": 0.5, "grad_norm": 0.03625418618321419, "kl": 0.9998983383178711, "learning_rate": 9.948755733078905e-06, "loss": 0.0005254603922367096, "num_tokens": 6441493.0, "reward": 1.203125, "reward_std": 0.21655145585536956, "rewards/env_game_reward/mean": 1.203125, "rewards/env_game_reward/std": 0.3023660510778427, "sampling/importance_sampling_ratio/max": 1.5803467512130738, "sampling/importance_sampling_ratio/mean": 1.0241896748542785, "sampling/importance_sampling_ratio/min": 0.5122612774372101, "sampling/sampling_logp_difference/max": 0.7667708158493042, "sampling/sampling_logp_difference/mean": 0.016751198470592497, "step": 415, "step_time": 3.535725635799463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.002129505993798375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002129505993798375, "completions/clipped_ratio": 0.0, "completions/max_length": 1072.6, "completions/max_terminated_length": 1072.6, "completions/mean_length": 812.1, "completions/mean_terminated_length": 812.1, "completions/min_length": 590.0, "completions/min_terminated_length": 590.0, "entropy": 0.16425883509218692, "epoch": 0.0336, "frac_reward_zero_std": 0.5, "grad_norm": 0.04025555029511452, "kl": 1.0152770042419434, "learning_rate": 9.948705692650427e-06, "loss": 0.0013656719587743283, "num_tokens": 6549145.0, "reward": 1.2017499923706054, "reward_std": 0.18137289136648177, "rewards/env_game_reward/mean": 1.2017499923706054, "rewards/env_game_reward/std": 0.2834285110235214, "sampling/importance_sampling_ratio/max": 1.2725768566131592, "sampling/importance_sampling_ratio/mean": 0.9965269207954407, "sampling/importance_sampling_ratio/min": 0.5284772694110871, "sampling/sampling_logp_difference/max": 0.6862696170806885, "sampling/sampling_logp_difference/mean": 0.01916008032858372, "step": 420, "step_time": 3.653649864799809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1046.2, "completions/max_terminated_length": 1046.2, "completions/mean_length": 848.475, "completions/mean_terminated_length": 848.475, "completions/min_length": 617.6, "completions/min_terminated_length": 617.6, "entropy": 0.10671765431761741, "epoch": 0.034, "frac_reward_zero_std": 0.375, "grad_norm": 0.0357169471681118, "kl": 1.2555115342140197, "learning_rate": 9.948654996612248e-06, "loss": -0.0010916889645159245, "num_tokens": 6661416.0, "reward": 1.1152499675750733, "reward_std": 0.241830512881279, "rewards/env_game_reward/mean": 1.1152499675750733, "rewards/env_game_reward/std": 0.3348828315734863, "sampling/importance_sampling_ratio/max": 1.2912365436553954, "sampling/importance_sampling_ratio/mean": 0.9513557910919189, "sampling/importance_sampling_ratio/min": 0.4529180288314819, "sampling/sampling_logp_difference/max": 0.838625431060791, "sampling/sampling_logp_difference/mean": 0.02186411377042532, "step": 425, "step_time": 3.7595838884000843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0014285714365541934, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014285714365541934, "completions/clipped_ratio": 0.0, "completions/max_length": 1047.2, "completions/max_terminated_length": 1047.2, "completions/mean_length": 808.375, "completions/mean_terminated_length": 808.375, "completions/min_length": 631.4, "completions/min_terminated_length": 631.4, "entropy": 0.0735358338803053, "epoch": 0.0344, "frac_reward_zero_std": 0.475, "grad_norm": 0.024515505880117416, "kl": 1.176445186138153, "learning_rate": 9.948603644973282e-06, "loss": 0.00011715407017618418, "num_tokens": 6769624.0, "reward": 1.2248749732971191, "reward_std": 0.2105410486459732, "rewards/env_game_reward/mean": 1.2248749732971191, "rewards/env_game_reward/std": 0.2968492805957794, "sampling/importance_sampling_ratio/max": 1.4149769067764282, "sampling/importance_sampling_ratio/mean": 1.0460476994514465, "sampling/importance_sampling_ratio/min": 0.855708372592926, "sampling/sampling_logp_difference/max": 0.2869324326515198, "sampling/sampling_logp_difference/mean": 0.010842377878725528, "step": 430, "step_time": 3.4251211591988975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 996.8, "completions/max_terminated_length": 996.8, "completions/mean_length": 800.6125, "completions/mean_terminated_length": 800.6125, "completions/min_length": 578.0, "completions/min_terminated_length": 578.0, "entropy": 0.05469609908759594, "epoch": 0.0348, "frac_reward_zero_std": 0.375, "grad_norm": 0.02258330211043358, "kl": 1.0658069968223571, "learning_rate": 9.948551637742554e-06, "loss": 0.00048186536878347397, "num_tokens": 6876746.0, "reward": 1.2204999208450318, "reward_std": 0.19197950065135955, "rewards/env_game_reward/mean": 1.2204999208450318, "rewards/env_game_reward/std": 0.2575215369462967, "sampling/importance_sampling_ratio/max": 1.7662209749221802, "sampling/importance_sampling_ratio/mean": 1.0243075132369994, "sampling/importance_sampling_ratio/min": 0.6089048445224762, "sampling/sampling_logp_difference/max": 0.8979941129684448, "sampling/sampling_logp_difference/mean": 0.016185545828193426, "step": 435, "step_time": 3.3869100240008265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015625, "completions/clipped_ratio": 0.0, "completions/max_length": 1071.6, "completions/max_terminated_length": 1071.6, "completions/mean_length": 838.85, "completions/mean_terminated_length": 838.85, "completions/min_length": 590.0, "completions/min_terminated_length": 590.0, "entropy": 0.04340629056096077, "epoch": 0.0352, "frac_reward_zero_std": 0.675, "grad_norm": 0.025899723172187805, "kl": 1.2329406023025513, "learning_rate": 9.948498974929206e-06, "loss": 0.0002917660400271416, "num_tokens": 6987662.0, "reward": 1.2731249809265137, "reward_std": 0.12993087247014046, "rewards/env_game_reward/mean": 1.2731249809265137, "rewards/env_game_reward/std": 0.2441350817680359, "sampling/importance_sampling_ratio/max": 1.1621084928512573, "sampling/importance_sampling_ratio/mean": 0.9838597655296326, "sampling/importance_sampling_ratio/min": 0.6955267369747162, "sampling/sampling_logp_difference/max": 0.48128714561462405, "sampling/sampling_logp_difference/mean": 0.007422756869345903, "step": 440, "step_time": 3.551285775598808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0015151515603065492, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015151515603065492, "completions/clipped_ratio": 0.0, "completions/max_length": 1074.0, "completions/max_terminated_length": 1074.0, "completions/mean_length": 835.5875, "completions/mean_terminated_length": 835.5875, "completions/min_length": 610.4, "completions/min_terminated_length": 610.4, "entropy": 0.04846553523093462, "epoch": 0.0356, "frac_reward_zero_std": 0.6, "grad_norm": 0.15698401629924774, "kl": 1.8966678202152252, "learning_rate": 9.948445656542496e-06, "loss": 0.0002397662028670311, "num_tokens": 7098696.0, "reward": 1.2731249809265137, "reward_std": 0.14230524450540544, "rewards/env_game_reward/mean": 1.2731249809265137, "rewards/env_game_reward/std": 0.2304918497800827, "sampling/importance_sampling_ratio/max": 1.4296421527862548, "sampling/importance_sampling_ratio/mean": 1.0030879974365234, "sampling/importance_sampling_ratio/min": 0.5428477764129639, "sampling/sampling_logp_difference/max": 0.6600507885217667, "sampling/sampling_logp_difference/mean": 0.011323550157248974, "step": 445, "step_time": 3.5293831905983097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0027046783827245234, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027046783827245234, "completions/clipped_ratio": 0.0, "completions/max_length": 1074.0, "completions/max_terminated_length": 1074.0, "completions/mean_length": 813.675, "completions/mean_terminated_length": 813.675, "completions/min_length": 618.8, "completions/min_terminated_length": 618.8, "entropy": 0.07172373905777932, "epoch": 0.036, "frac_reward_zero_std": 0.475, "grad_norm": 0.030395962297916412, "kl": 1.2917304337024689, "learning_rate": 9.948391682591799e-06, "loss": 3.6328425630927085e-05, "num_tokens": 7207378.0, "reward": 1.1674999475479126, "reward_std": 0.2174353301525116, "rewards/env_game_reward/mean": 1.1674999475479126, "rewards/env_game_reward/std": 0.32538898289203644, "sampling/importance_sampling_ratio/max": 1.4234293460845948, "sampling/importance_sampling_ratio/mean": 0.9796630859375, "sampling/importance_sampling_ratio/min": 0.616313761472702, "sampling/sampling_logp_difference/max": 0.6073259890079499, "sampling/sampling_logp_difference/mean": 0.014473339542746544, "step": 450, "step_time": 3.6695362117985497 }, { "epoch": 0.036, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1031.6666666666667, "eval_completions/max_terminated_length": 1031.6666666666667, "eval_completions/mean_length": 903.7083333333334, "eval_completions/mean_terminated_length": 903.7083333333334, "eval_completions/min_length": 700.0, "eval_completions/min_terminated_length": 700.0, "eval_entropy": 0.09484085440635681, "eval_frac_reward_zero_std": 0.3333333333333333, "eval_kl": 2.29697722196579, "eval_loss": -0.0005131486104801297, "eval_num_tokens": 7207378.0, "eval_reward": 1.1229166587193806, "eval_reward_std": 0.1856155296166738, "eval_rewards/env_game_reward/mean": 1.1229166587193806, "eval_rewards/env_game_reward/std": 0.2910057157278061, "eval_runtime": 3.7272, "eval_samples_per_second": 2.683, "eval_sampling/importance_sampling_ratio/max": 1.1381525993347168, "eval_sampling/importance_sampling_ratio/mean": 0.9641783038775126, "eval_sampling/importance_sampling_ratio/min": 0.7627586523691813, "eval_sampling/sampling_logp_difference/max": 0.25132163365681964, "eval_sampling/sampling_logp_difference/mean": 0.012963244070609411, "eval_steps_per_second": 0.537, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1047.2, "completions/max_terminated_length": 1047.2, "completions/mean_length": 834.8, "completions/mean_terminated_length": 834.8, "completions/min_length": 655.4, "completions/min_terminated_length": 655.4, "entropy": 0.07156877405941486, "epoch": 0.0364, "frac_reward_zero_std": 0.65, "grad_norm": 0.02007192373275757, "kl": 1.030625057220459, "learning_rate": 9.9483370530866e-06, "loss": -0.00047754105180501937, "num_tokens": 7318135.0, "reward": 1.259999966621399, "reward_std": 0.12374369204044341, "rewards/env_game_reward/mean": 1.259999966621399, "rewards/env_game_reward/std": 0.2538477838039398, "sampling/importance_sampling_ratio/max": 1.0907421350479125, "sampling/importance_sampling_ratio/mean": 0.9758262634277344, "sampling/importance_sampling_ratio/min": 0.6034112870693207, "sampling/sampling_logp_difference/max": 0.446870756149292, "sampling/sampling_logp_difference/mean": 0.008248830866068601, "step": 455, "step_time": 3.3988218826001684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0012820512987673283, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012820512987673283, "completions/clipped_ratio": 0.0, "completions/max_length": 1047.8, "completions/max_terminated_length": 1047.8, "completions/mean_length": 826.6875, "completions/mean_terminated_length": 826.6875, "completions/min_length": 590.2, "completions/min_terminated_length": 590.2, "entropy": 0.08540964387357235, "epoch": 0.0368, "frac_reward_zero_std": 0.55, "grad_norm": 0.06558004766702652, "kl": 0.8614380896091461, "learning_rate": 9.948281768036502e-06, "loss": 8.575599640607834e-05, "num_tokens": 7427814.0, "reward": 1.2334999561309814, "reward_std": 0.1983434498310089, "rewards/env_game_reward/mean": 1.2334999561309814, "rewards/env_game_reward/std": 0.29357832968235015, "sampling/importance_sampling_ratio/max": 1.2517836332321166, "sampling/importance_sampling_ratio/mean": 0.9869768977165222, "sampling/importance_sampling_ratio/min": 0.591094845533371, "sampling/sampling_logp_difference/max": 0.5167952656745911, "sampling/sampling_logp_difference/mean": 0.011756654176861048, "step": 460, "step_time": 3.48071748059956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.002857142873108387, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002857142873108387, "completions/clipped_ratio": 0.0, "completions/max_length": 1071.2, "completions/max_terminated_length": 1071.2, "completions/mean_length": 860.525, "completions/mean_terminated_length": 860.525, "completions/min_length": 611.6, "completions/min_terminated_length": 611.6, "entropy": 0.0760620430111885, "epoch": 0.0372, "frac_reward_zero_std": 0.55, "grad_norm": 0.03671133518218994, "kl": 1.1999324023723603, "learning_rate": 9.948225827451225e-06, "loss": 0.0007083296775817871, "num_tokens": 7540850.0, "reward": 1.2249999523162842, "reward_std": 0.19798989295959474, "rewards/env_game_reward/mean": 1.2249999523162842, "rewards/env_game_reward/std": 0.2860778242349625, "sampling/importance_sampling_ratio/max": 1.3862907409667968, "sampling/importance_sampling_ratio/mean": 1.022992479801178, "sampling/importance_sampling_ratio/min": 0.6777822971343994, "sampling/sampling_logp_difference/max": 0.46881517171859743, "sampling/sampling_logp_difference/mean": 0.009866528119891882, "step": 465, "step_time": 3.5962128964005386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015625, "completions/clipped_ratio": 0.0, "completions/max_length": 1073.0, "completions/max_terminated_length": 1073.0, "completions/mean_length": 826.9875, "completions/mean_terminated_length": 826.9875, "completions/min_length": 632.0, "completions/min_terminated_length": 632.0, "entropy": 0.06233834847807884, "epoch": 0.0376, "frac_reward_zero_std": 0.575, "grad_norm": 0.034234557300806046, "kl": 0.7558440536260604, "learning_rate": 9.948169231340603e-06, "loss": 5.739857442677021e-06, "num_tokens": 7651407.0, "reward": 1.246874952316284, "reward_std": 0.19180271327495574, "rewards/env_game_reward/mean": 1.246874952316284, "rewards/env_game_reward/std": 0.30952975153923035, "sampling/importance_sampling_ratio/max": 1.133919596672058, "sampling/importance_sampling_ratio/mean": 0.9887906551361084, "sampling/importance_sampling_ratio/min": 0.6729271888732911, "sampling/sampling_logp_difference/max": 0.4064432859420776, "sampling/sampling_logp_difference/mean": 0.006795862503349781, "step": 470, "step_time": 3.6208750205994873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1049.2, "completions/max_terminated_length": 1049.2, "completions/mean_length": 803.9625, "completions/mean_terminated_length": 803.9625, "completions/min_length": 611.0, "completions/min_terminated_length": 611.0, "entropy": 0.04374380446970463, "epoch": 0.038, "frac_reward_zero_std": 0.625, "grad_norm": 0.03510225564241409, "kl": 0.8806528657674789, "learning_rate": 9.948111979714584e-06, "loss": -3.6415911745280026e-05, "num_tokens": 7758750.0, "reward": 1.2643749475479127, "reward_std": 0.12993087470531464, "rewards/env_game_reward/mean": 1.2643749475479127, "rewards/env_game_reward/std": 0.2548509627580643, "sampling/importance_sampling_ratio/max": 1.0943125486373901, "sampling/importance_sampling_ratio/mean": 0.9773291110992431, "sampling/importance_sampling_ratio/min": 0.6454042971134186, "sampling/sampling_logp_difference/max": 0.5180009245872498, "sampling/sampling_logp_difference/mean": 0.00764626101590693, "step": 475, "step_time": 3.543976286398538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0001736111124046147, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001736111124046147, "completions/clipped_ratio": 0.0, "completions/max_length": 1023.4, "completions/max_terminated_length": 1023.4, "completions/mean_length": 789.3625, "completions/mean_terminated_length": 789.3625, "completions/min_length": 597.0, "completions/min_terminated_length": 597.0, "entropy": 0.07270487509667874, "epoch": 0.0384, "frac_reward_zero_std": 0.375, "grad_norm": 0.02017202600836754, "kl": 1.002439457178116, "learning_rate": 9.948054072583232e-06, "loss": 0.0005217269062995911, "num_tokens": 7864541.0, "reward": 1.2366249322891236, "reward_std": 0.1935704916715622, "rewards/env_game_reward/mean": 1.2366249322891236, "rewards/env_game_reward/std": 0.23191870152950286, "sampling/importance_sampling_ratio/max": 1.3867063045501709, "sampling/importance_sampling_ratio/mean": 1.0338881731033325, "sampling/importance_sampling_ratio/min": 0.6520328521747821, "sampling/sampling_logp_difference/max": 4.570292866230011, "sampling/sampling_logp_difference/mean": 0.015752241667360067, "step": 480, "step_time": 3.5510043223992396 }, { "clip_ratio/high_max": 0.003125, "clip_ratio/high_mean": 0.0015625, "clip_ratio/low_mean": 0.0014705882407724858, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003033088240772486, "completions/clipped_ratio": 0.0, "completions/max_length": 1023.8, "completions/max_terminated_length": 1023.8, "completions/mean_length": 805.8, "completions/mean_terminated_length": 805.8, "completions/min_length": 610.2, "completions/min_terminated_length": 610.2, "entropy": 0.053855656459927556, "epoch": 0.0388, "frac_reward_zero_std": 0.625, "grad_norm": 0.06707289814949036, "kl": 0.8074522137641906, "learning_rate": 9.947995509956728e-06, "loss": 0.0006144857034087181, "num_tokens": 7972352.0, "reward": 1.2990000009536744, "reward_std": 0.09333809614181518, "rewards/env_game_reward/mean": 1.2990000009536744, "rewards/env_game_reward/std": 0.21436608731746673, "sampling/importance_sampling_ratio/max": 1.468837523460388, "sampling/importance_sampling_ratio/mean": 1.0262328863143921, "sampling/importance_sampling_ratio/min": 0.7864007472991943, "sampling/sampling_logp_difference/max": 0.3819884166121483, "sampling/sampling_logp_difference/mean": 0.006758352974429727, "step": 485, "step_time": 3.4237150788008877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0027863777242600916, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027863777242600916, "completions/clipped_ratio": 0.0, "completions/max_length": 1048.2, "completions/max_terminated_length": 1048.2, "completions/mean_length": 811.15, "completions/mean_terminated_length": 811.15, "completions/min_length": 590.0, "completions/min_terminated_length": 590.0, "entropy": 0.07090941481292248, "epoch": 0.0392, "frac_reward_zero_std": 0.5, "grad_norm": 0.028884613886475563, "kl": 0.9331954538822174, "learning_rate": 9.947936291845363e-06, "loss": -0.00021723993122577668, "num_tokens": 8080617.0, "reward": 1.2640000104904174, "reward_std": 0.1552099421620369, "rewards/env_game_reward/mean": 1.2640000104904174, "rewards/env_game_reward/std": 0.22860259115695952, "sampling/importance_sampling_ratio/max": 1.3268755435943604, "sampling/importance_sampling_ratio/mean": 0.9921948909759521, "sampling/importance_sampling_ratio/min": 0.7053830564022064, "sampling/sampling_logp_difference/max": 0.5805891335010529, "sampling/sampling_logp_difference/mean": 0.011148639302700759, "step": 490, "step_time": 3.587002480799856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0014285714365541934, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014285714365541934, "completions/clipped_ratio": 0.0, "completions/max_length": 1049.0, "completions/max_terminated_length": 1049.0, "completions/mean_length": 826.6375, "completions/mean_terminated_length": 826.6375, "completions/min_length": 611.0, "completions/min_terminated_length": 611.0, "entropy": 0.07093169055879116, "epoch": 0.0396, "frac_reward_zero_std": 0.45, "grad_norm": 0.038124196231365204, "kl": 0.8121790528297425, "learning_rate": 9.94787641825955e-06, "loss": 0.00019427400548011065, "num_tokens": 8190006.0, "reward": 1.2337499618530274, "reward_std": 0.21036426872015, "rewards/env_game_reward/mean": 1.2337499618530274, "rewards/env_game_reward/std": 0.29140671491622927, "sampling/importance_sampling_ratio/max": 1.1874067783355713, "sampling/importance_sampling_ratio/mean": 0.9822077751159668, "sampling/importance_sampling_ratio/min": 0.6450884461402893, "sampling/sampling_logp_difference/max": 0.5654253602027893, "sampling/sampling_logp_difference/mean": 0.009776360169053078, "step": 495, "step_time": 3.513944055599859 }, { "clip_ratio/high_max": 0.003125, "clip_ratio/high_mean": 0.0015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015625, "completions/clipped_ratio": 0.0, "completions/max_length": 950.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 796.3625, "completions/mean_terminated_length": 796.3625, "completions/min_length": 630.6, "completions/min_terminated_length": 630.6, "entropy": 0.06923086419701577, "epoch": 0.04, "frac_reward_zero_std": 0.575, "grad_norm": 0.04387466236948967, "kl": 0.8395125925540924, "learning_rate": 9.947815889209812e-06, "loss": 0.00012775636278092862, "num_tokens": 8295385.0, "reward": 1.2642499923706054, "reward_std": 0.12975409924983977, "rewards/env_game_reward/mean": 1.2642499923706054, "rewards/env_game_reward/std": 0.23409587144851685, "sampling/importance_sampling_ratio/max": 1.3154429912567138, "sampling/importance_sampling_ratio/mean": 1.000835919380188, "sampling/importance_sampling_ratio/min": 0.6876768827438354, "sampling/sampling_logp_difference/max": 0.45278497934341433, "sampling/sampling_logp_difference/mean": 0.007802166696637869, "step": 500, "step_time": 3.2585985789992264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1178.4, "completions/max_terminated_length": 1178.4, "completions/mean_length": 906.1625, "completions/mean_terminated_length": 906.1625, "completions/min_length": 686.0, "completions/min_terminated_length": 686.0, "entropy": 0.07725062370300292, "epoch": 0.0404, "frac_reward_zero_std": 0.475, "grad_norm": 0.08721896260976791, "kl": 0.8332188785076141, "learning_rate": 9.947754704706791e-06, "loss": -0.0001865808852016926, "num_tokens": 8411094.0, "reward": 1.1867499947547913, "reward_std": 0.21460690796375276, "rewards/env_game_reward/mean": 1.1867499947547913, "rewards/env_game_reward/std": 0.3215657353401184, "sampling/importance_sampling_ratio/max": 1.3392818450927735, "sampling/importance_sampling_ratio/mean": 1.0001285552978516, "sampling/importance_sampling_ratio/min": 0.7314820289611816, "sampling/sampling_logp_difference/max": 0.4349968358874321, "sampling/sampling_logp_difference/mean": 0.010020055808126926, "step": 505, "step_time": 3.8816107694001403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0011363636702299118, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011363636702299118, "completions/clipped_ratio": 0.0, "completions/max_length": 1330.0, "completions/max_terminated_length": 1330.0, "completions/mean_length": 1020.9625, "completions/mean_terminated_length": 1020.9625, "completions/min_length": 758.6, "completions/min_terminated_length": 758.6, "entropy": 0.060335731133818626, "epoch": 0.0408, "frac_reward_zero_std": 0.5, "grad_norm": 0.02782941795885563, "kl": 0.7987826824188232, "learning_rate": 9.947692864761243e-06, "loss": -0.00011843224056065082, "num_tokens": 8535366.0, "reward": 1.3006249904632567, "reward_std": 0.17589280307292937, "rewards/env_game_reward/mean": 1.3006249904632567, "rewards/env_game_reward/std": 0.27952331900596616, "sampling/importance_sampling_ratio/max": 1.2727408409118652, "sampling/importance_sampling_ratio/mean": 1.0021348953247071, "sampling/importance_sampling_ratio/min": 0.6616379976272583, "sampling/sampling_logp_difference/max": 0.705156409740448, "sampling/sampling_logp_difference/mean": 0.010359429102391005, "step": 510, "step_time": 4.54856540679757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0011363636702299118, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011363636702299118, "completions/clipped_ratio": 0.0, "completions/max_length": 1332.6, "completions/max_terminated_length": 1332.6, "completions/mean_length": 1048.3875, "completions/mean_terminated_length": 1048.3875, "completions/min_length": 760.6, "completions/min_terminated_length": 760.6, "entropy": 0.08999732621014118, "epoch": 0.0412, "frac_reward_zero_std": 0.325, "grad_norm": 0.05762872472405434, "kl": 0.8527314305305481, "learning_rate": 9.947630369384036e-06, "loss": 0.0009840115904808044, "num_tokens": 8662752.0, "reward": 1.2363749980926513, "reward_std": 0.20240932703018188, "rewards/env_game_reward/mean": 1.2363749980926513, "rewards/env_game_reward/std": 0.2678327292203903, "sampling/importance_sampling_ratio/max": 1.7645922422409057, "sampling/importance_sampling_ratio/mean": 1.0444591283798217, "sampling/importance_sampling_ratio/min": 0.6512416243553162, "sampling/sampling_logp_difference/max": 0.5892035603523255, "sampling/sampling_logp_difference/mean": 0.01366331558674574, "step": 515, "step_time": 4.4831126509998285 }, { "clip_ratio/high_max": 0.0024390242993831634, "clip_ratio/high_mean": 0.0012195121496915817, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012195121496915817, "completions/clipped_ratio": 0.0, "completions/max_length": 1366.8, "completions/max_terminated_length": 1366.8, "completions/mean_length": 1012.5875, "completions/mean_terminated_length": 1012.5875, "completions/min_length": 748.8, "completions/min_terminated_length": 748.8, "entropy": 0.06842279564589263, "epoch": 0.0416, "frac_reward_zero_std": 0.35, "grad_norm": 0.051058683544397354, "kl": 0.956644493341446, "learning_rate": 9.947567218586157e-06, "loss": 0.0002877896185964346, "num_tokens": 8786459.0, "reward": 1.2256250619888305, "reward_std": 0.20735905468463897, "rewards/env_game_reward/mean": 1.2256250619888305, "rewards/env_game_reward/std": 0.2896923661231995, "sampling/importance_sampling_ratio/max": 1.444146990776062, "sampling/importance_sampling_ratio/mean": 0.9814066886901855, "sampling/importance_sampling_ratio/min": 0.6000928044319153, "sampling/sampling_logp_difference/max": 0.5186550855636597, "sampling/sampling_logp_difference/mean": 0.012273849081248045, "step": 520, "step_time": 4.551891402601177 }, { "clip_ratio/high_max": 0.0023255813866853714, "clip_ratio/high_mean": 0.0011627906933426857, "clip_ratio/low_mean": 0.0023532669059932233, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003516057599335909, "completions/clipped_ratio": 0.0, "completions/max_length": 1362.2, "completions/max_terminated_length": 1362.2, "completions/mean_length": 1080.0125, "completions/mean_terminated_length": 1080.0125, "completions/min_length": 831.4, "completions/min_terminated_length": 831.4, "entropy": 0.054267326928675176, "epoch": 0.042, "frac_reward_zero_std": 0.25, "grad_norm": 0.016077129170298576, "kl": 0.9028543293476105, "learning_rate": 9.94750341237871e-06, "loss": -0.001446277927607298, "num_tokens": 8917463.0, "reward": 1.2988749742507935, "reward_std": 0.21867277324199677, "rewards/env_game_reward/mean": 1.2988749742507935, "rewards/env_game_reward/std": 0.2746602237224579, "sampling/importance_sampling_ratio/max": 1.3581011295318604, "sampling/importance_sampling_ratio/mean": 0.8896560549736023, "sampling/importance_sampling_ratio/min": 0.29013479351997373, "sampling/sampling_logp_difference/max": 0.9900324821472168, "sampling/sampling_logp_difference/mean": 0.024117074348032473, "step": 525, "step_time": 4.540563870201003 }, { "epoch": 0.042, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1346.6666666666667, "eval_completions/max_terminated_length": 1346.6666666666667, "eval_completions/mean_length": 1164.4583333333333, "eval_completions/mean_terminated_length": 1164.4583333333333, "eval_completions/min_length": 842.0, "eval_completions/min_terminated_length": 842.0, "eval_entropy": 0.05087510993083318, "eval_frac_reward_zero_std": 0.4166666666666667, "eval_kl": 0.9046386082967123, "eval_loss": -0.00028693454805761576, "eval_num_tokens": 8917463.0, "eval_reward": 1.2712500095367432, "eval_reward_std": 0.17736596086372933, "eval_rewards/env_game_reward/mean": 1.2712500095367432, "eval_rewards/env_game_reward/std": 0.3256373902161916, "eval_runtime": 4.9449, "eval_samples_per_second": 2.022, "eval_sampling/importance_sampling_ratio/max": 1.1742435693740845, "eval_sampling/importance_sampling_ratio/mean": 0.8475368817647299, "eval_sampling/importance_sampling_ratio/min": 0.048990381260712944, "eval_sampling/sampling_logp_difference/max": 1.8321036497751872, "eval_sampling/sampling_logp_difference/mean": 0.03830909232298533, "eval_steps_per_second": 0.404, "step": 525 }, { "clip_ratio/high_max": 0.0021739130839705466, "clip_ratio/high_mean": 0.0010869565419852733, "clip_ratio/low_mean": 0.00701402323320508, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008100979775190354, "completions/clipped_ratio": 0.0, "completions/max_length": 1396.8, "completions/max_terminated_length": 1396.8, "completions/mean_length": 1032.575, "completions/mean_terminated_length": 1032.575, "completions/min_length": 734.8, "completions/min_terminated_length": 734.8, "entropy": 0.04630528762936592, "epoch": 0.0424, "frac_reward_zero_std": 0.25, "grad_norm": 0.4360601305961609, "kl": 2.5725530982017517, "learning_rate": 9.947438950772907e-06, "loss": 0.00011557291727513075, "num_tokens": 9043383.0, "reward": 1.2701250076293946, "reward_std": 0.16139711886644365, "rewards/env_game_reward/mean": 1.2701250076293946, "rewards/env_game_reward/std": 0.26454554200172425, "sampling/importance_sampling_ratio/max": 1.2565020084381104, "sampling/importance_sampling_ratio/mean": 0.8415638923645019, "sampling/importance_sampling_ratio/min": 0.1712031990289688, "sampling/sampling_logp_difference/max": 1.4327976703643799, "sampling/sampling_logp_difference/mean": 0.032391348481178285, "step": 530, "step_time": 4.706439789199067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0011627906933426857, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011627906933426857, "completions/clipped_ratio": 0.0, "completions/max_length": 1335.2, "completions/max_terminated_length": 1335.2, "completions/mean_length": 1041.975, "completions/mean_terminated_length": 1041.975, "completions/min_length": 738.8, "completions/min_terminated_length": 738.8, "entropy": 0.11737375222146511, "epoch": 0.0428, "frac_reward_zero_std": 0.35, "grad_norm": 0.03724990040063858, "kl": 2.714300978183746, "learning_rate": 9.947373833780082e-06, "loss": 0.0008453769609332084, "num_tokens": 9170328.0, "reward": 1.2677500009536744, "reward_std": 0.16581654138863086, "rewards/env_game_reward/mean": 1.2677500009536744, "rewards/env_game_reward/std": 0.273353236913681, "sampling/importance_sampling_ratio/max": 1.6070437669754027, "sampling/importance_sampling_ratio/mean": 0.9828642964363098, "sampling/importance_sampling_ratio/min": 0.4266847729682922, "sampling/sampling_logp_difference/max": 1.137272548675537, "sampling/sampling_logp_difference/mean": 0.020196602493524552, "step": 535, "step_time": 4.443584994000412 }, { "clip_ratio/high_max": 0.0024390242993831634, "clip_ratio/high_mean": 0.0012195121496915817, "clip_ratio/low_mean": 0.0022543059661984445, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003473818115890026, "completions/clipped_ratio": 0.0, "completions/max_length": 1329.4, "completions/max_terminated_length": 1329.4, "completions/mean_length": 1017.975, "completions/mean_terminated_length": 1017.975, "completions/min_length": 751.2, "completions/min_terminated_length": 751.2, "entropy": 0.2812275648117065, "epoch": 0.0432, "frac_reward_zero_std": 0.1, "grad_norm": 0.07084682583808899, "kl": 0.9571394979953766, "learning_rate": 9.947308061411678e-06, "loss": 0.0005366875790059567, "num_tokens": 9294399.0, "reward": 0.9989999771118164, "reward_std": 0.4200214326381683, "rewards/env_game_reward/mean": 0.9989999771118164, "rewards/env_game_reward/std": 0.4420970261096954, "sampling/importance_sampling_ratio/max": 1.6333736658096314, "sampling/importance_sampling_ratio/mean": 1.0077808022499084, "sampling/importance_sampling_ratio/min": 0.6362493216991425, "sampling/sampling_logp_difference/max": 0.6116647005081177, "sampling/sampling_logp_difference/mean": 0.02449062168598175, "step": 540, "step_time": 4.512151982798969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0010638297535479069, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010638297535479069, "completions/clipped_ratio": 0.0, "completions/max_length": 1399.6, "completions/max_terminated_length": 1399.6, "completions/mean_length": 1103.425, "completions/mean_terminated_length": 1103.425, "completions/min_length": 832.0, "completions/min_terminated_length": 832.0, "entropy": 0.31689401865005495, "epoch": 0.0436, "frac_reward_zero_std": 0.175, "grad_norm": 0.05505409464240074, "kl": 1.0262145280838013, "learning_rate": 9.947241633679265e-06, "loss": 0.0009896579198539258, "num_tokens": 9427316.0, "reward": 1.0178749799728393, "reward_std": 0.3282743275165558, "rewards/env_game_reward/mean": 1.0178749799728393, "rewards/env_game_reward/std": 0.4406755268573761, "sampling/importance_sampling_ratio/max": 1.4536394834518434, "sampling/importance_sampling_ratio/mean": 1.0220404386520385, "sampling/importance_sampling_ratio/min": 0.7271527051925659, "sampling/sampling_logp_difference/max": 0.3957766056060791, "sampling/sampling_logp_difference/mean": 0.017512153089046478, "step": 545, "step_time": 4.728214942799968 }, { "clip_ratio/high_max": 0.002500000037252903, "clip_ratio/high_mean": 0.0012500000186264515, "clip_ratio/low_mean": 0.0010638297535479069, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002313829772174358, "completions/clipped_ratio": 0.0, "completions/max_length": 1364.8, "completions/max_terminated_length": 1364.8, "completions/mean_length": 1103.85, "completions/mean_terminated_length": 1103.85, "completions/min_length": 813.0, "completions/min_terminated_length": 813.0, "entropy": 0.2996835097670555, "epoch": 0.044, "frac_reward_zero_std": 0.1, "grad_norm": 0.05984482169151306, "kl": 1.0265156745910644, "learning_rate": 9.947174550594512e-06, "loss": -0.0004418407566845417, "num_tokens": 9559899.0, "reward": 1.1202500104904174, "reward_std": 0.35602826476097105, "rewards/env_game_reward/mean": 1.1202500104904174, "rewards/env_game_reward/std": 0.3752091586589813, "sampling/importance_sampling_ratio/max": 1.2323621749877929, "sampling/importance_sampling_ratio/mean": 0.9733627676963806, "sampling/importance_sampling_ratio/min": 0.5377955138683319, "sampling/sampling_logp_difference/max": 0.519776713848114, "sampling/sampling_logp_difference/mean": 0.019513449072837828, "step": 550, "step_time": 4.6486869933993145 }, { "clip_ratio/high_max": 0.0023255813866853714, "clip_ratio/high_mean": 0.0011627906933426857, "clip_ratio/low_mean": 0.002273901831358671, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034366926178336144, "completions/clipped_ratio": 0.0, "completions/max_length": 1365.4, "completions/max_terminated_length": 1365.4, "completions/mean_length": 1047.0375, "completions/mean_terminated_length": 1047.0375, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "entropy": 0.22820044606924056, "epoch": 0.0444, "frac_reward_zero_std": 0.25, "grad_norm": 0.13629508018493652, "kl": 1.2704419553279878, "learning_rate": 9.947106812169217e-06, "loss": 0.0005231309216469527, "num_tokens": 9687189.0, "reward": 1.0570000410079956, "reward_std": 0.2750645339488983, "rewards/env_game_reward/mean": 1.0570000410079956, "rewards/env_game_reward/std": 0.41252206563949584, "sampling/importance_sampling_ratio/max": 1.3011247158050536, "sampling/importance_sampling_ratio/mean": 0.997124445438385, "sampling/importance_sampling_ratio/min": 0.6755926370620727, "sampling/sampling_logp_difference/max": 0.3411633610725403, "sampling/sampling_logp_difference/mean": 0.016007083281874657, "step": 555, "step_time": 4.4430775420005375 }, { "clip_ratio/high_max": 0.004764605686068535, "clip_ratio/high_mean": 0.0023823028430342676, "clip_ratio/low_mean": 0.0022474748082458975, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004629777651280165, "completions/clipped_ratio": 0.0, "completions/max_length": 1295.6, "completions/max_terminated_length": 1295.6, "completions/mean_length": 1009.375, "completions/mean_terminated_length": 1009.375, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "entropy": 0.189104101061821, "epoch": 0.0448, "frac_reward_zero_std": 0.25, "grad_norm": 0.03939010575413704, "kl": 1.5311454951763153, "learning_rate": 9.947038418415283e-06, "loss": 0.0001573776826262474, "num_tokens": 9811161.0, "reward": 1.1691249847412108, "reward_std": 0.21195527017116547, "rewards/env_game_reward/mean": 1.1691249847412108, "rewards/env_game_reward/std": 0.30571516156196593, "sampling/importance_sampling_ratio/max": 1.4245776176452636, "sampling/importance_sampling_ratio/mean": 0.9870476484298706, "sampling/importance_sampling_ratio/min": 0.5857455730438232, "sampling/sampling_logp_difference/max": 0.5550769805908203, "sampling/sampling_logp_difference/mean": 0.016982442513108254, "step": 560, "step_time": 4.34924570279909 }, { "clip_ratio/high_max": 0.002222222276031971, "clip_ratio/high_mean": 0.0011111111380159855, "clip_ratio/low_mean": 0.0022991543635725977, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003410265501588583, "completions/clipped_ratio": 0.0, "completions/max_length": 1266.8, "completions/max_terminated_length": 1266.8, "completions/mean_length": 1015.125, "completions/mean_terminated_length": 1015.125, "completions/min_length": 774.0, "completions/min_terminated_length": 774.0, "entropy": 0.17396872639656066, "epoch": 0.0452, "frac_reward_zero_std": 0.2, "grad_norm": 0.0472419410943985, "kl": 1.4365731865167617, "learning_rate": 9.946969369344737e-06, "loss": 0.00030971074011176825, "num_tokens": 9935273.0, "reward": 1.1539999842643738, "reward_std": 0.28637823164463044, "rewards/env_game_reward/mean": 1.1539999842643738, "rewards/env_game_reward/std": 0.3490210175514221, "sampling/importance_sampling_ratio/max": 1.4732376098632813, "sampling/importance_sampling_ratio/mean": 0.9988004565238953, "sampling/importance_sampling_ratio/min": 0.72213294506073, "sampling/sampling_logp_difference/max": 0.4380834102630615, "sampling/sampling_logp_difference/mean": 0.011317184008657933, "step": 565, "step_time": 4.254104064800049 }, { "clip_ratio/high_max": 0.002222222276031971, "clip_ratio/high_mean": 0.0011111111380159855, "clip_ratio/low_mean": 0.0034126984886825086, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004523809626698494, "completions/clipped_ratio": 0.0, "completions/max_length": 1366.8, "completions/max_terminated_length": 1366.8, "completions/mean_length": 1055.375, "completions/mean_terminated_length": 1055.375, "completions/min_length": 764.0, "completions/min_terminated_length": 764.0, "entropy": 0.16300990134477616, "epoch": 0.0456, "frac_reward_zero_std": 0.325, "grad_norm": 0.05742908641695976, "kl": 1.029376059770584, "learning_rate": 9.946899664969714e-06, "loss": 0.0016465794295072555, "num_tokens": 10062425.0, "reward": 1.2022499561309814, "reward_std": 0.23970919847488403, "rewards/env_game_reward/mean": 1.2022499561309814, "rewards/env_game_reward/std": 0.3561276257038116, "sampling/importance_sampling_ratio/max": 1.472624158859253, "sampling/importance_sampling_ratio/mean": 1.0107559204101562, "sampling/importance_sampling_ratio/min": 0.7311721444129944, "sampling/sampling_logp_difference/max": 0.49366533756256104, "sampling/sampling_logp_difference/mean": 0.012838244996964931, "step": 570, "step_time": 4.771000784600619 }, { "clip_ratio/high_max": 0.002380952425301075, "clip_ratio/high_mean": 0.0011904762126505376, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011904762126505376, "completions/clipped_ratio": 0.0, "completions/max_length": 1361.2, "completions/max_terminated_length": 1361.2, "completions/mean_length": 1047.8125, "completions/mean_terminated_length": 1047.8125, "completions/min_length": 716.4, "completions/min_terminated_length": 716.4, "entropy": 0.17721292078495027, "epoch": 0.046, "frac_reward_zero_std": 0.35, "grad_norm": 0.07721905410289764, "kl": 1.2694499015808105, "learning_rate": 9.946829305302469e-06, "loss": 0.00022875519935041665, "num_tokens": 10189363.0, "reward": 1.1711250066757202, "reward_std": 0.2282187223434448, "rewards/env_game_reward/mean": 1.1711250066757202, "rewards/env_game_reward/std": 0.38280072808265686, "sampling/importance_sampling_ratio/max": 1.3734643936157227, "sampling/importance_sampling_ratio/mean": 0.9969516515731811, "sampling/importance_sampling_ratio/min": 0.5975641906261444, "sampling/sampling_logp_difference/max": 0.6332993030548095, "sampling/sampling_logp_difference/mean": 0.014802565798163414, "step": 575, "step_time": 5.061972107600741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1397.4, "completions/max_terminated_length": 1397.4, "completions/mean_length": 1023.0375, "completions/mean_terminated_length": 1023.0375, "completions/min_length": 725.2, "completions/min_terminated_length": 725.2, "entropy": 0.15077468156814575, "epoch": 0.0464, "frac_reward_zero_std": 0.25, "grad_norm": 0.034438736736774445, "kl": 0.9025042593479157, "learning_rate": 9.946758290355367e-06, "loss": 0.0001244666986167431, "num_tokens": 10313827.0, "reward": 1.2078750133514404, "reward_std": 0.283373036980629, "rewards/env_game_reward/mean": 1.2078750133514404, "rewards/env_game_reward/std": 0.3918757438659668, "sampling/importance_sampling_ratio/max": 1.2872359037399292, "sampling/importance_sampling_ratio/mean": 0.991502559185028, "sampling/importance_sampling_ratio/min": 0.6531059622764588, "sampling/sampling_logp_difference/max": 0.43392740488052367, "sampling/sampling_logp_difference/mean": 0.011175908334553241, "step": 580, "step_time": 4.977742930199019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0022774327546358107, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022774327546358107, "completions/clipped_ratio": 0.0, "completions/max_length": 1395.6, "completions/max_terminated_length": 1395.6, "completions/mean_length": 1072.8625, "completions/mean_terminated_length": 1072.8625, "completions/min_length": 729.0, "completions/min_terminated_length": 729.0, "entropy": 0.1566304475069046, "epoch": 0.0468, "frac_reward_zero_std": 0.25, "grad_norm": 0.043017007410526276, "kl": 1.0742876887321473, "learning_rate": 9.946686620140896e-06, "loss": -0.0001473414245992899, "num_tokens": 10443641.0, "reward": 1.252625036239624, "reward_std": 0.18862073719501496, "rewards/env_game_reward/mean": 1.252625036239624, "rewards/env_game_reward/std": 0.31137059032917025, "sampling/importance_sampling_ratio/max": 1.440973162651062, "sampling/importance_sampling_ratio/mean": 0.9900519251823425, "sampling/importance_sampling_ratio/min": 0.47393553853034975, "sampling/sampling_logp_difference/max": 0.7416575908660888, "sampling/sampling_logp_difference/mean": 0.017456328868865965, "step": 585, "step_time": 5.251834656600113 }, { "epoch": 0.04712, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1344.6666666666667, "eval_completions/max_terminated_length": 1344.6666666666667, "eval_completions/mean_length": 1168.0416666666667, "eval_completions/mean_terminated_length": 1168.0416666666667, "eval_completions/min_length": 862.3333333333334, "eval_completions/min_terminated_length": 862.3333333333334, "eval_entropy": 0.10806348671515782, "eval_frac_reward_zero_std": 0.5, "eval_kl": 0.8868140776952108, "eval_loss": 0.0005803019157610834, "eval_num_tokens": 10555310.0, "eval_reward": 1.3479166825612385, "eval_reward_std": 0.1431891197959582, "eval_rewards/env_game_reward/mean": 1.3479166825612385, "eval_rewards/env_game_reward/std": 0.2515455484390259, "eval_runtime": 5.5745, "eval_samples_per_second": 1.794, "eval_sampling/importance_sampling_ratio/max": 1.4131007194519043, "eval_sampling/importance_sampling_ratio/mean": 1.034290115038554, "eval_sampling/importance_sampling_ratio/min": 0.6967911720275879, "eval_sampling/sampling_logp_difference/max": 0.37542269627253216, "eval_sampling/sampling_logp_difference/mean": 0.01116576852897803, "eval_steps_per_second": 0.359, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004533906187862158, "clip_ratio/low_min": 0.0022727273404598235, "clip_ratio/region_mean": 0.004533906187862158, "completions/clipped_ratio": 0.0, "completions/max_length": 1399.4, "completions/max_terminated_length": 1399.4, "completions/mean_length": 1164.425, "completions/mean_terminated_length": 1164.425, "completions/min_length": 828.0, "completions/min_terminated_length": 828.0, "entropy": 0.12822237834334374, "epoch": 0.0472, "frac_reward_zero_std": 0.375, "grad_norm": 0.036244940012693405, "kl": 1.0616086661815642, "learning_rate": 9.946614294671653e-06, "loss": 9.502761531621218e-06, "num_tokens": 10581987.0, "reward": 1.3098750352859496, "reward_std": 0.17058949768543244, "rewards/env_game_reward/mean": 1.3098750352859496, "rewards/env_game_reward/std": 0.3004313975572586, "sampling/importance_sampling_ratio/max": 1.4858418226242065, "sampling/importance_sampling_ratio/mean": 1.0072325468063354, "sampling/importance_sampling_ratio/min": 0.5586565196514129, "sampling/sampling_logp_difference/max": 0.6491478204727172, "sampling/sampling_logp_difference/mean": 0.015580065548419952, "step": 590, "step_time": 5.07703026540039 }, { "clip_ratio/high_max": 0.0024390242993831634, "clip_ratio/high_mean": 0.0012195121496915817, "clip_ratio/low_mean": 0.0011904762126505376, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002409988362342119, "completions/clipped_ratio": 0.0, "completions/max_length": 1362.0, "completions/max_terminated_length": 1362.0, "completions/mean_length": 1053.4625, "completions/mean_terminated_length": 1053.4625, "completions/min_length": 767.8, "completions/min_terminated_length": 767.8, "entropy": 0.1259395658969879, "epoch": 0.0476, "frac_reward_zero_std": 0.225, "grad_norm": 0.0859086737036705, "kl": 1.2129336059093476, "learning_rate": 9.94654131396035e-06, "loss": -0.00022206176072359085, "num_tokens": 10709974.0, "reward": 1.2372500419616699, "reward_std": 0.2305168092250824, "rewards/env_game_reward/mean": 1.2372500419616699, "rewards/env_game_reward/std": 0.31874315440654755, "sampling/importance_sampling_ratio/max": 1.3722262382507324, "sampling/importance_sampling_ratio/mean": 0.9773016810417176, "sampling/importance_sampling_ratio/min": 0.5880869805812836, "sampling/sampling_logp_difference/max": 0.6137372255325317, "sampling/sampling_logp_difference/mean": 0.01840695794671774, "step": 595, "step_time": 4.8652321020003 }, { "clip_ratio/high_max": 0.00467391312122345, "clip_ratio/high_mean": 0.002336956560611725, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002336956560611725, "completions/clipped_ratio": 0.0, "completions/max_length": 1369.0, "completions/max_terminated_length": 1369.0, "completions/mean_length": 1018.6375, "completions/mean_terminated_length": 1018.6375, "completions/min_length": 701.6, "completions/min_terminated_length": 701.6, "entropy": 0.12487654238939286, "epoch": 0.048, "frac_reward_zero_std": 0.175, "grad_norm": 0.09567244350910187, "kl": 1.3309521913528441, "learning_rate": 9.94646767801982e-06, "loss": -0.0002675119787454605, "num_tokens": 10833208.0, "reward": 1.2700000047683715, "reward_std": 0.2577404260635376, "rewards/env_game_reward/mean": 1.2700000047683715, "rewards/env_game_reward/std": 0.311552906036377, "sampling/importance_sampling_ratio/max": 1.4984299182891845, "sampling/importance_sampling_ratio/mean": 0.9919448137283325, "sampling/importance_sampling_ratio/min": 0.5176741182804108, "sampling/sampling_logp_difference/max": 0.763327705860138, "sampling/sampling_logp_difference/mean": 0.01984752044081688, "step": 600, "step_time": 5.101835844999004 }, { "epoch": 0.048, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1345.6666666666667, "eval_completions/max_terminated_length": 1345.6666666666667, "eval_completions/mean_length": 1157.8333333333333, "eval_completions/mean_terminated_length": 1157.8333333333333, "eval_completions/min_length": 839.0, "eval_completions/min_terminated_length": 839.0, "eval_entropy": 0.08671100437641144, "eval_frac_reward_zero_std": 0.25, "eval_kl": 0.9999842445055643, "eval_loss": 0.000851858698297292, "eval_num_tokens": 10833208.0, "eval_reward": 1.3208333651224773, "eval_reward_std": 0.14613540470600128, "eval_rewards/env_game_reward/mean": 1.3208333651224773, "eval_rewards/env_game_reward/std": 0.25870031118392944, "eval_runtime": 5.5348, "eval_samples_per_second": 1.807, "eval_sampling/importance_sampling_ratio/max": 1.4152355988820393, "eval_sampling/importance_sampling_ratio/mean": 1.0069709221522014, "eval_sampling/importance_sampling_ratio/min": 0.7177760501702627, "eval_sampling/sampling_logp_difference/max": 0.5611036419868469, "eval_sampling/sampling_logp_difference/mean": 0.013044494514664015, "eval_steps_per_second": 0.361, "step": 600 }, { "clip_ratio/high_max": 0.002380952425301075, "clip_ratio/high_mean": 0.0011904762126505376, "clip_ratio/low_mean": 0.002273901831358671, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034643780440092088, "completions/clipped_ratio": 0.0, "completions/max_length": 1366.8, "completions/max_terminated_length": 1366.8, "completions/mean_length": 1079.5875, "completions/mean_terminated_length": 1079.5875, "completions/min_length": 725.6, "completions/min_terminated_length": 725.6, "entropy": 0.0867956567555666, "epoch": 0.0484, "frac_reward_zero_std": 0.375, "grad_norm": 0.021401429548859596, "kl": 0.9383753180503845, "learning_rate": 9.946393386863004e-06, "loss": 0.0008476153947412967, "num_tokens": 10963503.0, "reward": 1.3545000076293945, "reward_std": 0.1400071382522583, "rewards/env_game_reward/mean": 1.3545000076293945, "rewards/env_game_reward/std": 0.22545891106128693, "sampling/importance_sampling_ratio/max": 1.553718662261963, "sampling/importance_sampling_ratio/mean": 1.0068870782852173, "sampling/importance_sampling_ratio/min": 0.5401573002338409, "sampling/sampling_logp_difference/max": 0.7244943857192994, "sampling/sampling_logp_difference/mean": 0.013488991186022758, "step": 605, "step_time": 4.886114542601717 }, { "clip_ratio/high_max": 0.002222222276031971, "clip_ratio/high_mean": 0.0011111111380159855, "clip_ratio/low_mean": 0.003518666513264179, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004629777651280165, "completions/clipped_ratio": 0.0, "completions/max_length": 1359.2, "completions/max_terminated_length": 1359.2, "completions/mean_length": 1043.95, "completions/mean_terminated_length": 1043.95, "completions/min_length": 776.8, "completions/min_terminated_length": 776.8, "entropy": 0.0908737700432539, "epoch": 0.0488, "frac_reward_zero_std": 0.225, "grad_norm": 0.017844822257757187, "kl": 1.0020154118537903, "learning_rate": 9.946318440502963e-06, "loss": 0.00019409202504903079, "num_tokens": 11090764.0, "reward": 1.2816250324249268, "reward_std": 0.19922733902931214, "rewards/env_game_reward/mean": 1.2816250324249268, "rewards/env_game_reward/std": 0.2757085144519806, "sampling/importance_sampling_ratio/max": 1.3940518379211426, "sampling/importance_sampling_ratio/mean": 0.9948145389556885, "sampling/importance_sampling_ratio/min": 0.520936244726181, "sampling/sampling_logp_difference/max": 0.7158478498458862, "sampling/sampling_logp_difference/mean": 0.013564139045774937, "step": 610, "step_time": 5.003223898000579 }, { "clip_ratio/high_max": 0.0022727273404598235, "clip_ratio/high_mean": 0.0011363636702299118, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011363636702299118, "completions/clipped_ratio": 0.0, "completions/max_length": 1397.0, "completions/max_terminated_length": 1397.0, "completions/mean_length": 1096.8375, "completions/mean_terminated_length": 1096.8375, "completions/min_length": 775.6, "completions/min_terminated_length": 775.6, "entropy": 0.06849537193775176, "epoch": 0.0492, "frac_reward_zero_std": 0.4, "grad_norm": 0.3096023499965668, "kl": 1.5637618243694305, "learning_rate": 9.946242838952871e-06, "loss": -0.00011666212230920792, "num_tokens": 11222478.0, "reward": 1.3425000190734864, "reward_std": 0.1347038432955742, "rewards/env_game_reward/mean": 1.3425000190734864, "rewards/env_game_reward/std": 0.215749391913414, "sampling/importance_sampling_ratio/max": 1.6003359794616698, "sampling/importance_sampling_ratio/mean": 1.0017924785614014, "sampling/importance_sampling_ratio/min": 0.6305740118026734, "sampling/sampling_logp_difference/max": 0.5970579147338867, "sampling/sampling_logp_difference/mean": 0.011274610366672277, "step": 615, "step_time": 4.919843460199627 }, { "clip_ratio/high_max": 0.004651162773370743, "clip_ratio/high_mean": 0.0023255813866853714, "clip_ratio/low_mean": 0.004850464593619108, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00717604598030448, "completions/clipped_ratio": 0.0, "completions/max_length": 1394.2, "completions/max_terminated_length": 1394.2, "completions/mean_length": 1084.875, "completions/mean_terminated_length": 1084.875, "completions/min_length": 701.6, "completions/min_terminated_length": 701.6, "entropy": 0.07974468879401683, "epoch": 0.0496, "frac_reward_zero_std": 0.4, "grad_norm": 0.09348504990339279, "kl": 1.5357508301734923, "learning_rate": 9.946166582226018e-06, "loss": 0.001522757112979889, "num_tokens": 11354065.0, "reward": 1.349375033378601, "reward_std": 0.10235370472073554, "rewards/env_game_reward/mean": 1.349375033378601, "rewards/env_game_reward/std": 0.2258108526468277, "sampling/importance_sampling_ratio/max": 1.5427998304367065, "sampling/importance_sampling_ratio/mean": 0.9522350907325745, "sampling/importance_sampling_ratio/min": 0.401968015730381, "sampling/sampling_logp_difference/max": 1.563115644454956, "sampling/sampling_logp_difference/mean": 0.021521523036062716, "step": 620, "step_time": 5.169165654201788 }, { "clip_ratio/high_max": 0.002222222276031971, "clip_ratio/high_mean": 0.0011111111380159855, "clip_ratio/low_mean": 0.0022266204468905926, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003337731584906578, "completions/clipped_ratio": 0.0, "completions/max_length": 1394.0, "completions/max_terminated_length": 1394.0, "completions/mean_length": 1078.9875, "completions/mean_terminated_length": 1078.9875, "completions/min_length": 741.0, "completions/min_terminated_length": 741.0, "entropy": 0.08226897865533829, "epoch": 0.05, "frac_reward_zero_std": 0.2, "grad_norm": 0.03745371475815773, "kl": 1.0471153795719146, "learning_rate": 9.946089670335809e-06, "loss": -4.6807329636067155e-05, "num_tokens": 11484123.0, "reward": 1.3264999866485596, "reward_std": 0.15733125507831575, "rewards/env_game_reward/mean": 1.3264999866485596, "rewards/env_game_reward/std": 0.248744997382164, "sampling/importance_sampling_ratio/max": 1.4238209962844848, "sampling/importance_sampling_ratio/mean": 0.9809260606765747, "sampling/importance_sampling_ratio/min": 0.6000564575195313, "sampling/sampling_logp_difference/max": 0.7538355946540832, "sampling/sampling_logp_difference/mean": 0.012904992513358592, "step": 625, "step_time": 5.083012864999182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0010204081423580646, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010204081423580646, "completions/clipped_ratio": 0.0, "completions/max_length": 1500.2, "completions/max_terminated_length": 1500.2, "completions/mean_length": 1179.3125, "completions/mean_terminated_length": 1179.3125, "completions/min_length": 850.6, "completions/min_terminated_length": 850.6, "entropy": 0.12509409189224244, "epoch": 0.0504, "frac_reward_zero_std": 0.175, "grad_norm": 0.04384912550449371, "kl": 1.6370096325874328, "learning_rate": 9.946012103295763e-06, "loss": 0.00025049857795238494, "num_tokens": 11621524.0, "reward": 1.265916681289673, "reward_std": 0.2652829051017761, "rewards/env_game_reward/mean": 1.265916681289673, "rewards/env_game_reward/std": 0.38448314666748046, "sampling/importance_sampling_ratio/max": 1.7073483228683473, "sampling/importance_sampling_ratio/mean": 0.9963474154472352, "sampling/importance_sampling_ratio/min": 0.4732287287712097, "sampling/sampling_logp_difference/max": 0.8960028886795044, "sampling/sampling_logp_difference/mean": 0.02103390172123909, "step": 630, "step_time": 5.693049415801215 }, { "clip_ratio/high_max": 0.004001600667834282, "clip_ratio/high_mean": 0.002000800333917141, "clip_ratio/low_mean": 0.0038125600665807726, "clip_ratio/low_min": 0.00181818176060915, "clip_ratio/region_mean": 0.0058133604004979135, "completions/clipped_ratio": 0.0, "completions/max_length": 1703.2, "completions/max_terminated_length": 1703.2, "completions/mean_length": 1235.5875, "completions/mean_terminated_length": 1235.5875, "completions/min_length": 869.0, "completions/min_terminated_length": 869.0, "entropy": 0.13607805520296096, "epoch": 0.0508, "frac_reward_zero_std": 0.075, "grad_norm": 0.07780463993549347, "kl": 1.3788925766944886, "learning_rate": 9.94593388111952e-06, "loss": 0.0022235548123717306, "num_tokens": 11763516.0, "reward": 1.2385416746139526, "reward_std": 0.3068254292011261, "rewards/env_game_reward/mean": 1.2385416746139526, "rewards/env_game_reward/std": 0.3703270435333252, "sampling/importance_sampling_ratio/max": 1.830536437034607, "sampling/importance_sampling_ratio/mean": 1.037343180179596, "sampling/importance_sampling_ratio/min": 0.6153202593326569, "sampling/sampling_logp_difference/max": 0.6751439929008484, "sampling/sampling_logp_difference/mean": 0.018885864317417143, "step": 635, "step_time": 6.099398655800906 }, { "clip_ratio/high_max": 0.003704974241554737, "clip_ratio/high_mean": 0.0018524871207773685, "clip_ratio/low_mean": 0.0010416666977107526, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002894153818488121, "completions/clipped_ratio": 0.0, "completions/max_length": 1667.2, "completions/max_terminated_length": 1667.2, "completions/mean_length": 1243.275, "completions/mean_terminated_length": 1243.275, "completions/min_length": 862.8, "completions/min_terminated_length": 862.8, "entropy": 0.14713116511702537, "epoch": 0.0512, "frac_reward_zero_std": 0.1, "grad_norm": 0.06170591339468956, "kl": 1.3155274748802186, "learning_rate": 9.945855003820824e-06, "loss": 0.0015274440869688989, "num_tokens": 11905864.0, "reward": 1.2326250314712524, "reward_std": 0.2401216834783554, "rewards/env_game_reward/mean": 1.2326250314712524, "rewards/env_game_reward/std": 0.33104810416698455, "sampling/importance_sampling_ratio/max": 1.4255517959594726, "sampling/importance_sampling_ratio/mean": 0.9756847858428955, "sampling/importance_sampling_ratio/min": 0.591080230474472, "sampling/sampling_logp_difference/max": 0.5704588890075684, "sampling/sampling_logp_difference/mean": 0.015524715185165405, "step": 640, "step_time": 6.075532784599636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0019615384750068187, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019615384750068187, "completions/clipped_ratio": 0.0, "completions/max_length": 1701.2, "completions/max_terminated_length": 1701.2, "completions/mean_length": 1271.575, "completions/mean_terminated_length": 1271.575, "completions/min_length": 888.0, "completions/min_terminated_length": 888.0, "entropy": 0.1372353859245777, "epoch": 0.0516, "frac_reward_zero_std": 0.2, "grad_norm": 0.051711492240428925, "kl": 1.1370346248149872, "learning_rate": 9.945775471413545e-06, "loss": -0.00048790322616696356, "num_tokens": 12050609.0, "reward": 1.2797083377838134, "reward_std": 0.25072828233242034, "rewards/env_game_reward/mean": 1.2797083377838134, "rewards/env_game_reward/std": 0.34153226017951965, "sampling/importance_sampling_ratio/max": 1.3425079822540282, "sampling/importance_sampling_ratio/mean": 0.9526666402816772, "sampling/importance_sampling_ratio/min": 0.6094960927963257, "sampling/sampling_logp_difference/max": 0.49349377155303953, "sampling/sampling_logp_difference/mean": 0.013823052123188972, "step": 645, "step_time": 5.696355382000911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001963804382830858, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001963804382830858, "completions/clipped_ratio": 0.0, "completions/max_length": 1709.6, "completions/max_terminated_length": 1709.6, "completions/mean_length": 1273.3875, "completions/mean_terminated_length": 1273.3875, "completions/min_length": 930.0, "completions/min_terminated_length": 930.0, "entropy": 0.1402006097137928, "epoch": 0.052, "frac_reward_zero_std": 0.175, "grad_norm": 0.053286418318748474, "kl": 0.9155610263347626, "learning_rate": 9.945695283911661e-06, "loss": 0.0008199753239750863, "num_tokens": 12194619.0, "reward": 1.1976250171661378, "reward_std": 0.28514082431793214, "rewards/env_game_reward/mean": 1.1976250171661378, "rewards/env_game_reward/std": 0.36828628182411194, "sampling/importance_sampling_ratio/max": 1.6765745401382446, "sampling/importance_sampling_ratio/mean": 0.9975801944732666, "sampling/importance_sampling_ratio/min": 0.5081476271152496, "sampling/sampling_logp_difference/max": 0.6622140645980835, "sampling/sampling_logp_difference/mean": 0.014845983497798443, "step": 650, "step_time": 5.861162030600099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.002022058889269829, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002022058889269829, "completions/clipped_ratio": 0.0, "completions/max_length": 1546.0, "completions/max_terminated_length": 1546.0, "completions/mean_length": 1165.9375, "completions/mean_terminated_length": 1165.9375, "completions/min_length": 874.4, "completions/min_terminated_length": 874.4, "entropy": 0.13178882375359535, "epoch": 0.0524, "frac_reward_zero_std": 0.05, "grad_norm": 0.07173304259777069, "kl": 1.5115601539611816, "learning_rate": 9.94561444132927e-06, "loss": 0.000665221968665719, "num_tokens": 12329314.0, "reward": 1.0957500100135804, "reward_std": 0.3418861448764801, "rewards/env_game_reward/mean": 1.0957500100135804, "rewards/env_game_reward/std": 0.41972014904022215, "sampling/importance_sampling_ratio/max": 1.5416927576065063, "sampling/importance_sampling_ratio/mean": 1.0306774377822876, "sampling/importance_sampling_ratio/min": 0.6824665546417237, "sampling/sampling_logp_difference/max": 0.5923709630966186, "sampling/sampling_logp_difference/mean": 0.014958329871296883, "step": 655, "step_time": 4.971940669400647 }, { "clip_ratio/high_max": 0.005824592150747776, "clip_ratio/high_mean": 0.003932704217731953, "clip_ratio/low_mean": 0.001981946639716625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005914650764316321, "completions/clipped_ratio": 0.0, "completions/max_length": 1711.8, "completions/max_terminated_length": 1711.8, "completions/mean_length": 1323.8, "completions/mean_terminated_length": 1323.8, "completions/min_length": 925.4, "completions/min_terminated_length": 925.4, "entropy": 0.11637530401349068, "epoch": 0.0528, "frac_reward_zero_std": 0.1, "grad_norm": 0.04864488169550896, "kl": 0.9532750248908997, "learning_rate": 9.945532943680585e-06, "loss": 0.001364955585449934, "num_tokens": 12478993.0, "reward": 1.1640416860580445, "reward_std": 0.30364345014095306, "rewards/env_game_reward/mean": 1.1640416860580445, "rewards/env_game_reward/std": 0.3607664942741394, "sampling/importance_sampling_ratio/max": 1.532017135620117, "sampling/importance_sampling_ratio/mean": 0.9525379180908203, "sampling/importance_sampling_ratio/min": 0.4082610189914703, "sampling/sampling_logp_difference/max": 0.7750426650047302, "sampling/sampling_logp_difference/mean": 0.01715007275342941, "step": 660, "step_time": 5.538303243199334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004831413738429546, "clip_ratio/low_min": 0.00181818176060915, "clip_ratio/region_mean": 0.004831413738429546, "completions/clipped_ratio": 0.0, "completions/max_length": 1708.8, "completions/max_terminated_length": 1708.8, "completions/mean_length": 1283.0875, "completions/mean_terminated_length": 1283.0875, "completions/min_length": 951.2, "completions/min_terminated_length": 951.2, "entropy": 0.08296727910637855, "epoch": 0.0532, "frac_reward_zero_std": 0.175, "grad_norm": 0.0572076216340065, "kl": 0.8097724616527557, "learning_rate": 9.945450790979927e-06, "loss": 0.0013984257355332374, "num_tokens": 12624480.0, "reward": 1.2450417280197144, "reward_std": 0.2561494290828705, "rewards/env_game_reward/mean": 1.2450417280197144, "rewards/env_game_reward/std": 0.3560167372226715, "sampling/importance_sampling_ratio/max": 1.6131016254425048, "sampling/importance_sampling_ratio/mean": 0.9815467715263366, "sampling/importance_sampling_ratio/min": 0.6314162969589233, "sampling/sampling_logp_difference/max": 0.6175156712532044, "sampling/sampling_logp_difference/mean": 0.012766172736883163, "step": 665, "step_time": 5.5065391923984865 }, { "clip_ratio/high_max": 0.003603896126151085, "clip_ratio/high_mean": 0.0018019480630755424, "clip_ratio/low_mean": 0.002848330978304148, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00465027904137969, "completions/clipped_ratio": 0.0, "completions/max_length": 1633.8, "completions/max_terminated_length": 1633.8, "completions/mean_length": 1327.1375, "completions/mean_terminated_length": 1327.1375, "completions/min_length": 937.4, "completions/min_terminated_length": 937.4, "entropy": 0.11601686254143714, "epoch": 0.0536, "frac_reward_zero_std": 0.025, "grad_norm": 0.0497339628636837, "kl": 2.4915341794490815, "learning_rate": 9.94536798324174e-06, "loss": 0.0018591852858662605, "num_tokens": 12775518.0, "reward": 1.0752083778381347, "reward_std": 0.42891920208930967, "rewards/env_game_reward/mean": 1.0752083778381347, "rewards/env_game_reward/std": 0.43512017130851743, "sampling/importance_sampling_ratio/max": 1.531420397758484, "sampling/importance_sampling_ratio/mean": 1.0097869038581848, "sampling/importance_sampling_ratio/min": 0.5700468063354492, "sampling/sampling_logp_difference/max": 0.4885990142822266, "sampling/sampling_logp_difference/mean": 0.01330602504312992, "step": 670, "step_time": 5.495691229999648 }, { "clip_ratio/high_max": 0.0038838613778352737, "clip_ratio/high_mean": 0.0019419306889176368, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019419306889176368, "completions/clipped_ratio": 0.0, "completions/max_length": 1708.6, "completions/max_terminated_length": 1708.6, "completions/mean_length": 1330.8, "completions/mean_terminated_length": 1330.8, "completions/min_length": 972.6, "completions/min_terminated_length": 972.6, "entropy": 0.10391684621572495, "epoch": 0.054, "frac_reward_zero_std": 0.275, "grad_norm": 0.060416221618652344, "kl": 0.8995959877967834, "learning_rate": 9.945284520480583e-06, "loss": 0.0008666712790727615, "num_tokens": 12925994.0, "reward": 1.2439167022705078, "reward_std": 0.22910259962081908, "rewards/env_game_reward/mean": 1.2439167022705078, "rewards/env_game_reward/std": 0.35838334560394286, "sampling/importance_sampling_ratio/max": 1.6800979375839233, "sampling/importance_sampling_ratio/mean": 1.035771083831787, "sampling/importance_sampling_ratio/min": 0.751843523979187, "sampling/sampling_logp_difference/max": 0.5335385739803314, "sampling/sampling_logp_difference/mean": 0.01053644772619009, "step": 675, "step_time": 5.700633796999318 }, { "epoch": 0.054, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1647.0, "eval_completions/max_terminated_length": 1647.0, "eval_completions/mean_length": 1418.6666666666667, "eval_completions/mean_terminated_length": 1418.6666666666667, "eval_completions/min_length": 1066.3333333333333, "eval_completions/min_terminated_length": 1066.3333333333333, "eval_entropy": 0.14034629861513773, "eval_frac_reward_zero_std": 0.0, "eval_kl": 0.8713982502619425, "eval_loss": -0.0012209699489176273, "eval_num_tokens": 12925994.0, "eval_reward": 1.1308333277702332, "eval_reward_std": 0.34923218687375385, "eval_rewards/env_game_reward/mean": 1.1308333277702332, "eval_rewards/env_game_reward/std": 0.39376481374104816, "eval_runtime": 5.821, "eval_samples_per_second": 1.718, "eval_sampling/importance_sampling_ratio/max": 1.0657129685084026, "eval_sampling/importance_sampling_ratio/mean": 0.9151384830474854, "eval_sampling/importance_sampling_ratio/min": 0.6475860774517059, "eval_sampling/sampling_logp_difference/max": 0.4063406785329183, "eval_sampling/sampling_logp_difference/mean": 0.01308465547238787, "eval_steps_per_second": 0.344, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0019294990226626397, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019294990226626397, "completions/clipped_ratio": 0.0, "completions/max_length": 1713.6, "completions/max_terminated_length": 1713.6, "completions/mean_length": 1285.875, "completions/mean_terminated_length": 1285.875, "completions/min_length": 907.2, "completions/min_terminated_length": 907.2, "entropy": 0.1750231884419918, "epoch": 0.0544, "frac_reward_zero_std": 0.225, "grad_norm": 0.10211564600467682, "kl": 0.8659535050392151, "learning_rate": 9.945200402711124e-06, "loss": 0.0010523485019803048, "num_tokens": 13072169.0, "reward": 1.1676250457763673, "reward_std": 0.29786873459815977, "rewards/env_game_reward/mean": 1.1676250457763673, "rewards/env_game_reward/std": 0.4279802978038788, "sampling/importance_sampling_ratio/max": 1.4572262048721314, "sampling/importance_sampling_ratio/mean": 1.0047440648078918, "sampling/importance_sampling_ratio/min": 0.6181734561920166, "sampling/sampling_logp_difference/max": 0.5347591876983643, "sampling/sampling_logp_difference/mean": 0.013022671453654766, "step": 680, "step_time": 5.70217539860023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0010416666977107526, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010416666977107526, "completions/clipped_ratio": 0.0, "completions/max_length": 1707.0, "completions/max_terminated_length": 1707.0, "completions/mean_length": 1213.6875, "completions/mean_terminated_length": 1213.6875, "completions/min_length": 914.6, "completions/min_terminated_length": 914.6, "entropy": 0.16222424656152726, "epoch": 0.0548, "frac_reward_zero_std": 0.15, "grad_norm": 0.046301327645778656, "kl": 1.311411839723587, "learning_rate": 9.945115629948152e-06, "loss": 0.0011385465040802956, "num_tokens": 13211179.0, "reward": 1.187375044822693, "reward_std": 0.2891477644443512, "rewards/env_game_reward/mean": 1.187375044822693, "rewards/env_game_reward/std": 0.40909904837608335, "sampling/importance_sampling_ratio/max": 1.319038200378418, "sampling/importance_sampling_ratio/mean": 0.9719479799270629, "sampling/importance_sampling_ratio/min": 0.6795879960060119, "sampling/sampling_logp_difference/max": 0.41148210763931276, "sampling/sampling_logp_difference/mean": 0.010176723822951317, "step": 685, "step_time": 5.558335913400515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0010416666977107526, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010416666977107526, "completions/clipped_ratio": 0.0, "completions/max_length": 1661.6, "completions/max_terminated_length": 1661.6, "completions/mean_length": 1282.8875, "completions/mean_terminated_length": 1282.8875, "completions/min_length": 927.4, "completions/min_terminated_length": 927.4, "entropy": 0.15061740726232528, "epoch": 0.0552, "frac_reward_zero_std": 0.225, "grad_norm": 0.03988010436296463, "kl": 0.9013077259063721, "learning_rate": 9.945030202206567e-06, "loss": 0.0009480739012360573, "num_tokens": 13357280.0, "reward": 1.1207083702087401, "reward_std": 0.29786873757839205, "rewards/env_game_reward/mean": 1.1207083702087401, "rewards/env_game_reward/std": 0.4572018027305603, "sampling/importance_sampling_ratio/max": 1.295949673652649, "sampling/importance_sampling_ratio/mean": 1.021243405342102, "sampling/importance_sampling_ratio/min": 0.7810358166694641, "sampling/sampling_logp_difference/max": 0.33255696296691895, "sampling/sampling_logp_difference/mean": 0.010920869559049607, "step": 690, "step_time": 5.616496510598518 }, { "clip_ratio/high_max": 0.0018867924809455872, "clip_ratio/high_mean": 0.0009433962404727936, "clip_ratio/low_mean": 0.0018424611538648606, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002785857394337654, "completions/clipped_ratio": 0.0, "completions/max_length": 1627.0, "completions/max_terminated_length": 1627.0, "completions/mean_length": 1297.0, "completions/mean_terminated_length": 1297.0, "completions/min_length": 929.6, "completions/min_terminated_length": 929.6, "entropy": 0.17935092002153397, "epoch": 0.0556, "frac_reward_zero_std": 0.05, "grad_norm": 0.04438179358839989, "kl": 1.0830827236175538, "learning_rate": 9.944944119501391e-06, "loss": 0.00012169899418950081, "num_tokens": 13503504.0, "reward": 1.102041745185852, "reward_std": 0.4174876242876053, "rewards/env_game_reward/mean": 1.102041745185852, "rewards/env_game_reward/std": 0.4797330558300018, "sampling/importance_sampling_ratio/max": 1.2585298061370849, "sampling/importance_sampling_ratio/mean": 0.991274642944336, "sampling/importance_sampling_ratio/min": 0.684896045923233, "sampling/sampling_logp_difference/max": 0.49414026737213135, "sampling/sampling_logp_difference/mean": 0.011651785112917423, "step": 695, "step_time": 5.530571238000266 }, { "clip_ratio/high_max": 0.0019999999552965165, "clip_ratio/high_mean": 0.0009999999776482583, "clip_ratio/low_mean": 0.0019615384750068187, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002961538452655077, "completions/clipped_ratio": 0.0, "completions/max_length": 1601.0, "completions/max_terminated_length": 1601.0, "completions/mean_length": 1268.6, "completions/mean_terminated_length": 1268.6, "completions/min_length": 884.4, "completions/min_terminated_length": 884.4, "entropy": 0.17323798462748527, "epoch": 0.056, "frac_reward_zero_std": 0.125, "grad_norm": 0.06453540921211243, "kl": 1.1303745985031128, "learning_rate": 9.94485738184775e-06, "loss": 0.004169812053442001, "num_tokens": 13648095.0, "reward": 1.2017917394638062, "reward_std": 0.3141322135925293, "rewards/env_game_reward/mean": 1.2017917394638062, "rewards/env_game_reward/std": 0.4132618486881256, "sampling/importance_sampling_ratio/max": 1.5371316432952882, "sampling/importance_sampling_ratio/mean": 0.9977814793586731, "sampling/importance_sampling_ratio/min": 0.6063427329063416, "sampling/sampling_logp_difference/max": 0.4800256252288818, "sampling/sampling_logp_difference/mean": 0.0133250679820776, "step": 700, "step_time": 5.324122882598749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0009433962404727936, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009433962404727936, "completions/clipped_ratio": 0.0, "completions/max_length": 1708.0, "completions/max_terminated_length": 1708.0, "completions/mean_length": 1299.7, "completions/mean_terminated_length": 1299.7, "completions/min_length": 908.8, "completions/min_terminated_length": 908.8, "entropy": 0.12524148747324942, "epoch": 0.0564, "frac_reward_zero_std": 0.175, "grad_norm": 0.05619892105460167, "kl": 1.2845276713371276, "learning_rate": 9.944769989260896e-06, "loss": 2.784114331007004e-05, "num_tokens": 13796057.0, "reward": 1.2679583549499511, "reward_std": 0.28408015966415406, "rewards/env_game_reward/mean": 1.2679583549499511, "rewards/env_game_reward/std": 0.3855504870414734, "sampling/importance_sampling_ratio/max": 1.3967360258102417, "sampling/importance_sampling_ratio/mean": 1.0201675534248351, "sampling/importance_sampling_ratio/min": 0.6278232634067535, "sampling/sampling_logp_difference/max": 0.47589802742004395, "sampling/sampling_logp_difference/mean": 0.010255707520991563, "step": 705, "step_time": 5.642836955800158 }, { "clip_ratio/high_max": 0.002083333395421505, "clip_ratio/high_mean": 0.0010416666977107526, "clip_ratio/low_mean": 0.0018894830718636513, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002931149769574404, "completions/clipped_ratio": 0.0, "completions/max_length": 1591.0, "completions/max_terminated_length": 1591.0, "completions/mean_length": 1210.55, "completions/mean_terminated_length": 1210.55, "completions/min_length": 846.0, "completions/min_terminated_length": 846.0, "entropy": 0.11609519273042679, "epoch": 0.0568, "frac_reward_zero_std": 0.025, "grad_norm": 0.07330711930990219, "kl": 1.102458918094635, "learning_rate": 9.944681941756187e-06, "loss": -0.00021149502135813235, "num_tokens": 13934913.0, "reward": 1.2933750391006469, "reward_std": 0.2579171985387802, "rewards/env_game_reward/mean": 1.2933750391006469, "rewards/env_game_reward/std": 0.3375827193260193, "sampling/importance_sampling_ratio/max": 1.4367228507995606, "sampling/importance_sampling_ratio/mean": 0.9958265781402588, "sampling/importance_sampling_ratio/min": 0.5978623509407044, "sampling/sampling_logp_difference/max": 0.5757256031036377, "sampling/sampling_logp_difference/mean": 0.011967730149626732, "step": 710, "step_time": 5.246779600399895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1703.6, "completions/max_terminated_length": 1703.6, "completions/mean_length": 1318.8625, "completions/mean_terminated_length": 1318.8625, "completions/min_length": 995.0, "completions/min_terminated_length": 995.0, "entropy": 0.09110566601157188, "epoch": 0.0572, "frac_reward_zero_std": 0.175, "grad_norm": 0.03678731247782707, "kl": 1.1454283237457275, "learning_rate": 9.944593239349106e-06, "loss": -0.00031706057488918306, "num_tokens": 14084958.0, "reward": 1.3445000171661377, "reward_std": 0.2526728391647339, "rewards/env_game_reward/mean": 1.3445000171661377, "rewards/env_game_reward/std": 0.33565597534179686, "sampling/importance_sampling_ratio/max": 1.3654388189315796, "sampling/importance_sampling_ratio/mean": 1.0032867074012757, "sampling/importance_sampling_ratio/min": 0.7344457149505615, "sampling/sampling_logp_difference/max": 0.3897515535354614, "sampling/sampling_logp_difference/mean": 0.0067275471054017546, "step": 715, "step_time": 5.783622994599137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0019615384750068187, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019615384750068187, "completions/clipped_ratio": 0.0, "completions/max_length": 1663.6, "completions/max_terminated_length": 1663.6, "completions/mean_length": 1257.9125, "completions/mean_terminated_length": 1257.9125, "completions/min_length": 884.6, "completions/min_terminated_length": 884.6, "entropy": 0.09665783047676087, "epoch": 0.0576, "frac_reward_zero_std": 0.25, "grad_norm": 0.011852607131004333, "kl": 1.0820326685905457, "learning_rate": 9.944503882055243e-06, "loss": 0.0012081136927008628, "num_tokens": 14228074.0, "reward": 1.3299166679382324, "reward_std": 0.2917994186282158, "rewards/env_game_reward/mean": 1.3299166679382324, "rewards/env_game_reward/std": 0.33310834765434266, "sampling/importance_sampling_ratio/max": 1.1939823865890502, "sampling/importance_sampling_ratio/mean": 1.0095659494400024, "sampling/importance_sampling_ratio/min": 0.8032400846481323, "sampling/sampling_logp_difference/max": 0.28812804222106936, "sampling/sampling_logp_difference/mean": 0.006528158858418465, "step": 720, "step_time": 5.56530989899984 }, { "clip_ratio/high_max": 0.00181818176060915, "clip_ratio/high_mean": 0.0014904862269759177, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014904862269759177, "completions/clipped_ratio": 0.0, "completions/max_length": 1624.6, "completions/max_terminated_length": 1624.6, "completions/mean_length": 1252.5625, "completions/mean_terminated_length": 1252.5625, "completions/min_length": 871.8, "completions/min_terminated_length": 871.8, "entropy": 0.11173812597990036, "epoch": 0.058, "frac_reward_zero_std": 0.225, "grad_norm": 0.06697779148817062, "kl": 1.3808134913444519, "learning_rate": 9.944413869890304e-06, "loss": -0.0006437717471271754, "num_tokens": 14371761.0, "reward": 1.2972917079925537, "reward_std": 0.228572279214859, "rewards/env_game_reward/mean": 1.2972917079925537, "rewards/env_game_reward/std": 0.29336669147014616, "sampling/importance_sampling_ratio/max": 1.120638871192932, "sampling/importance_sampling_ratio/mean": 0.9524656891822815, "sampling/importance_sampling_ratio/min": 0.5668362379074097, "sampling/sampling_logp_difference/max": 0.6708258748054504, "sampling/sampling_logp_difference/mean": 0.00866871876642108, "step": 725, "step_time": 5.52822365559914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0029842125251889227, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029842125251889227, "completions/clipped_ratio": 0.0, "completions/max_length": 1628.2, "completions/max_terminated_length": 1628.2, "completions/mean_length": 1275.55, "completions/mean_terminated_length": 1275.55, "completions/min_length": 883.2, "completions/min_terminated_length": 883.2, "entropy": 0.1060483768582344, "epoch": 0.0584, "frac_reward_zero_std": 0.25, "grad_norm": 0.022124866023659706, "kl": 1.2471987664699555, "learning_rate": 9.944323202870115e-06, "loss": 0.0006659584119915962, "num_tokens": 14517088.0, "reward": 1.312000036239624, "reward_std": 0.22073518633842468, "rewards/env_game_reward/mean": 1.312000036239624, "rewards/env_game_reward/std": 0.3286490172147751, "sampling/importance_sampling_ratio/max": 1.438795232772827, "sampling/importance_sampling_ratio/mean": 0.9970067143440247, "sampling/importance_sampling_ratio/min": 0.6118991255760193, "sampling/sampling_logp_difference/max": 0.4648634433746338, "sampling/sampling_logp_difference/mean": 0.009209131356328726, "step": 730, "step_time": 5.445555883600173 }, { "clip_ratio/high_max": 0.001960784383118153, "clip_ratio/high_mean": 0.0009803921915590764, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009803921915590764, "completions/clipped_ratio": 0.0, "completions/max_length": 1680.2, "completions/max_terminated_length": 1680.2, "completions/mean_length": 1208.5, "completions/mean_terminated_length": 1208.5, "completions/min_length": 853.2, "completions/min_terminated_length": 853.2, "entropy": 0.10054874122142791, "epoch": 0.0588, "frac_reward_zero_std": 0.15, "grad_norm": 0.052741412073373795, "kl": 1.1816912770271302, "learning_rate": 9.944231881010614e-06, "loss": 0.0007746644783765078, "num_tokens": 14657077.0, "reward": 1.2604999780654906, "reward_std": 0.22438856363296508, "rewards/env_game_reward/mean": 1.2604999780654906, "rewards/env_game_reward/std": 0.2969023913145065, "sampling/importance_sampling_ratio/max": 1.2568823099136353, "sampling/importance_sampling_ratio/mean": 0.9825718402862549, "sampling/importance_sampling_ratio/min": 0.7182797014713287, "sampling/sampling_logp_difference/max": 0.42328827977180483, "sampling/sampling_logp_difference/mean": 0.009410615637898445, "step": 735, "step_time": 5.590113839400146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0027369407005608084, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027369407005608084, "completions/clipped_ratio": 0.0, "completions/max_length": 1674.6, "completions/max_terminated_length": 1674.6, "completions/mean_length": 1325.8, "completions/mean_terminated_length": 1325.8, "completions/min_length": 881.8, "completions/min_terminated_length": 881.8, "entropy": 0.08453329205513001, "epoch": 0.0592, "frac_reward_zero_std": 0.2, "grad_norm": 0.05587141960859299, "kl": 1.1068053007125855, "learning_rate": 9.944139904327855e-06, "loss": 0.001067483052611351, "num_tokens": 14807478.0, "reward": 1.3697916984558105, "reward_std": 0.21584435254335405, "rewards/env_game_reward/mean": 1.3697916984558105, "rewards/env_game_reward/std": 0.29844184815883634, "sampling/importance_sampling_ratio/max": 1.4578536033630372, "sampling/importance_sampling_ratio/mean": 1.0180188417434692, "sampling/importance_sampling_ratio/min": 0.7393244504928589, "sampling/sampling_logp_difference/max": 0.47165329456329347, "sampling/sampling_logp_difference/mean": 0.007859865296632051, "step": 740, "step_time": 5.500019186200371 }, { "clip_ratio/high_max": 0.0019999999552965165, "clip_ratio/high_mean": 0.0009999999776482583, "clip_ratio/low_mean": 0.0015050167683511972, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002505016792565584, "completions/clipped_ratio": 0.0, "completions/max_length": 1540.0, "completions/max_terminated_length": 1540.0, "completions/mean_length": 1221.325, "completions/mean_terminated_length": 1221.325, "completions/min_length": 889.2, "completions/min_terminated_length": 889.2, "entropy": 0.17823160253465176, "epoch": 0.0596, "frac_reward_zero_std": 0.125, "grad_norm": 0.2046627402305603, "kl": 2.1721172749996187, "learning_rate": 9.944047272838003e-06, "loss": 0.005118013173341751, "num_tokens": 14947959.0, "reward": 1.328333306312561, "reward_std": 0.23499515950679778, "rewards/env_game_reward/mean": 1.328333306312561, "rewards/env_game_reward/std": 0.3135849416255951, "sampling/importance_sampling_ratio/max": 1.5344476461410523, "sampling/importance_sampling_ratio/mean": 1.0067410469055176, "sampling/importance_sampling_ratio/min": 0.5616122364997982, "sampling/sampling_logp_difference/max": 5.125636863708496, "sampling/sampling_logp_difference/mean": 0.022377347387373448, "step": 745, "step_time": 5.2221048694002095 }, { "clip_ratio/high_max": 0.004959130194038153, "clip_ratio/high_mean": 0.0024795650970190763, "clip_ratio/low_mean": 0.0043784501031041145, "clip_ratio/low_min": 0.0013333333656191826, "clip_ratio/region_mean": 0.006858015200123191, "completions/clipped_ratio": 0.0, "completions/max_length": 1596.4, "completions/max_terminated_length": 1596.4, "completions/mean_length": 1219.2125, "completions/mean_terminated_length": 1219.2125, "completions/min_length": 906.8, "completions/min_terminated_length": 906.8, "entropy": 0.19808728545904158, "epoch": 0.06, "frac_reward_zero_std": 0.25, "grad_norm": 0.028860095888376236, "kl": 1.1904378652572631, "learning_rate": 9.943953986557342e-06, "loss": 0.0009089265018701554, "num_tokens": 15088378.0, "reward": 1.3279583692550658, "reward_std": 0.21242666840553284, "rewards/env_game_reward/mean": 1.3279583692550658, "rewards/env_game_reward/std": 0.3157786726951599, "sampling/importance_sampling_ratio/max": 1.2873046875, "sampling/importance_sampling_ratio/mean": 0.9388688564300537, "sampling/importance_sampling_ratio/min": 0.37807847261428834, "sampling/sampling_logp_difference/max": 0.7840029001235962, "sampling/sampling_logp_difference/mean": 0.016058788914233447, "step": 750, "step_time": 5.506602738799847 }, { "epoch": 0.06, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1642.3333333333333, "eval_completions/max_terminated_length": 1642.3333333333333, "eval_completions/mean_length": 1389.2916666666667, "eval_completions/mean_terminated_length": 1389.2916666666667, "eval_completions/min_length": 1053.0, "eval_completions/min_terminated_length": 1053.0, "eval_entropy": 0.16020495692888895, "eval_frac_reward_zero_std": 0.16666666666666666, "eval_kl": 1.373230218887329, "eval_loss": -0.0009800799889490008, "eval_num_tokens": 15088378.0, "eval_reward": 1.1752777894337971, "eval_reward_std": 0.38537317017714184, "eval_rewards/env_game_reward/mean": 1.1752777894337971, "eval_rewards/env_game_reward/std": 0.42335541049639386, "eval_runtime": 5.7638, "eval_samples_per_second": 1.735, "eval_sampling/importance_sampling_ratio/max": 1.4313355684280396, "eval_sampling/importance_sampling_ratio/mean": 1.0740643541018169, "eval_sampling/importance_sampling_ratio/min": 0.8583299318949381, "eval_sampling/sampling_logp_difference/max": 0.40136027336120605, "eval_sampling/sampling_logp_difference/mean": 0.011960081600894531, "eval_steps_per_second": 0.347, "step": 750 }, { "clip_ratio/high_max": 0.0028861789032816887, "clip_ratio/high_mean": 0.002068089507520199, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002068089507520199, "completions/clipped_ratio": 0.0, "completions/max_length": 1881.0, "completions/max_terminated_length": 1881.0, "completions/mean_length": 1388.0875, "completions/mean_terminated_length": 1388.0875, "completions/min_length": 982.2, "completions/min_terminated_length": 982.2, "entropy": 0.2721471354365349, "epoch": 0.0604, "frac_reward_zero_std": 0.075, "grad_norm": 0.13366009294986725, "kl": 1.4383060097694398, "learning_rate": 9.943860045502275e-06, "loss": 0.0013721118681132793, "num_tokens": 15242399.0, "reward": 1.1892500162124633, "reward_std": 0.40534055829048155, "rewards/env_game_reward/mean": 1.1892500162124633, "rewards/env_game_reward/std": 0.4778396964073181, "sampling/importance_sampling_ratio/max": 1.5637898206710816, "sampling/importance_sampling_ratio/mean": 1.0453999996185304, "sampling/importance_sampling_ratio/min": 0.7041348934173584, "sampling/sampling_logp_difference/max": 0.39543609619140624, "sampling/sampling_logp_difference/mean": 0.016556292213499545, "step": 755, "step_time": 6.289189056601754 }, { "clip_ratio/high_max": 0.001666666753590107, "clip_ratio/high_mean": 0.0008333333767950535, "clip_ratio/low_mean": 0.0016708438284695148, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002504177112132311, "completions/clipped_ratio": 0.0, "completions/max_length": 1989.6, "completions/max_terminated_length": 1989.6, "completions/mean_length": 1483.5125, "completions/mean_terminated_length": 1483.5125, "completions/min_length": 948.0, "completions/min_terminated_length": 948.0, "entropy": 0.22058189362287522, "epoch": 0.0608, "frac_reward_zero_std": 0.05, "grad_norm": 0.055043403059244156, "kl": 1.480393397808075, "learning_rate": 9.943765449689311e-06, "loss": 0.0006418362259864807, "num_tokens": 15404568.0, "reward": 1.2230416774749755, "reward_std": 0.26340569257736207, "rewards/env_game_reward/mean": 1.2230416774749755, "rewards/env_game_reward/std": 0.3767798840999603, "sampling/importance_sampling_ratio/max": 1.6279995679855346, "sampling/importance_sampling_ratio/mean": 1.014073097705841, "sampling/importance_sampling_ratio/min": 0.5342595517635346, "sampling/sampling_logp_difference/max": 0.5853055238723754, "sampling/sampling_logp_difference/mean": 0.015527874231338501, "step": 760, "step_time": 6.58562049539978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0008620689623057842, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008620689623057842, "completions/clipped_ratio": 0.0, "completions/max_length": 1925.0, "completions/max_terminated_length": 1925.0, "completions/mean_length": 1438.575, "completions/mean_terminated_length": 1438.575, "completions/min_length": 978.4, "completions/min_terminated_length": 978.4, "entropy": 0.1898469567298889, "epoch": 0.0612, "frac_reward_zero_std": 0.1, "grad_norm": 0.04669173061847687, "kl": 1.7866320490837098, "learning_rate": 9.94367019913508e-06, "loss": -0.0005762927234172821, "num_tokens": 15563247.0, "reward": 1.3553274154663086, "reward_std": 0.27411331832408903, "rewards/env_game_reward/mean": 1.3553274154663086, "rewards/env_game_reward/std": 0.3332707077264786, "sampling/importance_sampling_ratio/max": 1.5550270080566406, "sampling/importance_sampling_ratio/mean": 1.0086857557296753, "sampling/importance_sampling_ratio/min": 0.6248692035675049, "sampling/sampling_logp_difference/max": 0.5656398415565491, "sampling/sampling_logp_difference/mean": 0.016817177832126617, "step": 765, "step_time": 6.523562220599706 }, { "clip_ratio/high_max": 0.0034807992400601507, "clip_ratio/high_mean": 0.0017403996200300754, "clip_ratio/low_mean": 0.0008576392603572458, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025980389036703855, "completions/clipped_ratio": 0.0, "completions/max_length": 2061.0, "completions/max_terminated_length": 2061.0, "completions/mean_length": 1509.6, "completions/mean_terminated_length": 1509.6, "completions/min_length": 1017.8, "completions/min_terminated_length": 1017.8, "entropy": 0.11955822706222534, "epoch": 0.0616, "frac_reward_zero_std": 0.1, "grad_norm": 0.10209216922521591, "kl": 1.0451581090688706, "learning_rate": 9.943574293856327e-06, "loss": 0.01794065982103348, "num_tokens": 15726895.0, "reward": 1.4529226541519165, "reward_std": 0.21203944683074952, "rewards/env_game_reward/mean": 1.4529226541519165, "rewards/env_game_reward/std": 0.31403881311416626, "sampling/importance_sampling_ratio/max": 1.901965069770813, "sampling/importance_sampling_ratio/mean": 0.8959596395492554, "sampling/importance_sampling_ratio/min": 0.21005995989909126, "sampling/sampling_logp_difference/max": 4.415722644329071, "sampling/sampling_logp_difference/mean": 0.022534814849495887, "step": 770, "step_time": 8.61529116679958 }, { "clip_ratio/high_max": 0.001954938913695514, "clip_ratio/high_mean": 0.000977469456847757, "clip_ratio/low_mean": 0.0013538596511352807, "clip_ratio/low_min": 0.0003663003677502275, "clip_ratio/region_mean": 0.0023313291429076346, "completions/clipped_ratio": 0.0125, "completions/max_length": 1926.0, "completions/max_terminated_length": 1923.6, "completions/mean_length": 1459.275, "completions/mean_terminated_length": 1450.872509765625, "completions/min_length": 964.2, "completions/min_terminated_length": 964.2, "entropy": 0.12240489572286606, "epoch": 0.062, "frac_reward_zero_std": 0.125, "grad_norm": 0.10321921855211258, "kl": 0.7054299473762512, "learning_rate": 9.943477733869912e-06, "loss": 0.037728136777877806, "num_tokens": 15885232.0, "reward": 1.430791687965393, "reward_std": 0.20286390632390977, "rewards/env_game_reward/mean": 1.430791687965393, "rewards/env_game_reward/std": 0.3058221280574799, "sampling/importance_sampling_ratio/max": 1.7364504098892213, "sampling/importance_sampling_ratio/mean": 0.7705297708511353, "sampling/importance_sampling_ratio/min": 0.07456759945838565, "sampling/sampling_logp_difference/max": 4.6616837739944454, "sampling/sampling_logp_difference/mean": 0.01805206313729286, "step": 775, "step_time": 9.152791061800963 }, { "clip_ratio/high_max": 0.0029415366472676395, "clip_ratio/high_mean": 0.001613421703223139, "clip_ratio/low_mean": 0.002161662018625066, "clip_ratio/low_min": 0.00038369541289284825, "clip_ratio/region_mean": 0.0037750837625935675, "completions/clipped_ratio": 0.0625, "completions/max_length": 2117.8, "completions/max_terminated_length": 2057.8, "completions/mean_length": 1638.35, "completions/mean_terminated_length": 1615.8751708984375, "completions/min_length": 1121.2, "completions/min_terminated_length": 1121.2, "entropy": 0.14629979468882084, "epoch": 0.0624, "frac_reward_zero_std": 0.2, "grad_norm": 0.10632387548685074, "kl": 1.703850120306015, "learning_rate": 9.943380519192805e-06, "loss": 0.05711690783500671, "num_tokens": 16060923.0, "reward": 1.5017619371414184, "reward_std": 0.15968829095363618, "rewards/env_game_reward/mean": 1.5017619371414184, "rewards/env_game_reward/std": 0.2627015709877014, "sampling/importance_sampling_ratio/max": 1.9005746364593505, "sampling/importance_sampling_ratio/mean": 0.852447235584259, "sampling/importance_sampling_ratio/min": 0.006003885331677944, "sampling/sampling_logp_difference/max": 10.413811373710633, "sampling/sampling_logp_difference/mean": 0.0183597469702363, "step": 780, "step_time": 10.805634913398535 }, { "clip_ratio/high_max": 0.002865154389292002, "clip_ratio/high_mean": 0.0015746226534247398, "clip_ratio/low_mean": 0.000639259337913245, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002213882014621049, "completions/clipped_ratio": 0.0125, "completions/max_length": 2010.4, "completions/max_terminated_length": 1945.2, "completions/mean_length": 1429.275, "completions/mean_terminated_length": 1417.6500244140625, "completions/min_length": 1004.0, "completions/min_terminated_length": 1004.0, "entropy": 0.13611191138625145, "epoch": 0.0628, "frac_reward_zero_std": 0.125, "grad_norm": 0.08338388800621033, "kl": 0.7149348020553589, "learning_rate": 9.943282649842098e-06, "loss": -0.01947730779647827, "num_tokens": 16217048.0, "reward": 1.4835833787918091, "reward_std": 0.1830228090286255, "rewards/env_game_reward/mean": 1.4835833787918091, "rewards/env_game_reward/std": 0.2726488709449768, "sampling/importance_sampling_ratio/max": 1.8740410327911377, "sampling/importance_sampling_ratio/mean": 0.9315284371376038, "sampling/importance_sampling_ratio/min": 0.29684333456025164, "sampling/sampling_logp_difference/max": 4.141123366355896, "sampling/sampling_logp_difference/mean": 0.009943006094545126, "step": 785, "step_time": 8.90153669700012 }, { "clip_ratio/high_max": 0.0005181347019970417, "clip_ratio/high_mean": 0.00025906735099852086, "clip_ratio/low_mean": 0.0014250828709919006, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016841501754242926, "completions/clipped_ratio": 0.0, "completions/max_length": 2021.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1568.0, "completions/mean_terminated_length": 1568.0, "completions/min_length": 1089.6, "completions/min_terminated_length": 1089.6, "entropy": 0.1328461952507496, "epoch": 0.0632, "frac_reward_zero_std": 0.225, "grad_norm": 0.07064507901668549, "kl": 0.66820108294487, "learning_rate": 9.943184125834994e-06, "loss": 0.03775088787078858, "num_tokens": 16386132.0, "reward": 1.5388035774230957, "reward_std": 0.13462809175252916, "rewards/env_game_reward/mean": 1.5388035774230957, "rewards/env_game_reward/std": 0.2006644695997238, "sampling/importance_sampling_ratio/max": 1.3795575380325318, "sampling/importance_sampling_ratio/mean": 0.930507481098175, "sampling/importance_sampling_ratio/min": 0.3511299118457078, "sampling/sampling_logp_difference/max": 6.95696439743042, "sampling/sampling_logp_difference/mean": 0.013957394100725651, "step": 790, "step_time": 8.834509170798992 }, { "clip_ratio/high_max": 0.002479099528864026, "clip_ratio/high_mean": 0.001239549764432013, "clip_ratio/low_mean": 0.0016097922343760729, "clip_ratio/low_min": 0.0006060605868697166, "clip_ratio/region_mean": 0.002849341952241957, "completions/clipped_ratio": 0.0125, "completions/max_length": 1934.6, "completions/max_terminated_length": 1934.6, "completions/mean_length": 1502.5125, "completions/mean_terminated_length": 1505.343359375, "completions/min_length": 978.2, "completions/min_terminated_length": 978.2, "entropy": 0.16015452295541763, "epoch": 0.0636, "frac_reward_zero_std": 0.225, "grad_norm": 0.14999434351921082, "kl": 1.1131888061761857, "learning_rate": 9.943084947188814e-06, "loss": -0.012555241584777832, "num_tokens": 16549995.0, "reward": 1.5487381219863892, "reward_std": 0.12269987612962723, "rewards/env_game_reward/mean": 1.5487381219863892, "rewards/env_game_reward/std": 0.21309578120708467, "sampling/importance_sampling_ratio/max": 1.5328397512435914, "sampling/importance_sampling_ratio/mean": 0.9361981153488159, "sampling/importance_sampling_ratio/min": 0.22185008656667407, "sampling/sampling_logp_difference/max": 7.313794112205505, "sampling/sampling_logp_difference/mean": 0.02059700442478061, "step": 795, "step_time": 7.839289776599617 }, { "clip_ratio/high_max": 0.0006250000093132258, "clip_ratio/high_mean": 0.0003125000046566129, "clip_ratio/low_mean": 0.0009224468318279833, "clip_ratio/low_min": 0.0004545454401522875, "clip_ratio/region_mean": 0.0012349468364845962, "completions/clipped_ratio": 0.0, "completions/max_length": 1974.4, "completions/max_terminated_length": 1974.4, "completions/mean_length": 1553.1875, "completions/mean_terminated_length": 1553.1875, "completions/min_length": 1044.8, "completions/min_terminated_length": 1044.8, "entropy": 0.17186126336455346, "epoch": 0.064, "frac_reward_zero_std": 0.075, "grad_norm": 0.11871084570884705, "kl": 1.1377224385738374, "learning_rate": 9.942985113920988e-06, "loss": 0.029259467124938966, "num_tokens": 16717754.0, "reward": 1.536363124847412, "reward_std": 0.1405543178319931, "rewards/env_game_reward/mean": 1.536363124847412, "rewards/env_game_reward/std": 0.16022475957870483, "sampling/importance_sampling_ratio/max": 1.7989203214645386, "sampling/importance_sampling_ratio/mean": 1.0040669798851014, "sampling/importance_sampling_ratio/min": 0.494415277243539, "sampling/sampling_logp_difference/max": 5.424048852920532, "sampling/sampling_logp_difference/mean": 0.013903583586215972, "step": 800, "step_time": 8.159463358601352 }, { "clip_ratio/high_max": 0.0017976711736992002, "clip_ratio/high_mean": 0.0008988355868496001, "clip_ratio/low_mean": 0.0016219173092395067, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002520752896089107, "completions/clipped_ratio": 0.0125, "completions/max_length": 2027.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1557.8875, "completions/mean_terminated_length": 1553.0508544921875, "completions/min_length": 1052.6, "completions/min_terminated_length": 1052.6, "entropy": 0.17057686448097228, "epoch": 0.0644, "frac_reward_zero_std": 0.3, "grad_norm": 0.10110954195261002, "kl": 0.8300488352775574, "learning_rate": 9.942884626049074e-06, "loss": 0.07206388711929321, "num_tokens": 16886207.0, "reward": 1.585892915725708, "reward_std": 0.07475129663944244, "rewards/env_game_reward/mean": 1.585892915725708, "rewards/env_game_reward/std": 0.10948928892612457, "sampling/importance_sampling_ratio/max": 1.4725745916366577, "sampling/importance_sampling_ratio/mean": 0.9733787894248962, "sampling/importance_sampling_ratio/min": 0.46375017166137694, "sampling/sampling_logp_difference/max": 0.5177584171295166, "sampling/sampling_logp_difference/mean": 0.008902581129223108, "step": 805, "step_time": 9.054756587599694 }, { "clip_ratio/high_max": 0.004471949115395546, "clip_ratio/high_mean": 0.002951120538637042, "clip_ratio/low_mean": 0.0007151460275053978, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00366626656614244, "completions/clipped_ratio": 0.0, "completions/max_length": 1901.2, "completions/max_terminated_length": 1901.2, "completions/mean_length": 1397.4125, "completions/mean_terminated_length": 1397.4125, "completions/min_length": 1015.4, "completions/min_terminated_length": 1015.4, "entropy": 0.43061949610710143, "epoch": 0.0648, "frac_reward_zero_std": 0.25, "grad_norm": 0.08036839962005615, "kl": 2.31134774684906, "learning_rate": 9.942783483590728e-06, "loss": 0.010299382358789444, "num_tokens": 17040061.0, "reward": 1.5596429109573364, "reward_std": 0.08182235956192016, "rewards/env_game_reward/mean": 1.5596429109573364, "rewards/env_game_reward/std": 0.13593876510858535, "sampling/importance_sampling_ratio/max": 1.5340835809707642, "sampling/importance_sampling_ratio/mean": 0.9579884886741639, "sampling/importance_sampling_ratio/min": 0.2988631462066783, "sampling/sampling_logp_difference/max": 6.111863076686859, "sampling/sampling_logp_difference/mean": 0.035615741088986394, "step": 810, "step_time": 6.779588287798106 }, { "clip_ratio/high_max": 0.002151205716654658, "clip_ratio/high_mean": 0.001408936199732125, "clip_ratio/low_mean": 0.0037830369081348183, "clip_ratio/low_min": 0.001368796918541193, "clip_ratio/region_mean": 0.005191973014734686, "completions/clipped_ratio": 0.0, "completions/max_length": 1977.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 1473.0375, "completions/mean_terminated_length": 1473.0375, "completions/min_length": 974.4, "completions/min_terminated_length": 974.4, "entropy": 0.40927894711494445, "epoch": 0.0652, "frac_reward_zero_std": 0.225, "grad_norm": 0.07628747820854187, "kl": 2.2936408281326295, "learning_rate": 9.942681686563735e-06, "loss": 0.008698154985904694, "num_tokens": 17200509.0, "reward": 1.5981250286102295, "reward_std": 0.05533111207187176, "rewards/env_game_reward/mean": 1.5981250286102295, "rewards/env_game_reward/std": 0.08176134005188943, "sampling/importance_sampling_ratio/max": 1.5122954607009889, "sampling/importance_sampling_ratio/mean": 0.9499858260154724, "sampling/importance_sampling_ratio/min": 0.2841934680938721, "sampling/sampling_logp_difference/max": 9.145073175430298, "sampling/sampling_logp_difference/mean": 0.07416130937635898, "step": 815, "step_time": 6.956511554600001 }, { "clip_ratio/high_max": 0.0007575757801532746, "clip_ratio/high_mean": 0.0003787878900766373, "clip_ratio/low_mean": 0.0013752276543527841, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017540155444294215, "completions/clipped_ratio": 0.0, "completions/max_length": 1854.4, "completions/max_terminated_length": 1854.4, "completions/mean_length": 1441.1875, "completions/mean_terminated_length": 1441.1875, "completions/min_length": 1054.6, "completions/min_terminated_length": 1054.6, "entropy": 0.34008778631687164, "epoch": 0.0656, "frac_reward_zero_std": 0.225, "grad_norm": 0.07586944103240967, "kl": 2.400510585308075, "learning_rate": 9.942579234985985e-06, "loss": 0.012332186847925187, "num_tokens": 17358344.0, "reward": 1.5910000562667848, "reward_std": 0.07530688047409058, "rewards/env_game_reward/mean": 1.5910000562667848, "rewards/env_game_reward/std": 0.12365454062819481, "sampling/importance_sampling_ratio/max": 1.4079306602478028, "sampling/importance_sampling_ratio/mean": 0.9352385640144348, "sampling/importance_sampling_ratio/min": 0.277198314811837, "sampling/sampling_logp_difference/max": 11.205174541473388, "sampling/sampling_logp_difference/mean": 0.046147267892956735, "step": 820, "step_time": 6.762566551601049 }, { "clip_ratio/high_max": 0.0006944444496184588, "clip_ratio/high_mean": 0.0003472222248092294, "clip_ratio/low_mean": 0.0016576209105551242, "clip_ratio/low_min": 0.000283286115154624, "clip_ratio/region_mean": 0.0020048431353643535, "completions/clipped_ratio": 0.0, "completions/max_length": 1995.4, "completions/max_terminated_length": 1995.4, "completions/mean_length": 1518.4, "completions/mean_terminated_length": 1518.4, "completions/min_length": 1045.2, "completions/min_terminated_length": 1045.2, "entropy": 0.3576410233974457, "epoch": 0.066, "frac_reward_zero_std": 0.15, "grad_norm": 0.10342220216989517, "kl": 2.481777250766754, "learning_rate": 9.942476128875491e-06, "loss": 0.01274801343679428, "num_tokens": 17523295.0, "reward": 1.586625051498413, "reward_std": 0.0768978700041771, "rewards/env_game_reward/mean": 1.586625051498413, "rewards/env_game_reward/std": 0.10694200098514557, "sampling/importance_sampling_ratio/max": 1.6131411075592041, "sampling/importance_sampling_ratio/mean": 0.9216183066368103, "sampling/importance_sampling_ratio/min": 0.1865751624797475, "sampling/sampling_logp_difference/max": 10.270191860198974, "sampling/sampling_logp_difference/mean": 0.049572935700416564, "step": 825, "step_time": 7.239306969800237 }, { "epoch": 0.066, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1889.0, "eval_completions/max_terminated_length": 1889.0, "eval_completions/mean_length": 1637.9166666666667, "eval_completions/mean_terminated_length": 1637.9166666666667, "eval_completions/min_length": 1247.0, "eval_completions/min_terminated_length": 1247.0, "eval_entropy": 0.3013338545958201, "eval_frac_reward_zero_std": 0.08333333333333333, "eval_kl": 4.269202629725139, "eval_loss": 0.009242034517228603, "eval_num_tokens": 17523295.0, "eval_reward": 1.555416742960612, "eval_reward_std": 0.13493956004579863, "eval_rewards/env_game_reward/mean": 1.555416742960612, "eval_rewards/env_game_reward/std": 0.22197324534257254, "eval_runtime": 7.4138, "eval_samples_per_second": 1.349, "eval_sampling/importance_sampling_ratio/max": 1.2260379791259766, "eval_sampling/importance_sampling_ratio/mean": 0.9835768540700277, "eval_sampling/importance_sampling_ratio/min": 0.669548749923706, "eval_sampling/sampling_logp_difference/max": 0.3156093756357829, "eval_sampling/sampling_logp_difference/mean": 0.013049931886295477, "eval_steps_per_second": 0.27, "step": 825 }, { "clip_ratio/high_max": 0.0026498251594603063, "clip_ratio/high_mean": 0.0013249125797301531, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013249125797301531, "completions/clipped_ratio": 0.0, "completions/max_length": 1927.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 1415.7375, "completions/mean_terminated_length": 1415.7375, "completions/min_length": 1012.2, "completions/min_terminated_length": 1012.2, "entropy": 0.30437230616807937, "epoch": 0.0664, "frac_reward_zero_std": 0.225, "grad_norm": 0.0816667228937149, "kl": 2.4667917609214784, "learning_rate": 9.942372368250377e-06, "loss": 0.0189794585108757, "num_tokens": 17678924.0, "reward": 1.5786250352859497, "reward_std": 0.09740396738052368, "rewards/env_game_reward/mean": 1.5786250352859497, "rewards/env_game_reward/std": 0.16835185438394545, "sampling/importance_sampling_ratio/max": 1.5452775001525878, "sampling/importance_sampling_ratio/mean": 0.9819482445716858, "sampling/importance_sampling_ratio/min": 0.520148062706763, "sampling/sampling_logp_difference/max": 4.023156452178955, "sampling/sampling_logp_difference/mean": 0.033988011069595814, "step": 830, "step_time": 6.711538025199843 }, { "clip_ratio/high_max": 0.0032954429741948845, "clip_ratio/high_mean": 0.0016477214870974422, "clip_ratio/low_mean": 0.001202381821349263, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028501033084467053, "completions/clipped_ratio": 0.0, "completions/max_length": 1927.4, "completions/max_terminated_length": 1927.4, "completions/mean_length": 1432.5, "completions/mean_terminated_length": 1432.5, "completions/min_length": 1027.6, "completions/min_terminated_length": 1027.6, "entropy": 0.327534431219101, "epoch": 0.0668, "frac_reward_zero_std": 0.125, "grad_norm": 0.06545510143041611, "kl": 3.1863216876983644, "learning_rate": 9.942267953128885e-06, "loss": 0.011105220019817352, "num_tokens": 17836027.0, "reward": 1.5842679262161254, "reward_std": 0.07740294709801673, "rewards/env_game_reward/mean": 1.5842679262161254, "rewards/env_game_reward/std": 0.11849845722317695, "sampling/importance_sampling_ratio/max": 1.4498932838439942, "sampling/importance_sampling_ratio/mean": 0.9487195491790772, "sampling/importance_sampling_ratio/min": 0.29165607984107556, "sampling/sampling_logp_difference/max": 5.829511404037476, "sampling/sampling_logp_difference/mean": 0.03138357251882553, "step": 835, "step_time": 6.9147914322013095 }, { "clip_ratio/high_max": 0.004935299325734377, "clip_ratio/high_mean": 0.0027280662674456836, "clip_ratio/low_mean": 0.002076923102140427, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00480498936958611, "completions/clipped_ratio": 0.0, "completions/max_length": 1901.4, "completions/max_terminated_length": 1901.4, "completions/mean_length": 1447.075, "completions/mean_terminated_length": 1447.075, "completions/min_length": 1032.4, "completions/min_terminated_length": 1032.4, "entropy": 0.24285254254937172, "epoch": 0.0672, "frac_reward_zero_std": 0.1, "grad_norm": 0.21589381992816925, "kl": 1.4673928499221802, "learning_rate": 9.942162883529365e-06, "loss": -0.01951298713684082, "num_tokens": 17994453.0, "reward": 1.362357211112976, "reward_std": 0.26269018054008486, "rewards/env_game_reward/mean": 1.362357211112976, "rewards/env_game_reward/std": 0.3235168933868408, "sampling/importance_sampling_ratio/max": 1.762507152557373, "sampling/importance_sampling_ratio/mean": 1.026704490184784, "sampling/importance_sampling_ratio/min": 0.6443805754184723, "sampling/sampling_logp_difference/max": 0.44364252090454104, "sampling/sampling_logp_difference/mean": 0.013111063651740551, "step": 840, "step_time": 6.5076123672013635 }, { "clip_ratio/high_max": 0.0037218520883470774, "clip_ratio/high_mean": 0.0018609260441735387, "clip_ratio/low_mean": 0.0017191730381455272, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0035800991114228963, "completions/clipped_ratio": 0.0, "completions/max_length": 2005.8, "completions/max_terminated_length": 2005.8, "completions/mean_length": 1515.8375, "completions/mean_terminated_length": 1515.8375, "completions/min_length": 1026.4, "completions/min_terminated_length": 1026.4, "entropy": 0.3497733250260353, "epoch": 0.0676, "frac_reward_zero_std": 0.075, "grad_norm": 0.09921447187662125, "kl": 1.3188701629638673, "learning_rate": 9.94205715947029e-06, "loss": 0.010336892306804657, "num_tokens": 18158649.0, "reward": 1.3419702529907227, "reward_std": 0.24037423133850097, "rewards/env_game_reward/mean": 1.3419702529907227, "rewards/env_game_reward/std": 0.3400465488433838, "sampling/importance_sampling_ratio/max": 1.7033623218536378, "sampling/importance_sampling_ratio/mean": 0.9735162138938904, "sampling/importance_sampling_ratio/min": 0.48452032804489137, "sampling/sampling_logp_difference/max": 0.5224750876426697, "sampling/sampling_logp_difference/mean": 0.0164469039067626, "step": 845, "step_time": 7.678838656601147 }, { "clip_ratio/high_max": 0.004272268549539149, "clip_ratio/high_mean": 0.0021361342747695743, "clip_ratio/low_mean": 0.0017409127380233259, "clip_ratio/low_min": 0.00020811655558645726, "clip_ratio/region_mean": 0.0038770470418967307, "completions/clipped_ratio": 0.025, "completions/max_length": 2252.8, "completions/max_terminated_length": 2162.0, "completions/mean_length": 1561.925, "completions/mean_terminated_length": 1542.1192138671875, "completions/min_length": 1022.8, "completions/min_terminated_length": 1022.8, "entropy": 0.6017354875802994, "epoch": 0.068, "frac_reward_zero_std": 0.125, "grad_norm": 0.38035619258880615, "kl": 1.1315938919782638, "learning_rate": 9.941950780970246e-06, "loss": 0.08799705505371094, "num_tokens": 18325891.0, "reward": 1.4051547288894652, "reward_std": 0.16788736879825591, "rewards/env_game_reward/mean": 1.4051547288894652, "rewards/env_game_reward/std": 0.27388512790203096, "sampling/importance_sampling_ratio/max": 1.4328656196594238, "sampling/importance_sampling_ratio/mean": 0.8519251465797424, "sampling/importance_sampling_ratio/min": 0.3052798181772232, "sampling/sampling_logp_difference/max": 3.1186322927474976, "sampling/sampling_logp_difference/mean": 0.024020181223750114, "step": 850, "step_time": 11.81878617679904 }, { "clip_ratio/high_max": 0.0009455415420234203, "clip_ratio/high_mean": 0.00047277077101171017, "clip_ratio/low_mean": 0.000978588976431638, "clip_ratio/low_min": 0.00048076924867928027, "clip_ratio/region_mean": 0.0014513597823679447, "completions/clipped_ratio": 0.025, "completions/max_length": 2146.6, "completions/max_terminated_length": 2106.0, "completions/mean_length": 1544.9875, "completions/mean_terminated_length": 1529.2891845703125, "completions/min_length": 1068.0, "completions/min_terminated_length": 1068.0, "entropy": 0.5300098687410355, "epoch": 0.0684, "frac_reward_zero_std": 0.125, "grad_norm": 0.15949228405952454, "kl": 1.1649099051952363, "learning_rate": 9.941843748047931e-06, "loss": 0.032023686170578006, "num_tokens": 18492250.0, "reward": 1.5154583692550658, "reward_std": 0.15400618463754653, "rewards/env_game_reward/mean": 1.5154583692550658, "rewards/env_game_reward/std": 0.20309044122695924, "sampling/importance_sampling_ratio/max": 1.7556475162506104, "sampling/importance_sampling_ratio/mean": 0.9517914056777954, "sampling/importance_sampling_ratio/min": 0.19819140780014094, "sampling/sampling_logp_difference/max": 5.317162609100341, "sampling/sampling_logp_difference/mean": 0.029889048635959627, "step": 855, "step_time": 10.00929546920015 }, { "clip_ratio/high_max": 0.002626922621857375, "clip_ratio/high_mean": 0.0013134613109286875, "clip_ratio/low_mean": 0.0014400507032405585, "clip_ratio/low_min": 0.0005416349973529577, "clip_ratio/region_mean": 0.0027535120607353747, "completions/clipped_ratio": 0.025, "completions/max_length": 1999.2, "completions/max_terminated_length": 1999.2, "completions/mean_length": 1433.7875, "completions/mean_terminated_length": 1430.055029296875, "completions/min_length": 1025.0, "completions/min_terminated_length": 1025.0, "entropy": 0.5983447372913361, "epoch": 0.0688, "frac_reward_zero_std": 0.2, "grad_norm": 0.17352095246315002, "kl": 1.2142629444599151, "learning_rate": 9.94173606072216e-06, "loss": 0.03372189402580261, "num_tokens": 18648334.0, "reward": 1.4906190633773804, "reward_std": 0.16036173179745675, "rewards/env_game_reward/mean": 1.4906190633773804, "rewards/env_game_reward/std": 0.23361046463251114, "sampling/importance_sampling_ratio/max": 1.8726997137069703, "sampling/importance_sampling_ratio/mean": 0.9728065609931946, "sampling/importance_sampling_ratio/min": 0.2550904452800751, "sampling/sampling_logp_difference/max": 0.7147324085235596, "sampling/sampling_logp_difference/mean": 0.021650590375065802, "step": 860, "step_time": 8.624581174000923 }, { "clip_ratio/high_max": 0.0015781309455633163, "clip_ratio/high_mean": 0.0007890654727816582, "clip_ratio/low_mean": 0.0009416018030606211, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017306672758422791, "completions/clipped_ratio": 0.025, "completions/max_length": 1994.2, "completions/max_terminated_length": 1990.6, "completions/mean_length": 1527.225, "completions/mean_terminated_length": 1523.9850341796875, "completions/min_length": 1055.0, "completions/min_terminated_length": 1055.0, "entropy": 0.4782690331339836, "epoch": 0.0692, "frac_reward_zero_std": 0.2, "grad_norm": 0.08745373040437698, "kl": 0.9816187143325805, "learning_rate": 9.941627719011864e-06, "loss": 0.003973131626844406, "num_tokens": 18813877.0, "reward": 1.4740357398986816, "reward_std": 0.20445487946271895, "rewards/env_game_reward/mean": 1.4740357398986816, "rewards/env_game_reward/std": 0.29361526668071747, "sampling/importance_sampling_ratio/max": 1.4921527624130249, "sampling/importance_sampling_ratio/mean": 0.9029739260673523, "sampling/importance_sampling_ratio/min": 0.15594935725966935, "sampling/sampling_logp_difference/max": 2.1523058891296385, "sampling/sampling_logp_difference/mean": 0.01937628984451294, "step": 865, "step_time": 8.709527614199033 }, { "clip_ratio/high_max": 0.00011587485205382109, "clip_ratio/high_mean": 5.7937426026910545e-05, "clip_ratio/low_mean": 0.0009105266013648361, "clip_ratio/low_min": 8.110299822874367e-05, "clip_ratio/region_mean": 0.0009684640273917467, "completions/clipped_ratio": 0.05, "completions/max_length": 2151.6, "completions/max_terminated_length": 2120.2, "completions/mean_length": 1570.3, "completions/mean_terminated_length": 1552.0944580078126, "completions/min_length": 1056.4, "completions/min_terminated_length": 1056.4, "entropy": 0.395656031370163, "epoch": 0.0696, "frac_reward_zero_std": 0.225, "grad_norm": 0.145682230591774, "kl": 0.5095479518175126, "learning_rate": 9.94151872293609e-06, "loss": 0.13460533618927, "num_tokens": 18982505.0, "reward": 1.5383512496948242, "reward_std": 0.1430460289120674, "rewards/env_game_reward/mean": 1.5383512496948242, "rewards/env_game_reward/std": 0.22576954811811448, "sampling/importance_sampling_ratio/max": 1.6490776062011718, "sampling/importance_sampling_ratio/mean": 0.9158960461616517, "sampling/importance_sampling_ratio/min": 0.18876880555620987, "sampling/sampling_logp_difference/max": 5.222516059875488, "sampling/sampling_logp_difference/mean": 0.013656648620963097, "step": 870, "step_time": 11.048820288398565 }, { "clip_ratio/high_max": 0.0011350326240062714, "clip_ratio/high_mean": 0.0005675163120031357, "clip_ratio/low_mean": 0.0004305612586904317, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009980775532312692, "completions/clipped_ratio": 0.125, "completions/max_length": 2096.6, "completions/max_terminated_length": 2062.8, "completions/mean_length": 1591.075, "completions/mean_terminated_length": 1555.29052734375, "completions/min_length": 1070.6, "completions/min_terminated_length": 1070.6, "entropy": 0.4600535213947296, "epoch": 0.07, "frac_reward_zero_std": 0.225, "grad_norm": 0.2988151013851166, "kl": 0.5012727200984954, "learning_rate": 9.941409072513995e-06, "loss": 0.09155293107032776, "num_tokens": 19152822.0, "reward": 1.537291669845581, "reward_std": 0.14348376393318177, "rewards/env_game_reward/mean": 1.537291669845581, "rewards/env_game_reward/std": 0.21587826758623124, "sampling/importance_sampling_ratio/max": 1.8174469709396361, "sampling/importance_sampling_ratio/mean": 0.8739750266075135, "sampling/importance_sampling_ratio/min": 0.1697855882086297, "sampling/sampling_logp_difference/max": 7.956750082969665, "sampling/sampling_logp_difference/mean": 0.019922800362110138, "step": 875, "step_time": 10.96744642199992 }, { "clip_ratio/high_max": 0.0012740743928588926, "clip_ratio/high_mean": 0.0006370371964294463, "clip_ratio/low_mean": 0.0002832013618899509, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009202385641401634, "completions/clipped_ratio": 0.0875, "completions/max_length": 2273.2, "completions/max_terminated_length": 2243.0, "completions/mean_length": 1785.6375, "completions/mean_terminated_length": 1753.2542236328125, "completions/min_length": 1217.2, "completions/min_terminated_length": 1217.2, "entropy": 0.4493249997496605, "epoch": 0.0704, "frac_reward_zero_std": 0.275, "grad_norm": 0.15489841997623444, "kl": 0.4825821310281754, "learning_rate": 9.941298767764855e-06, "loss": 0.11126923561096191, "num_tokens": 19340994.0, "reward": 1.497991108894348, "reward_std": 0.18598171770572663, "rewards/env_game_reward/mean": 1.497991108894348, "rewards/env_game_reward/std": 0.3028584122657776, "sampling/importance_sampling_ratio/max": 1.7850000619888307, "sampling/importance_sampling_ratio/mean": 0.9235016465187073, "sampling/importance_sampling_ratio/min": 0.24714634816055997, "sampling/sampling_logp_difference/max": 7.065726852416992, "sampling/sampling_logp_difference/mean": 0.015392303094267845, "step": 880, "step_time": 14.431479327999114 }, { "clip_ratio/high_max": 0.0020738797960802914, "clip_ratio/high_mean": 0.0011360907810740173, "clip_ratio/low_mean": 0.00042371204181108625, "clip_ratio/low_min": 0.00013386880746111274, "clip_ratio/region_mean": 0.0015598028141539544, "completions/clipped_ratio": 0.05, "completions/max_length": 2470.6, "completions/max_terminated_length": 2435.6, "completions/mean_length": 1883.7125, "completions/mean_terminated_length": 1864.0649658203124, "completions/min_length": 1284.2, "completions/min_terminated_length": 1284.2, "entropy": 0.307071553170681, "epoch": 0.0708, "frac_reward_zero_std": 0.1, "grad_norm": 0.1149914562702179, "kl": 22.70539956986904, "learning_rate": 9.94118780870806e-06, "loss": 0.2994292497634888, "num_tokens": 19534798.0, "reward": 1.5454092025756836, "reward_std": 0.21367461532354354, "rewards/env_game_reward/mean": 1.5454092025756836, "rewards/env_game_reward/std": 0.29595637023448945, "sampling/importance_sampling_ratio/max": 1.624759316444397, "sampling/importance_sampling_ratio/mean": 0.8059692859649659, "sampling/importance_sampling_ratio/min": 0.06373572412819155, "sampling/sampling_logp_difference/max": 7.899578714370728, "sampling/sampling_logp_difference/mean": 0.014438807778060437, "step": 885, "step_time": 15.077695120799035 }, { "clip_ratio/high_max": 0.0012833183922339232, "clip_ratio/high_mean": 0.000692523896577768, "clip_ratio/low_mean": 0.000726046136696823, "clip_ratio/low_min": 0.0001866494829300791, "clip_ratio/region_mean": 0.0014185700129019096, "completions/clipped_ratio": 0.075, "completions/max_length": 2363.6, "completions/max_terminated_length": 2356.4, "completions/mean_length": 1824.9, "completions/mean_terminated_length": 1800.4438232421876, "completions/min_length": 1215.6, "completions/min_terminated_length": 1215.6, "entropy": 0.4529956132173538, "epoch": 0.0712, "frac_reward_zero_std": 0.225, "grad_norm": 0.28981900215148926, "kl": 0.4304444819688797, "learning_rate": 9.941076195363116e-06, "loss": 0.01028164178133011, "num_tokens": 19724009.0, "reward": 1.5794018268585206, "reward_std": 0.1288491576910019, "rewards/env_game_reward/mean": 1.5794018268585206, "rewards/env_game_reward/std": 0.21744558215141296, "sampling/importance_sampling_ratio/max": 1.9161789178848267, "sampling/importance_sampling_ratio/mean": 0.904331374168396, "sampling/importance_sampling_ratio/min": 3.2518115607157076e-05, "sampling/sampling_logp_difference/max": 14.280098342895508, "sampling/sampling_logp_difference/mean": 0.019957776367664336, "step": 890, "step_time": 15.076246012999036 }, { "clip_ratio/high_max": 0.001930954516865313, "clip_ratio/high_mean": 0.0009654772584326565, "clip_ratio/low_mean": 0.0008705812797416002, "clip_ratio/low_min": 0.00018939394503831864, "clip_ratio/region_mean": 0.0018360585323534905, "completions/clipped_ratio": 0.0375, "completions/max_length": 2446.4, "completions/max_terminated_length": 2442.8, "completions/mean_length": 1835.6875, "completions/mean_terminated_length": 1819.96455078125, "completions/min_length": 1285.4, "completions/min_terminated_length": 1285.4, "entropy": 0.26241480112075805, "epoch": 0.0716, "frac_reward_zero_std": 0.075, "grad_norm": 0.2046409249305725, "kl": 0.4811114311218262, "learning_rate": 9.940963927749643e-06, "loss": 0.12879633903503418, "num_tokens": 19914187.0, "reward": 1.4908958196640014, "reward_std": 0.27980804443359375, "rewards/env_game_reward/mean": 1.4908958196640014, "rewards/env_game_reward/std": 0.3526145279407501, "sampling/importance_sampling_ratio/max": 1.8963224649429322, "sampling/importance_sampling_ratio/mean": 0.9690939664840699, "sampling/importance_sampling_ratio/min": 0.30016586495963227, "sampling/sampling_logp_difference/max": 2.3270081281661987, "sampling/sampling_logp_difference/mean": 0.01102782627567649, "step": 895, "step_time": 12.819816202801302 }, { "clip_ratio/high_max": 0.0008085624547675252, "clip_ratio/high_mean": 0.0005102588038425893, "clip_ratio/low_mean": 0.0005037825962062925, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010140414000488819, "completions/clipped_ratio": 0.025, "completions/max_length": 2384.2, "completions/max_terminated_length": 2384.2, "completions/mean_length": 1735.475, "completions/mean_terminated_length": 1735.693359375, "completions/min_length": 1147.6, "completions/min_terminated_length": 1147.6, "entropy": 0.3156480178236961, "epoch": 0.072, "frac_reward_zero_std": 0.075, "grad_norm": 0.2748168706893921, "kl": 0.5220715075731277, "learning_rate": 9.940851005887376e-06, "loss": -0.02894388735294342, "num_tokens": 20095788.0, "reward": 1.4607172727584838, "reward_std": 0.2872200310230255, "rewards/env_game_reward/mean": 1.4607172727584838, "rewards/env_game_reward/std": 0.3808079123497009, "sampling/importance_sampling_ratio/max": 2.075653338432312, "sampling/importance_sampling_ratio/mean": 0.9483648419380188, "sampling/importance_sampling_ratio/min": 0.11939566901285976, "sampling/sampling_logp_difference/max": 6.860753440856934, "sampling/sampling_logp_difference/mean": 0.015069704689085483, "step": 900, "step_time": 12.859525776599913 }, { "epoch": 0.072, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.041666666666666664, "eval_completions/max_length": 2727.0, "eval_completions/max_terminated_length": 2307.3333333333335, "eval_completions/mean_length": 2059.0, "eval_completions/mean_terminated_length": 1995.732177734375, "eval_completions/min_length": 1542.0, "eval_completions/min_terminated_length": 1542.0, "eval_entropy": 0.2817843755086263, "eval_frac_reward_zero_std": 0.16666666666666666, "eval_kl": 0.3643878698348999, "eval_loss": -0.0993538573384285, "eval_num_tokens": 20095788.0, "eval_reward": 1.457232157389323, "eval_reward_std": 0.28440003593762714, "eval_rewards/env_game_reward/mean": 1.457232157389323, "eval_rewards/env_game_reward/std": 0.3733518322308858, "eval_runtime": 19.7318, "eval_samples_per_second": 0.507, "eval_sampling/importance_sampling_ratio/max": 1.6719778378804524, "eval_sampling/importance_sampling_ratio/mean": 1.0260129968325298, "eval_sampling/importance_sampling_ratio/min": 0.3794739445050557, "eval_sampling/sampling_logp_difference/max": 0.3471987247467041, "eval_sampling/sampling_logp_difference/mean": 0.007802010824282964, "eval_steps_per_second": 0.101, "step": 900 }, { "clip_ratio/high_max": 0.0012189357890747487, "clip_ratio/high_mean": 0.0007577869633678347, "clip_ratio/low_mean": 0.0006541514245327562, "clip_ratio/low_min": 0.0003207181231118739, "clip_ratio/region_mean": 0.0014119383762590588, "completions/clipped_ratio": 0.05, "completions/max_length": 2461.6, "completions/max_terminated_length": 2448.6, "completions/mean_length": 1848.575, "completions/mean_terminated_length": 1824.4354248046875, "completions/min_length": 1311.6, "completions/min_terminated_length": 1311.6, "entropy": 0.3148694097995758, "epoch": 0.0724, "frac_reward_zero_std": 0.125, "grad_norm": 0.19136394560337067, "kl": 0.43292770683765414, "learning_rate": 9.940737429796167e-06, "loss": -0.0005514532327651978, "num_tokens": 20286731.0, "reward": 1.5534344911575317, "reward_std": 0.164444400370121, "rewards/env_game_reward/mean": 1.5534344911575317, "rewards/env_game_reward/std": 0.25763210356235505, "sampling/importance_sampling_ratio/max": 1.920850896835327, "sampling/importance_sampling_ratio/mean": 0.9820604085922241, "sampling/importance_sampling_ratio/min": 0.36941553354263307, "sampling/sampling_logp_difference/max": 3.685489463806152, "sampling/sampling_logp_difference/mean": 0.012805731128901243, "step": 905, "step_time": 13.696737619400665 }, { "clip_ratio/high_max": 0.0015069938148371876, "clip_ratio/high_mean": 0.000920184439746663, "clip_ratio/low_mean": 0.00035937174980062994, "clip_ratio/low_min": 0.00021658835466951132, "clip_ratio/region_mean": 0.001279556195368059, "completions/clipped_ratio": 0.0375, "completions/max_length": 2569.6, "completions/max_terminated_length": 2511.4, "completions/mean_length": 1820.3, "completions/mean_terminated_length": 1806.58505859375, "completions/min_length": 1275.2, "completions/min_terminated_length": 1275.2, "entropy": 0.33324653208255767, "epoch": 0.0728, "frac_reward_zero_std": 0.05, "grad_norm": 0.1512303650379181, "kl": 0.39021323025226595, "learning_rate": 9.940623199495979e-06, "loss": 0.16941314935684204, "num_tokens": 20474836.0, "reward": 1.5158392906188964, "reward_std": 0.2332189589738846, "rewards/env_game_reward/mean": 1.5158392906188964, "rewards/env_game_reward/std": 0.31163732558488844, "sampling/importance_sampling_ratio/max": 2.071822476387024, "sampling/importance_sampling_ratio/mean": 0.8877307176589966, "sampling/importance_sampling_ratio/min": 0.07263679296009132, "sampling/sampling_logp_difference/max": 11.022346949577331, "sampling/sampling_logp_difference/mean": 0.01563575156033039, "step": 910, "step_time": 15.462018063198775 }, { "clip_ratio/high_max": 0.001472691132221371, "clip_ratio/high_mean": 0.000864570785779506, "clip_ratio/low_mean": 0.0009776385850273073, "clip_ratio/low_min": 0.00032047065906226635, "clip_ratio/region_mean": 0.001842209347523749, "completions/clipped_ratio": 0.0375, "completions/max_length": 2470.0, "completions/max_terminated_length": 2431.2, "completions/mean_length": 1867.8875, "completions/mean_terminated_length": 1865.7714599609376, "completions/min_length": 1331.2, "completions/min_terminated_length": 1331.2, "entropy": 0.36800833940505984, "epoch": 0.0732, "frac_reward_zero_std": 0.1, "grad_norm": 0.17658759653568268, "kl": 0.46147061288356783, "learning_rate": 9.940508315006892e-06, "loss": 0.03878949880599976, "num_tokens": 20666662.0, "reward": 1.5093273878097535, "reward_std": 0.2326802283525467, "rewards/env_game_reward/mean": 1.5093273878097535, "rewards/env_game_reward/std": 0.32178205251693726, "sampling/importance_sampling_ratio/max": 1.9555179595947265, "sampling/importance_sampling_ratio/mean": 0.8527028918266296, "sampling/importance_sampling_ratio/min": 0.033528552153945776, "sampling/sampling_logp_difference/max": 10.650108671188354, "sampling/sampling_logp_difference/mean": 0.022430221550166607, "step": 915, "step_time": 14.398402365599031 }, { "clip_ratio/high_max": 0.0011677380418404937, "clip_ratio/high_mean": 0.0007100923685356975, "clip_ratio/low_mean": 0.0008342034270754084, "clip_ratio/low_min": 0.0001377410488203168, "clip_ratio/region_mean": 0.001544295810163021, "completions/clipped_ratio": 0.0875, "completions/max_length": 2436.8, "completions/max_terminated_length": 2436.8, "completions/mean_length": 1821.1875, "completions/mean_terminated_length": 1790.2981689453125, "completions/min_length": 1247.4, "completions/min_terminated_length": 1247.4, "entropy": 0.31378189474344254, "epoch": 0.0736, "frac_reward_zero_std": 0.2, "grad_norm": 0.2656424045562744, "kl": 0.40927897691726683, "learning_rate": 9.940392776349104e-06, "loss": 0.1518398642539978, "num_tokens": 20854145.0, "reward": 1.5011369228363036, "reward_std": 0.19547294974327087, "rewards/env_game_reward/mean": 1.5011369228363036, "rewards/env_game_reward/std": 0.3228561282157898, "sampling/importance_sampling_ratio/max": 1.8403097152709962, "sampling/importance_sampling_ratio/mean": 0.8172707557678223, "sampling/importance_sampling_ratio/min": 0.05242107790071242, "sampling/sampling_logp_difference/max": 10.830488920211792, "sampling/sampling_logp_difference/mean": 0.01946103759109974, "step": 920, "step_time": 15.806952799401916 }, { "clip_ratio/high_max": 0.002922432462219149, "clip_ratio/high_mean": 0.0016289437422528862, "clip_ratio/low_mean": 0.0005936998728429899, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002222643652930856, "completions/clipped_ratio": 0.0375, "completions/max_length": 2578.0, "completions/max_terminated_length": 2346.2, "completions/mean_length": 1863.5625, "completions/mean_terminated_length": 1825.928271484375, "completions/min_length": 1254.6, "completions/min_terminated_length": 1254.6, "entropy": 0.2191880613565445, "epoch": 0.074, "frac_reward_zero_std": 0.175, "grad_norm": 0.17762354016304016, "kl": 0.4934216499328613, "learning_rate": 9.940276583542922e-06, "loss": 0.16184706687927247, "num_tokens": 21046543.0, "reward": 1.5385030031204223, "reward_std": 0.19008124768733978, "rewards/env_game_reward/mean": 1.5385030031204223, "rewards/env_game_reward/std": 0.3026488572359085, "sampling/importance_sampling_ratio/max": 1.5807328701019288, "sampling/importance_sampling_ratio/mean": 0.9147717595100403, "sampling/importance_sampling_ratio/min": 0.14841571303332562, "sampling/sampling_logp_difference/max": 7.422147178649903, "sampling/sampling_logp_difference/mean": 0.011565564665943384, "step": 925, "step_time": 13.949015424399112 }, { "clip_ratio/high_max": 0.001220880087930709, "clip_ratio/high_mean": 0.0006884431757498532, "clip_ratio/low_mean": 0.0010157813434489072, "clip_ratio/low_min": 0.0006492685759440064, "clip_ratio/region_mean": 0.0017042245075572283, "completions/clipped_ratio": 0.0125, "completions/max_length": 2307.2, "completions/max_terminated_length": 2307.2, "completions/mean_length": 1872.1625, "completions/mean_terminated_length": 1870.493359375, "completions/min_length": 1372.4, "completions/min_terminated_length": 1372.4, "entropy": 0.3217353641986847, "epoch": 0.0744, "frac_reward_zero_std": 0.125, "grad_norm": 0.1309574842453003, "kl": 0.5329782962799072, "learning_rate": 9.940159736608773e-06, "loss": 0.003825952857732773, "num_tokens": 21240177.0, "reward": 1.5249047756195069, "reward_std": 0.1893110007047653, "rewards/env_game_reward/mean": 1.5249047756195069, "rewards/env_game_reward/std": 0.3156297117471695, "sampling/importance_sampling_ratio/max": 1.7126131534576416, "sampling/importance_sampling_ratio/mean": 0.8812451601028443, "sampling/importance_sampling_ratio/min": 0.09435489480078894, "sampling/sampling_logp_difference/max": 12.496303129196168, "sampling/sampling_logp_difference/mean": 0.022152267023921014, "step": 930, "step_time": 12.41166779679843 }, { "clip_ratio/high_max": 0.0024893431225791575, "clip_ratio/high_mean": 0.0015371947665698826, "clip_ratio/low_mean": 0.000855160562787205, "clip_ratio/low_min": 0.00016750418581068516, "clip_ratio/region_mean": 0.0023923553293570877, "completions/clipped_ratio": 0.025, "completions/max_length": 2308.8, "completions/max_terminated_length": 2303.8, "completions/mean_length": 1734.975, "completions/mean_terminated_length": 1722.290869140625, "completions/min_length": 1180.6, "completions/min_terminated_length": 1180.6, "entropy": 0.3756895184516907, "epoch": 0.0748, "frac_reward_zero_std": 0.125, "grad_norm": 0.22695200145244598, "kl": 0.57938232421875, "learning_rate": 9.940042235567198e-06, "loss": 0.06497732996940613, "num_tokens": 21421312.0, "reward": 1.5562530040740967, "reward_std": 0.17508048713207244, "rewards/env_game_reward/mean": 1.5562530040740967, "rewards/env_game_reward/std": 0.2580333352088928, "sampling/importance_sampling_ratio/max": 1.9110926389694214, "sampling/importance_sampling_ratio/mean": 0.9796088457107544, "sampling/importance_sampling_ratio/min": 0.1705502788976498, "sampling/sampling_logp_difference/max": 7.984076070785522, "sampling/sampling_logp_difference/mean": 0.01644737496972084, "step": 935, "step_time": 11.930417591598962 }, { "clip_ratio/high_max": 0.003084265359211713, "clip_ratio/high_mean": 0.0018833025882486255, "clip_ratio/low_mean": 0.0007503544562496245, "clip_ratio/low_min": 0.00014577260008081795, "clip_ratio/region_mean": 0.00263365704449825, "completions/clipped_ratio": 0.025, "completions/max_length": 2340.0, "completions/max_terminated_length": 2340.0, "completions/mean_length": 1727.275, "completions/mean_terminated_length": 1716.6592041015624, "completions/min_length": 1215.2, "completions/min_terminated_length": 1215.2, "entropy": 0.31620822846889496, "epoch": 0.0752, "frac_reward_zero_std": 0.0, "grad_norm": 0.24355845153331757, "kl": 0.6145551562309265, "learning_rate": 9.939924080438852e-06, "loss": 0.11798319816589356, "num_tokens": 21601758.0, "reward": 1.5743244171142579, "reward_std": 0.15734387189149857, "rewards/env_game_reward/mean": 1.5743244171142579, "rewards/env_game_reward/std": 0.22823202311992646, "sampling/importance_sampling_ratio/max": 2.0933955669403077, "sampling/importance_sampling_ratio/mean": 1.0007439851760864, "sampling/importance_sampling_ratio/min": 0.3973370999097824, "sampling/sampling_logp_difference/max": 0.4600968599319458, "sampling/sampling_logp_difference/mean": 0.011099493503570557, "step": 940, "step_time": 11.759782355798233 }, { "clip_ratio/high_max": 0.001626867160666734, "clip_ratio/high_mean": 0.0009445225528907031, "clip_ratio/low_mean": 0.0014562467520590871, "clip_ratio/low_min": 0.00011627906933426857, "clip_ratio/region_mean": 0.00240076927584596, "completions/clipped_ratio": 0.0125, "completions/max_length": 2321.2, "completions/max_terminated_length": 2321.2, "completions/mean_length": 1699.95, "completions/mean_terminated_length": 1694.1233642578125, "completions/min_length": 1174.8, "completions/min_terminated_length": 1174.8, "entropy": 0.2677352011203766, "epoch": 0.0756, "frac_reward_zero_std": 0.125, "grad_norm": 0.15144741535186768, "kl": 0.6979381173849106, "learning_rate": 9.939805271244503e-06, "loss": 0.08515205383300781, "num_tokens": 21781002.0, "reward": 1.4935282707214355, "reward_std": 0.18328165709972383, "rewards/env_game_reward/mean": 1.4935282707214355, "rewards/env_game_reward/std": 0.2772886216640472, "sampling/importance_sampling_ratio/max": 1.7501282215118408, "sampling/importance_sampling_ratio/mean": 0.8283731698989868, "sampling/importance_sampling_ratio/min": 0.06416170225804453, "sampling/sampling_logp_difference/max": 12.058596181869508, "sampling/sampling_logp_difference/mean": 0.016518401354551314, "step": 945, "step_time": 12.266804081601004 }, { "clip_ratio/high_max": 0.0045323959086090325, "clip_ratio/high_mean": 0.0026198809733614325, "clip_ratio/low_mean": 0.0007645079400390387, "clip_ratio/low_min": 0.0003045685356482863, "clip_ratio/region_mean": 0.0033843889308627696, "completions/clipped_ratio": 0.05, "completions/max_length": 2202.6, "completions/max_terminated_length": 2183.0, "completions/mean_length": 1666.2375, "completions/mean_terminated_length": 1642.8640625, "completions/min_length": 1118.0, "completions/min_terminated_length": 1118.0, "entropy": 0.2549864038825035, "epoch": 0.076, "frac_reward_zero_std": 0.1, "grad_norm": 0.09294123202562332, "kl": 0.7737713813781738, "learning_rate": 9.939685808005038e-06, "loss": -0.01393904983997345, "num_tokens": 21955811.0, "reward": 1.477282738685608, "reward_std": 0.21622737050056456, "rewards/env_game_reward/mean": 1.477282738685608, "rewards/env_game_reward/std": 0.34079847633838656, "sampling/importance_sampling_ratio/max": 1.767937183380127, "sampling/importance_sampling_ratio/mean": 0.9298573493957519, "sampling/importance_sampling_ratio/min": 0.14373855862080057, "sampling/sampling_logp_difference/max": 9.330408585071563, "sampling/sampling_logp_difference/mean": 0.014421308785676957, "step": 950, "step_time": 11.463380426800722 }, { "clip_ratio/high_max": 0.005439925438258797, "clip_ratio/high_mean": 0.003208035236457363, "clip_ratio/low_mean": 0.0033877444453537463, "clip_ratio/low_min": 0.0006987016880884766, "clip_ratio/region_mean": 0.006595779792405665, "completions/clipped_ratio": 0.0125, "completions/max_length": 2315.6, "completions/max_terminated_length": 2270.6, "completions/mean_length": 1716.55, "completions/mean_terminated_length": 1707.9341796875, "completions/min_length": 1110.8, "completions/min_terminated_length": 1110.8, "entropy": 0.3051726818084717, "epoch": 0.0764, "frac_reward_zero_std": 0.05, "grad_norm": 0.10971368849277496, "kl": 0.9875833690166473, "learning_rate": 9.939565690741458e-06, "loss": 0.011976981163024902, "num_tokens": 22135718.0, "reward": 1.409833312034607, "reward_std": 0.2909154981374741, "rewards/env_game_reward/mean": 1.409833312034607, "rewards/env_game_reward/std": 0.45696545839309693, "sampling/importance_sampling_ratio/max": 1.5724833250045775, "sampling/importance_sampling_ratio/mean": 0.781300175189972, "sampling/importance_sampling_ratio/min": 3.93051331259997e-06, "sampling/sampling_logp_difference/max": 12.314488220214844, "sampling/sampling_logp_difference/mean": 0.04921445026993752, "step": 955, "step_time": 10.660027011402416 }, { "clip_ratio/high_max": 0.0050688618794083595, "clip_ratio/high_mean": 0.0030160827096551656, "clip_ratio/low_mean": 0.0024181493849027903, "clip_ratio/low_min": 0.001296406053006649, "clip_ratio/region_mean": 0.005434232112020254, "completions/clipped_ratio": 0.0125, "completions/max_length": 2306.6, "completions/max_terminated_length": 2299.2, "completions/mean_length": 1687.0375, "completions/mean_terminated_length": 1679.5216796875, "completions/min_length": 1124.2, "completions/min_terminated_length": 1124.2, "entropy": 0.2659656837582588, "epoch": 0.0768, "frac_reward_zero_std": 0.075, "grad_norm": 0.07717063277959824, "kl": 0.867502224445343, "learning_rate": 9.939444919474875e-06, "loss": 0.03871417641639709, "num_tokens": 22313392.0, "reward": 1.432592248916626, "reward_std": 0.2927884966135025, "rewards/env_game_reward/mean": 1.432592248916626, "rewards/env_game_reward/std": 0.41850276589393615, "sampling/importance_sampling_ratio/max": 1.9147189617156983, "sampling/importance_sampling_ratio/mean": 0.6610526800155639, "sampling/importance_sampling_ratio/min": 6.190264483464527e-11, "sampling/sampling_logp_difference/max": 19.166898345947267, "sampling/sampling_logp_difference/mean": 0.08357204496860504, "step": 960, "step_time": 11.269645843000763 }, { "clip_ratio/high_max": 0.00402466943487525, "clip_ratio/high_mean": 0.002358415606431663, "clip_ratio/low_mean": 0.0015746165940072388, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0039330321713350715, "completions/clipped_ratio": 0.0, "completions/max_length": 2263.4, "completions/max_terminated_length": 2263.4, "completions/mean_length": 1771.7, "completions/mean_terminated_length": 1771.7, "completions/min_length": 1157.8, "completions/min_terminated_length": 1157.8, "entropy": 0.3173187494277954, "epoch": 0.0772, "frac_reward_zero_std": 0.2, "grad_norm": 0.08260554075241089, "kl": 0.7630613714456558, "learning_rate": 9.939323494226523e-06, "loss": -0.0028585586696863174, "num_tokens": 22499063.0, "reward": 1.5061264991760255, "reward_std": 0.22805666625499726, "rewards/env_game_reward/mean": 1.5061264991760255, "rewards/env_game_reward/std": 0.37998464703559875, "sampling/importance_sampling_ratio/max": 1.352769374847412, "sampling/importance_sampling_ratio/mean": 0.7304522514343261, "sampling/importance_sampling_ratio/min": 0.05258167387152959, "sampling/sampling_logp_difference/max": 17.498226356506347, "sampling/sampling_logp_difference/mean": 0.06935336329042911, "step": 965, "step_time": 10.030467351600965 }, { "clip_ratio/high_max": 0.0029599617468193174, "clip_ratio/high_mean": 0.0017886228044517339, "clip_ratio/low_mean": 0.002303900709375739, "clip_ratio/low_min": 0.0009720305213704705, "clip_ratio/region_mean": 0.004092523554572835, "completions/clipped_ratio": 0.0, "completions/max_length": 2316.2, "completions/max_terminated_length": 2316.2, "completions/mean_length": 1650.2125, "completions/mean_terminated_length": 1650.2125, "completions/min_length": 1100.6, "completions/min_terminated_length": 1100.6, "entropy": 0.31040072441101074, "epoch": 0.0776, "frac_reward_zero_std": 0.075, "grad_norm": 0.17387092113494873, "kl": 0.7710592210292816, "learning_rate": 9.939201415017744e-06, "loss": -0.057826101779937744, "num_tokens": 22674597.0, "reward": 1.4797053575515746, "reward_std": 0.27919352650642393, "rewards/env_game_reward/mean": 1.4797053575515746, "rewards/env_game_reward/std": 0.37522571682929995, "sampling/importance_sampling_ratio/max": 1.5621517896652222, "sampling/importance_sampling_ratio/mean": 0.6350171446800232, "sampling/importance_sampling_ratio/min": 1.4516258057226337e-12, "sampling/sampling_logp_difference/max": 24.295112991333006, "sampling/sampling_logp_difference/mean": 0.09522517807781697, "step": 970, "step_time": 10.581186058397725 }, { "clip_ratio/high_max": 0.0035527752072084693, "clip_ratio/high_mean": 0.0019544816430425273, "clip_ratio/low_mean": 0.0014669578959001228, "clip_ratio/low_min": 0.00055584826041013, "clip_ratio/region_mean": 0.0034214395738672463, "completions/clipped_ratio": 0.05, "completions/max_length": 2394.8, "completions/max_terminated_length": 2261.6, "completions/mean_length": 1739.1625, "completions/mean_terminated_length": 1705.8567138671874, "completions/min_length": 1138.8, "completions/min_terminated_length": 1138.8, "entropy": 0.450977149605751, "epoch": 0.078, "frac_reward_zero_std": 0.075, "grad_norm": 0.13805274665355682, "kl": 0.7013565450906754, "learning_rate": 9.93907868187e-06, "loss": -0.0027586638927459715, "num_tokens": 22857731.0, "reward": 1.4618571519851684, "reward_std": 0.28324676156044004, "rewards/env_game_reward/mean": 1.4618571519851684, "rewards/env_game_reward/std": 0.3617980420589447, "sampling/importance_sampling_ratio/max": 1.79449143409729, "sampling/importance_sampling_ratio/mean": 0.637387079000473, "sampling/importance_sampling_ratio/min": 3.107777472507138e-14, "sampling/sampling_logp_difference/max": 22.73606185913086, "sampling/sampling_logp_difference/mean": 0.08924243450164795, "step": 975, "step_time": 12.915631853001106 }, { "epoch": 0.078, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.08333333333333333, "eval_completions/max_length": 2266.0, "eval_completions/max_terminated_length": 2250.0, "eval_completions/mean_length": 1855.9166666666667, "eval_completions/mean_terminated_length": 1823.6310221354167, "eval_completions/min_length": 1335.3333333333333, "eval_completions/min_terminated_length": 1335.3333333333333, "eval_entropy": 0.3405195375283559, "eval_frac_reward_zero_std": 0.3333333333333333, "eval_kl": 0.5966706077257792, "eval_loss": -0.01148859690874815, "eval_num_tokens": 22857731.0, "eval_reward": 1.6079761981964111, "eval_reward_std": 0.11246365184585254, "eval_rewards/env_game_reward/mean": 1.6079761981964111, "eval_rewards/env_game_reward/std": 0.1685823400815328, "eval_runtime": 16.002, "eval_samples_per_second": 0.625, "eval_sampling/importance_sampling_ratio/max": 1.5225164890289307, "eval_sampling/importance_sampling_ratio/mean": 0.5733944574991862, "eval_sampling/importance_sampling_ratio/min": 2.351474487362804e-12, "eval_sampling/sampling_logp_difference/max": 23.101406733194988, "eval_sampling/sampling_logp_difference/mean": 0.09195188557108243, "eval_steps_per_second": 0.125, "step": 975 } ], "logging_steps": 5, "max_steps": 37500, "num_input_tokens_seen": 22857731, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }