| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.00084, |
| "eval_steps": 500, |
| "global_step": 84, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13167.0, |
| "completions/max_terminated_length": 13167.0, |
| "completions/mean_length": 9931.84375, |
| "completions/mean_terminated_length": 9931.84375, |
| "completions/min_length": 204.0, |
| "completions/min_terminated_length": 204.0, |
| "entropy": 0.02259049008716829, |
| "epoch": 1e-05, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.972987651824951, |
| "kl": 0.0, |
| "learning_rate": 0.0, |
| "loss": 0.4677, |
| "num_tokens": 344589.0, |
| "reward": -0.5509895086288452, |
| "reward_std": 0.49180489778518677, |
| "rewards/rollout_reward_func/mean": -0.5509895086288452, |
| "rewards/rollout_reward_func/std": 0.5359447598457336, |
| "sampling/importance_sampling_ratio/max": 2.058147430419922, |
| "sampling/importance_sampling_ratio/mean": 0.9989001750946045, |
| "sampling/importance_sampling_ratio/min": 0.06982824206352234, |
| "sampling/sampling_logp_difference/max": 2.6617166996002197, |
| "sampling/sampling_logp_difference/mean": 0.007194924633949995, |
| "step": 1, |
| "step_time": 156.04581609299998 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.02259049008716829, |
| "epoch": 2e-05, |
| "grad_norm": 6.000457286834717, |
| "kl": 0.0, |
| "learning_rate": 1.4285714285714286e-06, |
| "loss": 0.4677, |
| "step": 2, |
| "step_time": 72.52690972699997 |
| }, |
| { |
| "clip_ratio/high_max": 0.0052298564405646175, |
| "clip_ratio/high_mean": 0.0026149282202823088, |
| "clip_ratio/low_mean": 0.0035879433271475136, |
| "clip_ratio/low_min": 0.00041666667675599456, |
| "clip_ratio/region_mean": 0.006202871503774077, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13398.0, |
| "completions/max_terminated_length": 13398.0, |
| "completions/mean_length": 11894.3125, |
| "completions/mean_terminated_length": 11894.3125, |
| "completions/min_length": 7109.0, |
| "completions/min_terminated_length": 7109.0, |
| "entropy": 0.023217253852635622, |
| "epoch": 3e-05, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 6.642491340637207, |
| "kl": 0.0027546791861823294, |
| "learning_rate": 2.8571428571428573e-06, |
| "loss": 0.2037, |
| "num_tokens": 752299.0, |
| "reward": -0.48321714997291565, |
| "reward_std": 0.7304716110229492, |
| "rewards/rollout_reward_func/mean": -0.48321714997291565, |
| "rewards/rollout_reward_func/std": 0.7494373917579651, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9993765354156494, |
| "sampling/importance_sampling_ratio/min": 0.08995848894119263, |
| "sampling/sampling_logp_difference/max": 2.408406972885132, |
| "sampling/sampling_logp_difference/mean": 0.009509067051112652, |
| "step": 3, |
| "step_time": 167.4341570240008 |
| }, |
| { |
| "clip_ratio/high_max": 0.0048466095759067684, |
| "clip_ratio/high_mean": 0.0024233047879533842, |
| "clip_ratio/low_mean": 0.0035862942750100046, |
| "clip_ratio/low_min": 0.0010411091789137572, |
| "clip_ratio/region_mean": 0.006009599004755728, |
| "entropy": 0.02273811263148673, |
| "epoch": 4e-05, |
| "grad_norm": 6.468554496765137, |
| "kl": 0.0035955551825281873, |
| "learning_rate": 4.285714285714286e-06, |
| "loss": 0.2014, |
| "step": 4, |
| "step_time": 75.431375376 |
| }, |
| { |
| "clip_ratio/high_max": 0.0030947362538427114, |
| "clip_ratio/high_mean": 0.0017841105291154236, |
| "clip_ratio/low_mean": 0.002839852197212167, |
| "clip_ratio/low_min": 0.0010658372775651515, |
| "clip_ratio/region_mean": 0.004623962740879506, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12874.0, |
| "completions/max_terminated_length": 12874.0, |
| "completions/mean_length": 11251.96875, |
| "completions/mean_terminated_length": 11251.96875, |
| "completions/min_length": 3238.0, |
| "completions/min_terminated_length": 3238.0, |
| "entropy": 0.024516460485756397, |
| "epoch": 5e-05, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.106947422027588, |
| "kl": 0.0033718565209710505, |
| "learning_rate": 5.7142857142857145e-06, |
| "loss": -0.2472, |
| "num_tokens": 1138956.0, |
| "reward": -0.43200355768203735, |
| "reward_std": 0.6725019812583923, |
| "rewards/rollout_reward_func/mean": -0.43200355768203735, |
| "rewards/rollout_reward_func/std": 0.7308401465415955, |
| "sampling/importance_sampling_ratio/max": 2.691387414932251, |
| "sampling/importance_sampling_ratio/mean": 1.000849723815918, |
| "sampling/importance_sampling_ratio/min": 0.28908851742744446, |
| "sampling/sampling_logp_difference/max": 1.2410223484039307, |
| "sampling/sampling_logp_difference/mean": 0.00824270211160183, |
| "step": 5, |
| "step_time": 160.70129029100008 |
| }, |
| { |
| "clip_ratio/high_max": 0.0022010681277606636, |
| "clip_ratio/high_mean": 0.0015363333659479395, |
| "clip_ratio/low_mean": 0.002654661060660146, |
| "clip_ratio/low_min": 0.0010658372775651515, |
| "clip_ratio/region_mean": 0.004190994426608086, |
| "entropy": 0.024331653432454914, |
| "epoch": 6e-05, |
| "grad_norm": 5.248446941375732, |
| "kl": 0.003470558443950722, |
| "learning_rate": 7.142857142857143e-06, |
| "loss": -0.2503, |
| "step": 6, |
| "step_time": 70.39162059900036 |
| }, |
| { |
| "clip_ratio/high_max": 0.004718718817457557, |
| "clip_ratio/high_mean": 0.0023593594087287784, |
| "clip_ratio/low_mean": 0.00323545208084397, |
| "clip_ratio/low_min": 0.0008474673668388277, |
| "clip_ratio/region_mean": 0.0055948115477804095, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13424.0, |
| "completions/max_terminated_length": 13424.0, |
| "completions/mean_length": 10642.4375, |
| "completions/mean_terminated_length": 10642.4375, |
| "completions/min_length": 1097.0, |
| "completions/min_terminated_length": 1097.0, |
| "entropy": 0.02454580759513192, |
| "epoch": 7e-05, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.2174973487854, |
| "kl": 0.002597570359284873, |
| "learning_rate": 8.571428571428573e-06, |
| "loss": 0.4074, |
| "num_tokens": 1506776.0, |
| "reward": -0.4508303999900818, |
| "reward_std": 0.856971263885498, |
| "rewards/rollout_reward_func/mean": -0.4508303999900818, |
| "rewards/rollout_reward_func/std": 0.875674307346344, |
| "sampling/importance_sampling_ratio/max": 2.2130932807922363, |
| "sampling/importance_sampling_ratio/mean": 0.9991341829299927, |
| "sampling/importance_sampling_ratio/min": 0.09515064209699631, |
| "sampling/sampling_logp_difference/max": 2.3522939682006836, |
| "sampling/sampling_logp_difference/mean": 0.007671562489122152, |
| "step": 7, |
| "step_time": 167.14791396400005 |
| }, |
| { |
| "clip_ratio/high_max": 0.005202832107897848, |
| "clip_ratio/high_mean": 0.002601416053948924, |
| "clip_ratio/low_mean": 0.0041243805608246475, |
| "clip_ratio/low_min": 0.0012697646743617952, |
| "clip_ratio/region_mean": 0.006725796643877402, |
| "entropy": 0.02552157419268042, |
| "epoch": 8e-05, |
| "grad_norm": 3.9264698028564453, |
| "kl": 0.0049671942251734436, |
| "learning_rate": 1e-05, |
| "loss": 0.402, |
| "step": 8, |
| "step_time": 75.2512055050006 |
| }, |
| { |
| "clip_ratio/high_max": 0.006363416585372761, |
| "clip_ratio/high_mean": 0.004113822695217095, |
| "clip_ratio/low_mean": 0.0022823161561973393, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006396138880518265, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13250.0, |
| "completions/max_terminated_length": 13250.0, |
| "completions/mean_length": 11233.6875, |
| "completions/mean_terminated_length": 11233.6875, |
| "completions/min_length": 1185.0, |
| "completions/min_terminated_length": 1185.0, |
| "entropy": 0.027631424833089113, |
| "epoch": 9e-05, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 6.5448150634765625, |
| "kl": 0.00364598065152677, |
| "learning_rate": 1.1428571428571429e-05, |
| "loss": -0.0485, |
| "num_tokens": 1893166.0, |
| "reward": -0.5741807222366333, |
| "reward_std": 0.40697890520095825, |
| "rewards/rollout_reward_func/mean": -0.5741807222366333, |
| "rewards/rollout_reward_func/std": 0.4363042116165161, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 1.0008200407028198, |
| "sampling/importance_sampling_ratio/min": 0.09258205443620682, |
| "sampling/sampling_logp_difference/max": 2.37965989112854, |
| "sampling/sampling_logp_difference/mean": 0.009641825221478939, |
| "step": 9, |
| "step_time": 165.40504330000158 |
| }, |
| { |
| "clip_ratio/high_max": 0.005221198371145874, |
| "clip_ratio/high_mean": 0.0032498103246325627, |
| "clip_ratio/low_mean": 0.0037138201732886955, |
| "clip_ratio/low_min": 0.00042517005931586027, |
| "clip_ratio/region_mean": 0.006963630527025089, |
| "entropy": 0.028140071080997586, |
| "epoch": 0.0001, |
| "grad_norm": 5.636337757110596, |
| "kl": 0.005874367059732322, |
| "learning_rate": 1.2857142857142857e-05, |
| "loss": -0.0496, |
| "step": 10, |
| "step_time": 74.03838988500002 |
| }, |
| { |
| "clip_ratio/high_max": 0.009417513123480603, |
| "clip_ratio/high_mean": 0.0053452335559995845, |
| "clip_ratio/low_mean": 0.002856071077985689, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.008201304633985274, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13467.0, |
| "completions/max_terminated_length": 13467.0, |
| "completions/mean_length": 10857.9375, |
| "completions/mean_terminated_length": 10857.9375, |
| "completions/min_length": 1234.0, |
| "completions/min_terminated_length": 1234.0, |
| "entropy": 0.025376274134032428, |
| "epoch": 0.00011, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.4508001804351807, |
| "kl": 0.007294285507668974, |
| "learning_rate": 1.4285714285714285e-05, |
| "loss": 0.4251, |
| "num_tokens": 2267468.0, |
| "reward": -0.41874319314956665, |
| "reward_std": 0.7331498861312866, |
| "rewards/rollout_reward_func/mean": -0.41874319314956665, |
| "rewards/rollout_reward_func/std": 0.7618294954299927, |
| "sampling/importance_sampling_ratio/max": 2.2565839290618896, |
| "sampling/importance_sampling_ratio/mean": 0.9988585114479065, |
| "sampling/importance_sampling_ratio/min": 0.19103538990020752, |
| "sampling/sampling_logp_difference/max": 1.6552965641021729, |
| "sampling/sampling_logp_difference/mean": 0.007842643186450005, |
| "step": 11, |
| "step_time": 168.32780298800117 |
| }, |
| { |
| "clip_ratio/high_max": 0.010110442468430847, |
| "clip_ratio/high_mean": 0.005691698199370876, |
| "clip_ratio/low_mean": 0.0022150053700897843, |
| "clip_ratio/low_min": 0.00042808218859136105, |
| "clip_ratio/region_mean": 0.007906703525804915, |
| "entropy": 0.0246156333014369, |
| "epoch": 0.00012, |
| "grad_norm": 3.3792054653167725, |
| "kl": 0.00864831962826429, |
| "learning_rate": 1.5714285714285715e-05, |
| "loss": 0.4308, |
| "step": 12, |
| "step_time": 75.09417690599867 |
| }, |
| { |
| "clip_ratio/high_max": 0.007237050449475646, |
| "clip_ratio/high_mean": 0.003618525224737823, |
| "clip_ratio/low_mean": 0.004828559438465163, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.008447084634099156, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12770.0, |
| "completions/max_terminated_length": 12770.0, |
| "completions/mean_length": 9700.59375, |
| "completions/mean_terminated_length": 9700.59375, |
| "completions/min_length": 678.0, |
| "completions/min_terminated_length": 678.0, |
| "entropy": 0.02448252754402347, |
| "epoch": 0.00013, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.033958435058594, |
| "kl": 0.008902751103960327, |
| "learning_rate": 1.7142857142857145e-05, |
| "loss": 0.9859, |
| "num_tokens": 2605064.0, |
| "reward": -0.3038055896759033, |
| "reward_std": 0.6740490198135376, |
| "rewards/rollout_reward_func/mean": -0.3038055896759033, |
| "rewards/rollout_reward_func/std": 0.7864021062850952, |
| "sampling/importance_sampling_ratio/max": 2.7903988361358643, |
| "sampling/importance_sampling_ratio/mean": 1.0011733770370483, |
| "sampling/importance_sampling_ratio/min": 0.015171236358582973, |
| "sampling/sampling_logp_difference/max": 4.188354015350342, |
| "sampling/sampling_logp_difference/mean": 0.007430948317050934, |
| "step": 13, |
| "step_time": 154.12419534999935 |
| }, |
| { |
| "clip_ratio/high_max": 0.002730079897446558, |
| "clip_ratio/high_mean": 0.001365039948723279, |
| "clip_ratio/low_mean": 0.006960608443478122, |
| "clip_ratio/low_min": 0.00041666667675599456, |
| "clip_ratio/region_mean": 0.008325648421305232, |
| "entropy": 0.024137669446645305, |
| "epoch": 0.00014, |
| "grad_norm": 3.100346565246582, |
| "kl": 0.008414470611569413, |
| "learning_rate": 1.8571428571428572e-05, |
| "loss": 0.9784, |
| "step": 14, |
| "step_time": 69.95948773200143 |
| }, |
| { |
| "clip_ratio/high_max": 0.006266110052820295, |
| "clip_ratio/high_mean": 0.003558225085726008, |
| "clip_ratio/low_mean": 0.004969277448253706, |
| "clip_ratio/low_min": 0.00042517005931586027, |
| "clip_ratio/region_mean": 0.008527502533979714, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13632.0, |
| "completions/max_terminated_length": 13632.0, |
| "completions/mean_length": 10339.125, |
| "completions/mean_terminated_length": 10339.125, |
| "completions/min_length": 1158.0, |
| "completions/min_terminated_length": 1158.0, |
| "entropy": 0.024794226861558855, |
| "epoch": 0.00015, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.3704771995544434, |
| "kl": 0.015760348596813856, |
| "learning_rate": 2e-05, |
| "loss": -0.1, |
| "num_tokens": 2963008.0, |
| "reward": -0.6195909976959229, |
| "reward_std": 0.6979167461395264, |
| "rewards/rollout_reward_func/mean": -0.6195909976959229, |
| "rewards/rollout_reward_func/std": 0.7056158781051636, |
| "sampling/importance_sampling_ratio/max": 2.3136019706726074, |
| "sampling/importance_sampling_ratio/mean": 1.000252366065979, |
| "sampling/importance_sampling_ratio/min": 0.14486655592918396, |
| "sampling/sampling_logp_difference/max": 1.9319422245025635, |
| "sampling/sampling_logp_difference/mean": 0.0070663755759596825, |
| "step": 15, |
| "step_time": 162.4636791780008 |
| }, |
| { |
| "clip_ratio/high_max": 0.008671565796248615, |
| "clip_ratio/high_mean": 0.005187579081393778, |
| "clip_ratio/low_mean": 0.0059419748722575605, |
| "clip_ratio/low_min": 0.00042517005931586027, |
| "clip_ratio/region_mean": 0.011129553953651339, |
| "entropy": 0.023938709287904203, |
| "epoch": 0.00016, |
| "grad_norm": 3.0196239948272705, |
| "kl": 0.016887567695448524, |
| "learning_rate": 2.1428571428571428e-05, |
| "loss": -0.1029, |
| "step": 16, |
| "step_time": 75.08079870300026 |
| }, |
| { |
| "clip_ratio/high_max": 0.00510084442794323, |
| "clip_ratio/high_mean": 0.002550422213971615, |
| "clip_ratio/low_mean": 0.004087600493221544, |
| "clip_ratio/low_min": 0.0016365568444598466, |
| "clip_ratio/region_mean": 0.006638022648985498, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13272.0, |
| "completions/max_terminated_length": 13272.0, |
| "completions/mean_length": 10441.21875, |
| "completions/mean_terminated_length": 10441.21875, |
| "completions/min_length": 2067.0, |
| "completions/min_terminated_length": 2067.0, |
| "entropy": 0.027783271041698754, |
| "epoch": 0.00017, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.9961190223693848, |
| "kl": 0.018047706194920465, |
| "learning_rate": 2.2857142857142858e-05, |
| "loss": 1.118, |
| "num_tokens": 3324273.0, |
| "reward": -0.22455403208732605, |
| "reward_std": 0.9343163371086121, |
| "rewards/rollout_reward_func/mean": -0.22455403208732605, |
| "rewards/rollout_reward_func/std": 0.9539775252342224, |
| "sampling/importance_sampling_ratio/max": 2.838672637939453, |
| "sampling/importance_sampling_ratio/mean": 0.9985122680664062, |
| "sampling/importance_sampling_ratio/min": 0.2129591554403305, |
| "sampling/sampling_logp_difference/max": 1.5466549396514893, |
| "sampling/sampling_logp_difference/mean": 0.011087952181696892, |
| "step": 17, |
| "step_time": 162.3905202420001 |
| }, |
| { |
| "clip_ratio/high_max": 0.005912428721785545, |
| "clip_ratio/high_mean": 0.0029562143608927727, |
| "clip_ratio/low_mean": 0.0026932653854601085, |
| "clip_ratio/low_min": 0.0005040322430431843, |
| "clip_ratio/region_mean": 0.005649479775456712, |
| "entropy": 0.028106234036386013, |
| "epoch": 0.00018, |
| "grad_norm": 3.900911569595337, |
| "kl": 0.03091182082789601, |
| "learning_rate": 2.4285714285714288e-05, |
| "loss": 1.1189, |
| "step": 18, |
| "step_time": 74.16337497800077 |
| }, |
| { |
| "clip_ratio/high_max": 0.01214791223173961, |
| "clip_ratio/high_mean": 0.006297170388279483, |
| "clip_ratio/low_mean": 0.0026435064210090786, |
| "clip_ratio/low_min": 0.00042517005931586027, |
| "clip_ratio/region_mean": 0.008940676809288561, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12931.0, |
| "completions/max_terminated_length": 12931.0, |
| "completions/mean_length": 10159.4375, |
| "completions/mean_terminated_length": 10159.4375, |
| "completions/min_length": 1245.0, |
| "completions/min_terminated_length": 1245.0, |
| "entropy": 0.028414718341082335, |
| "epoch": 0.00019, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.980042934417725, |
| "kl": 0.029700799204874784, |
| "learning_rate": 2.5714285714285714e-05, |
| "loss": 0.2205, |
| "num_tokens": 3676437.0, |
| "reward": -0.5079509615898132, |
| "reward_std": 0.6904375553131104, |
| "rewards/rollout_reward_func/mean": -0.5079509615898132, |
| "rewards/rollout_reward_func/std": 0.7216994762420654, |
| "sampling/importance_sampling_ratio/max": 2.166841745376587, |
| "sampling/importance_sampling_ratio/mean": 0.9985523223876953, |
| "sampling/importance_sampling_ratio/min": 0.19324928522109985, |
| "sampling/sampling_logp_difference/max": 1.6437742710113525, |
| "sampling/sampling_logp_difference/mean": 0.008595403283834457, |
| "step": 19, |
| "step_time": 156.51792565699907 |
| }, |
| { |
| "clip_ratio/high_max": 0.018410747725283727, |
| "clip_ratio/high_mean": 0.009882654994726181, |
| "clip_ratio/low_mean": 0.00612688584078569, |
| "clip_ratio/low_min": 0.0012755101779475808, |
| "clip_ratio/region_mean": 0.016009540820959955, |
| "entropy": 0.02830331851146184, |
| "epoch": 0.0002, |
| "grad_norm": 4.322617530822754, |
| "kl": 0.044774680165573955, |
| "learning_rate": 2.714285714285714e-05, |
| "loss": 0.22, |
| "step": 20, |
| "step_time": 71.70477191100008 |
| }, |
| { |
| "clip_ratio/high_max": 0.003913487191312015, |
| "clip_ratio/high_mean": 0.0021989916858728975, |
| "clip_ratio/low_mean": 0.0029591854254249483, |
| "clip_ratio/low_min": 0.0004562043759506196, |
| "clip_ratio/region_mean": 0.005158177111297846, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13476.0, |
| "completions/max_terminated_length": 13476.0, |
| "completions/mean_length": 9885.6875, |
| "completions/mean_terminated_length": 9885.6875, |
| "completions/min_length": 2196.0, |
| "completions/min_terminated_length": 2196.0, |
| "entropy": 0.02020650013582781, |
| "epoch": 0.00021, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.4713399410247803, |
| "kl": 0.037866046448471025, |
| "learning_rate": 2.857142857142857e-05, |
| "loss": 1.0734, |
| "num_tokens": 4019800.0, |
| "reward": -0.24059510231018066, |
| "reward_std": 0.9731942415237427, |
| "rewards/rollout_reward_func/mean": -0.24059510231018066, |
| "rewards/rollout_reward_func/std": 1.004063367843628, |
| "sampling/importance_sampling_ratio/max": 2.5626845359802246, |
| "sampling/importance_sampling_ratio/mean": 0.999998927116394, |
| "sampling/importance_sampling_ratio/min": 0.34062984585762024, |
| "sampling/sampling_logp_difference/max": 1.0769588947296143, |
| "sampling/sampling_logp_difference/mean": 0.005168822128325701, |
| "step": 21, |
| "step_time": 156.87602913699993 |
| }, |
| { |
| "clip_ratio/high_max": 0.005465811875183135, |
| "clip_ratio/high_mean": 0.0027329059375915676, |
| "clip_ratio/low_mean": 0.003968855453422293, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006701761391013861, |
| "entropy": 0.020286828803364187, |
| "epoch": 0.00022, |
| "grad_norm": 2.8511836528778076, |
| "kl": 0.05319512345158728, |
| "learning_rate": 3e-05, |
| "loss": 1.0666, |
| "step": 22, |
| "step_time": 73.16974372700179 |
| }, |
| { |
| "clip_ratio/high_max": 0.00847262708703056, |
| "clip_ratio/high_mean": 0.00423631354351528, |
| "clip_ratio/low_mean": 0.002743131757597439, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006979445330216549, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12993.0, |
| "completions/max_terminated_length": 12993.0, |
| "completions/mean_length": 9312.96875, |
| "completions/mean_terminated_length": 9312.96875, |
| "completions/min_length": 1575.0, |
| "completions/min_terminated_length": 1575.0, |
| "entropy": 0.023286236566491425, |
| "epoch": 0.00023, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.416781902313232, |
| "kl": 0.08850510988850147, |
| "learning_rate": 3.142857142857143e-05, |
| "loss": 0.8796, |
| "num_tokens": 4344597.0, |
| "reward": -0.15241163969039917, |
| "reward_std": 0.7749617099761963, |
| "rewards/rollout_reward_func/mean": -0.15241163969039917, |
| "rewards/rollout_reward_func/std": 0.971662700176239, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9986792802810669, |
| "sampling/importance_sampling_ratio/min": 0.13984200358390808, |
| "sampling/sampling_logp_difference/max": 1.9672420024871826, |
| "sampling/sampling_logp_difference/mean": 0.008968186564743519, |
| "step": 23, |
| "step_time": 149.11503398000104 |
| }, |
| { |
| "clip_ratio/high_max": 0.008956918376497924, |
| "clip_ratio/high_mean": 0.004696990654338151, |
| "clip_ratio/low_mean": 0.002034541597822681, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006731532179401256, |
| "entropy": 0.02346635900903493, |
| "epoch": 0.00024, |
| "grad_norm": 2.7925429344177246, |
| "kl": 0.11299954203423113, |
| "learning_rate": 3.285714285714286e-05, |
| "loss": 0.8791, |
| "step": 24, |
| "step_time": 68.35307336800088 |
| }, |
| { |
| "clip_ratio/high_max": 0.007403911498840898, |
| "clip_ratio/high_mean": 0.003701955749420449, |
| "clip_ratio/low_mean": 0.0032710890081943944, |
| "clip_ratio/low_min": 0.00048449612222611904, |
| "clip_ratio/region_mean": 0.006973044728511013, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13149.0, |
| "completions/max_terminated_length": 13149.0, |
| "completions/mean_length": 10183.96875, |
| "completions/mean_terminated_length": 10183.96875, |
| "completions/min_length": 1690.0, |
| "completions/min_terminated_length": 1690.0, |
| "entropy": 0.02520149474730715, |
| "epoch": 0.00025, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.3123133182525635, |
| "kl": 0.08004874712787569, |
| "learning_rate": 3.428571428571429e-05, |
| "loss": -0.2265, |
| "num_tokens": 4696978.0, |
| "reward": -0.39890268445014954, |
| "reward_std": 0.4381568431854248, |
| "rewards/rollout_reward_func/mean": -0.39890268445014954, |
| "rewards/rollout_reward_func/std": 0.5122197270393372, |
| "sampling/importance_sampling_ratio/max": 2.404775381088257, |
| "sampling/importance_sampling_ratio/mean": 0.9991356730461121, |
| "sampling/importance_sampling_ratio/min": 0.11307276040315628, |
| "sampling/sampling_logp_difference/max": 2.1797237396240234, |
| "sampling/sampling_logp_difference/mean": 0.008621575310826302, |
| "step": 25, |
| "step_time": 153.41606010800206 |
| }, |
| { |
| "clip_ratio/high_max": 0.003569191030692309, |
| "clip_ratio/high_mean": 0.002021337946644053, |
| "clip_ratio/low_mean": 0.005834054827573709, |
| "clip_ratio/low_min": 0.000972777372226119, |
| "clip_ratio/region_mean": 0.007855392774217762, |
| "entropy": 0.02575664728647098, |
| "epoch": 0.00026, |
| "grad_norm": 3.546144962310791, |
| "kl": 0.07968595699639991, |
| "learning_rate": 3.571428571428572e-05, |
| "loss": -0.2349, |
| "step": 26, |
| "step_time": 71.74039238800015 |
| }, |
| { |
| "clip_ratio/high_max": 0.011878328310558572, |
| "clip_ratio/high_mean": 0.007235260869492777, |
| "clip_ratio/low_mean": 0.002440269570797682, |
| "clip_ratio/low_min": 0.0017878417274914682, |
| "clip_ratio/region_mean": 0.009675530440290459, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13152.0, |
| "completions/max_terminated_length": 13152.0, |
| "completions/mean_length": 9718.9375, |
| "completions/mean_terminated_length": 9718.9375, |
| "completions/min_length": 1111.0, |
| "completions/min_terminated_length": 1111.0, |
| "entropy": 0.04362651810515672, |
| "epoch": 0.00027, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.255147933959961, |
| "kl": 0.1335945016471669, |
| "learning_rate": 3.7142857142857143e-05, |
| "loss": 0.4813, |
| "num_tokens": 5034383.0, |
| "reward": -0.2570054233074188, |
| "reward_std": 0.6717403531074524, |
| "rewards/rollout_reward_func/mean": -0.2570054233074188, |
| "rewards/rollout_reward_func/std": 0.800593376159668, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9988204836845398, |
| "sampling/importance_sampling_ratio/min": 0.2177802473306656, |
| "sampling/sampling_logp_difference/max": 1.971644401550293, |
| "sampling/sampling_logp_difference/mean": 0.013164675794541836, |
| "step": 27, |
| "step_time": 154.97156673099835 |
| }, |
| { |
| "clip_ratio/high_max": 0.014473556191660464, |
| "clip_ratio/high_mean": 0.008749909291509539, |
| "clip_ratio/low_mean": 0.004799299247679301, |
| "clip_ratio/low_min": 0.001802365412004292, |
| "clip_ratio/region_mean": 0.013549208524636924, |
| "entropy": 0.04518118337728083, |
| "epoch": 0.00028, |
| "grad_norm": 5.101240158081055, |
| "kl": 0.12378271645866334, |
| "learning_rate": 3.857142857142858e-05, |
| "loss": 0.4689, |
| "step": 28, |
| "step_time": 70.95561770900167 |
| }, |
| { |
| "clip_ratio/high_max": 0.005707955948309973, |
| "clip_ratio/high_mean": 0.0035324228374520317, |
| "clip_ratio/low_mean": 0.002015564765315503, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0055479876027675346, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13635.0, |
| "completions/max_terminated_length": 13635.0, |
| "completions/mean_length": 10821.4375, |
| "completions/mean_terminated_length": 10821.4375, |
| "completions/min_length": 3160.0, |
| "completions/min_terminated_length": 3160.0, |
| "entropy": 0.03376710624434054, |
| "epoch": 0.00029, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 6.056061267852783, |
| "kl": 0.10205801925621927, |
| "learning_rate": 4e-05, |
| "loss": 0.217, |
| "num_tokens": 5407036.0, |
| "reward": -0.5185161828994751, |
| "reward_std": 0.492318719625473, |
| "rewards/rollout_reward_func/mean": -0.5185161828994751, |
| "rewards/rollout_reward_func/std": 0.5280240774154663, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 1.0005357265472412, |
| "sampling/importance_sampling_ratio/min": 0.12561221420764923, |
| "sampling/sampling_logp_difference/max": 2.0745558738708496, |
| "sampling/sampling_logp_difference/mean": 0.010849224403500557, |
| "step": 29, |
| "step_time": 163.0567665669996 |
| }, |
| { |
| "clip_ratio/high_max": 0.01110520790098235, |
| "clip_ratio/high_mean": 0.006027282943250611, |
| "clip_ratio/low_mean": 0.003446056245593354, |
| "clip_ratio/low_min": 0.00042808218859136105, |
| "clip_ratio/region_mean": 0.009473339232499711, |
| "entropy": 0.03455971780931577, |
| "epoch": 0.0003, |
| "grad_norm": 3.1213033199310303, |
| "kl": 0.09978742833482102, |
| "learning_rate": 4.1428571428571437e-05, |
| "loss": 0.1998, |
| "step": 30, |
| "step_time": 75.22149042100045 |
| }, |
| { |
| "clip_ratio/high_max": 0.007910042797448114, |
| "clip_ratio/high_mean": 0.004427617866895162, |
| "clip_ratio/low_mean": 0.004107993547222577, |
| "clip_ratio/low_min": 0.0005040322430431843, |
| "clip_ratio/region_mean": 0.008535611428669654, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13526.0, |
| "completions/max_terminated_length": 13526.0, |
| "completions/mean_length": 10820.34375, |
| "completions/mean_terminated_length": 10820.34375, |
| "completions/min_length": 1826.0, |
| "completions/min_terminated_length": 1826.0, |
| "entropy": 0.03883296772255562, |
| "epoch": 0.00031, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.4302663803100586, |
| "kl": 0.0946993782708887, |
| "learning_rate": 4.2857142857142856e-05, |
| "loss": 0.1392, |
| "num_tokens": 5779856.0, |
| "reward": -0.35248619318008423, |
| "reward_std": 0.7979208827018738, |
| "rewards/rollout_reward_func/mean": -0.35248619318008423, |
| "rewards/rollout_reward_func/std": 0.8563511967658997, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9991589188575745, |
| "sampling/importance_sampling_ratio/min": 0.1591530442237854, |
| "sampling/sampling_logp_difference/max": 1.8378889560699463, |
| "sampling/sampling_logp_difference/mean": 0.010356503538787365, |
| "step": 31, |
| "step_time": 167.77319714499845 |
| }, |
| { |
| "clip_ratio/high_max": 0.013770086516160518, |
| "clip_ratio/high_mean": 0.0080132340954151, |
| "clip_ratio/low_mean": 0.006177089671837166, |
| "clip_ratio/low_min": 0.0004960317746736109, |
| "clip_ratio/region_mean": 0.014190323767252266, |
| "entropy": 0.04003481334075332, |
| "epoch": 0.00032, |
| "grad_norm": 2.200430154800415, |
| "kl": 0.0942922827671282, |
| "learning_rate": 4.428571428571428e-05, |
| "loss": 0.1266, |
| "step": 32, |
| "step_time": 75.98298019100093 |
| }, |
| { |
| "clip_ratio/high_max": 0.0056163399131037295, |
| "clip_ratio/high_mean": 0.0028081699565518647, |
| "clip_ratio/low_mean": 0.007905823076725937, |
| "clip_ratio/low_min": 0.004023919755127281, |
| "clip_ratio/region_mean": 0.010713993047829717, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13762.0, |
| "completions/max_terminated_length": 13762.0, |
| "completions/mean_length": 10885.375, |
| "completions/mean_terminated_length": 10885.375, |
| "completions/min_length": 803.0, |
| "completions/min_terminated_length": 803.0, |
| "entropy": 0.0625891622621566, |
| "epoch": 0.00033, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.534822463989258, |
| "kl": 0.1454108317848295, |
| "learning_rate": 4.5714285714285716e-05, |
| "loss": 0.6602, |
| "num_tokens": 6154484.0, |
| "reward": -0.13570240139961243, |
| "reward_std": 0.933979332447052, |
| "rewards/rollout_reward_func/mean": -0.13570240139961243, |
| "rewards/rollout_reward_func/std": 1.0047199726104736, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 1.001332402229309, |
| "sampling/importance_sampling_ratio/min": 0.18328945338726044, |
| "sampling/sampling_logp_difference/max": 1.9671940803527832, |
| "sampling/sampling_logp_difference/mean": 0.01589260809123516, |
| "step": 33, |
| "step_time": 164.3425249209995 |
| }, |
| { |
| "clip_ratio/high_max": 0.012784746824763715, |
| "clip_ratio/high_mean": 0.0071671667392365634, |
| "clip_ratio/low_mean": 0.011323995684506372, |
| "clip_ratio/low_min": 0.00381069362629205, |
| "clip_ratio/region_mean": 0.018491162394639105, |
| "entropy": 0.06436851329635829, |
| "epoch": 0.00034, |
| "grad_norm": 4.530355453491211, |
| "kl": 0.15744604653446004, |
| "learning_rate": 4.714285714285714e-05, |
| "loss": 0.6463, |
| "step": 34, |
| "step_time": 75.79230114999973 |
| }, |
| { |
| "clip_ratio/high_max": 0.0122655353625305, |
| "clip_ratio/high_mean": 0.006788362079532817, |
| "clip_ratio/low_mean": 0.0035715404665097594, |
| "clip_ratio/low_min": 0.00042808218859136105, |
| "clip_ratio/region_mean": 0.010359902604250237, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13321.0, |
| "completions/max_terminated_length": 13321.0, |
| "completions/mean_length": 10277.34375, |
| "completions/mean_terminated_length": 10277.34375, |
| "completions/min_length": 2255.0, |
| "completions/min_terminated_length": 2255.0, |
| "entropy": 0.06130359717644751, |
| "epoch": 0.00035, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 6.084246635437012, |
| "kl": 0.20213585742749274, |
| "learning_rate": 4.8571428571428576e-05, |
| "loss": 0.491, |
| "num_tokens": 6509987.0, |
| "reward": -0.39044952392578125, |
| "reward_std": 0.7217355370521545, |
| "rewards/rollout_reward_func/mean": -0.39044952392578125, |
| "rewards/rollout_reward_func/std": 0.7610857486724854, |
| "sampling/importance_sampling_ratio/max": 2.5243656635284424, |
| "sampling/importance_sampling_ratio/mean": 0.998782217502594, |
| "sampling/importance_sampling_ratio/min": 0.02197389304637909, |
| "sampling/sampling_logp_difference/max": 3.8179001808166504, |
| "sampling/sampling_logp_difference/mean": 0.01720621809363365, |
| "step": 35, |
| "step_time": 161.66476932600017 |
| }, |
| { |
| "clip_ratio/high_max": 0.014183318184223026, |
| "clip_ratio/high_mean": 0.008639149455120787, |
| "clip_ratio/low_mean": 0.007928328239358962, |
| "clip_ratio/low_min": 0.002639831742271781, |
| "clip_ratio/region_mean": 0.01656747775268741, |
| "entropy": 0.06414063868578523, |
| "epoch": 0.00036, |
| "grad_norm": 4.390661239624023, |
| "kl": 0.22910623659845442, |
| "learning_rate": 5e-05, |
| "loss": 0.4805, |
| "step": 36, |
| "step_time": 73.4045656210019 |
| }, |
| { |
| "clip_ratio/high_max": 0.008312889549415559, |
| "clip_ratio/high_mean": 0.004156444774707779, |
| "clip_ratio/low_mean": 0.005125935596879572, |
| "clip_ratio/low_min": 0.0012918247375637293, |
| "clip_ratio/region_mean": 0.00928238031337969, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13365.0, |
| "completions/max_terminated_length": 13365.0, |
| "completions/mean_length": 10076.125, |
| "completions/mean_terminated_length": 10076.125, |
| "completions/min_length": 1117.0, |
| "completions/min_terminated_length": 1117.0, |
| "entropy": 0.0663802761118859, |
| "epoch": 0.00037, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.7034361362457275, |
| "kl": 0.23728731309529394, |
| "learning_rate": 4.999300402366083e-05, |
| "loss": 0.5465, |
| "num_tokens": 6858892.0, |
| "reward": -0.17172592878341675, |
| "reward_std": 0.7274531126022339, |
| "rewards/rollout_reward_func/mean": -0.17172592878341675, |
| "rewards/rollout_reward_func/std": 0.7500751614570618, |
| "sampling/importance_sampling_ratio/max": 2.6516499519348145, |
| "sampling/importance_sampling_ratio/mean": 1.0006123781204224, |
| "sampling/importance_sampling_ratio/min": 0.22508670389652252, |
| "sampling/sampling_logp_difference/max": 1.491269588470459, |
| "sampling/sampling_logp_difference/mean": 0.01602303236722946, |
| "step": 37, |
| "step_time": 164.07891472499978 |
| }, |
| { |
| "clip_ratio/high_max": 0.01003516762284562, |
| "clip_ratio/high_mean": 0.005539513367693871, |
| "clip_ratio/low_mean": 0.014460239675827324, |
| "clip_ratio/low_min": 0.006212757696630433, |
| "clip_ratio/region_mean": 0.019999753101728857, |
| "entropy": 0.06495539681054652, |
| "epoch": 0.00038, |
| "grad_norm": 2.917160749435425, |
| "kl": 0.23570971423760056, |
| "learning_rate": 4.997202131530303e-05, |
| "loss": 0.5318, |
| "step": 38, |
| "step_time": 74.72924379499818 |
| }, |
| { |
| "clip_ratio/high_max": 0.006990043446421623, |
| "clip_ratio/high_mean": 0.0038956627249717712, |
| "clip_ratio/low_mean": 0.005693367682397366, |
| "clip_ratio/low_min": 0.00042229730752296746, |
| "clip_ratio/region_mean": 0.009589030363713391, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12719.0, |
| "completions/max_terminated_length": 12719.0, |
| "completions/mean_length": 10167.65625, |
| "completions/mean_terminated_length": 10167.65625, |
| "completions/min_length": 2475.0, |
| "completions/min_terminated_length": 2475.0, |
| "entropy": 0.06094362598378211, |
| "epoch": 0.00039, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.3789939880371094, |
| "kl": 0.19482448839698918, |
| "learning_rate": 4.993706753300993e-05, |
| "loss": 0.2761, |
| "num_tokens": 7211045.0, |
| "reward": -0.20416001975536346, |
| "reward_std": 0.7475550174713135, |
| "rewards/rollout_reward_func/mean": -0.20416001975536346, |
| "rewards/rollout_reward_func/std": 0.7947777509689331, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 1.0014694929122925, |
| "sampling/importance_sampling_ratio/min": 0.28650444746017456, |
| "sampling/sampling_logp_difference/max": 1.9272756576538086, |
| "sampling/sampling_logp_difference/mean": 0.013411741703748703, |
| "step": 39, |
| "step_time": 159.4788131209989 |
| }, |
| { |
| "clip_ratio/high_max": 0.012886389653431252, |
| "clip_ratio/high_mean": 0.007645118064829148, |
| "clip_ratio/low_mean": 0.009694800595752895, |
| "clip_ratio/low_min": 0.00044326239731162786, |
| "clip_ratio/region_mean": 0.017339918646030128, |
| "entropy": 0.06182372476905584, |
| "epoch": 0.0004, |
| "grad_norm": 2.5175423622131348, |
| "kl": 0.22034673113375902, |
| "learning_rate": 4.988816876060381e-05, |
| "loss": 0.2518, |
| "step": 40, |
| "step_time": 70.61948872499943 |
| }, |
| { |
| "clip_ratio/high_max": 0.010759571479866281, |
| "clip_ratio/high_mean": 0.005379785739933141, |
| "clip_ratio/low_mean": 0.004713524816907011, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.010093310542288236, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12860.0, |
| "completions/max_terminated_length": 12860.0, |
| "completions/mean_length": 10191.6875, |
| "completions/mean_terminated_length": 10191.6875, |
| "completions/min_length": 1034.0, |
| "completions/min_terminated_length": 1034.0, |
| "entropy": 0.0719709824770689, |
| "epoch": 0.00041, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.783087968826294, |
| "kl": 0.23890396440401673, |
| "learning_rate": 4.98253614881812e-05, |
| "loss": 0.8387, |
| "num_tokens": 7563650.0, |
| "reward": 0.09715636074542999, |
| "reward_std": 0.861005425453186, |
| "rewards/rollout_reward_func/mean": 0.09715636074542999, |
| "rewards/rollout_reward_func/std": 0.8409163355827332, |
| "sampling/importance_sampling_ratio/max": 2.1479194164276123, |
| "sampling/importance_sampling_ratio/mean": 0.9980385303497314, |
| "sampling/importance_sampling_ratio/min": 0.25166046619415283, |
| "sampling/sampling_logp_difference/max": 1.3796745538711548, |
| "sampling/sampling_logp_difference/mean": 0.01620703563094139, |
| "step": 41, |
| "step_time": 156.14588745600213 |
| }, |
| { |
| "clip_ratio/high_max": 0.015028952562715858, |
| "clip_ratio/high_mean": 0.007514476281357929, |
| "clip_ratio/low_mean": 0.013302808598382398, |
| "clip_ratio/low_min": 0.0027654054574668407, |
| "clip_ratio/region_mean": 0.020817284763325006, |
| "entropy": 0.07558500277809799, |
| "epoch": 0.00042, |
| "grad_norm": 2.8221476078033447, |
| "kl": 0.2473655454814434, |
| "learning_rate": 4.974869258488254e-05, |
| "loss": 0.8332, |
| "step": 42, |
| "step_time": 71.34366872400005 |
| }, |
| { |
| "clip_ratio/high_max": 0.00897236549644731, |
| "clip_ratio/high_mean": 0.00470022382796742, |
| "clip_ratio/low_mean": 0.002213932399172336, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006914156285347417, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13893.0, |
| "completions/max_terminated_length": 13893.0, |
| "completions/mean_length": 11105.28125, |
| "completions/mean_terminated_length": 11105.28125, |
| "completions/min_length": 514.0, |
| "completions/min_terminated_length": 514.0, |
| "entropy": 0.07579021051060408, |
| "epoch": 0.00043, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.432410717010498, |
| "kl": 0.3065204853191972, |
| "learning_rate": 4.965821926391673e-05, |
| "loss": -0.6028, |
| "num_tokens": 7945588.0, |
| "reward": -0.1402868777513504, |
| "reward_std": 0.6740955114364624, |
| "rewards/rollout_reward_func/mean": -0.1402868777513504, |
| "rewards/rollout_reward_func/std": 0.7691654562950134, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9994540214538574, |
| "sampling/importance_sampling_ratio/min": 0.2775813639163971, |
| "sampling/sampling_logp_difference/max": 2.002950668334961, |
| "sampling/sampling_logp_difference/mean": 0.016589157283306122, |
| "step": 43, |
| "step_time": 166.56439664799746 |
| }, |
| { |
| "clip_ratio/high_max": 0.02129412887734361, |
| "clip_ratio/high_mean": 0.011415275061153807, |
| "clip_ratio/low_mean": 0.007396826345939189, |
| "clip_ratio/low_min": 0.0017404509417247027, |
| "clip_ratio/region_mean": 0.01881210133433342, |
| "entropy": 0.07551638572476804, |
| "epoch": 0.00044, |
| "grad_norm": 4.626571178436279, |
| "kl": 0.34492466133087873, |
| "learning_rate": 4.9554009039866464e-05, |
| "loss": -0.6141, |
| "step": 44, |
| "step_time": 76.74440019599933 |
| }, |
| { |
| "clip_ratio/high_max": 0.010172237700317055, |
| "clip_ratio/high_mean": 0.0050861188501585275, |
| "clip_ratio/low_mean": 0.00778736115898937, |
| "clip_ratio/low_min": 0.0032280217565130442, |
| "clip_ratio/region_mean": 0.012873480009147897, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13578.0, |
| "completions/max_terminated_length": 13578.0, |
| "completions/mean_length": 10818.84375, |
| "completions/mean_terminated_length": 10818.84375, |
| "completions/min_length": 1938.0, |
| "completions/min_terminated_length": 1938.0, |
| "entropy": 0.0941541725769639, |
| "epoch": 0.00045, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 6.4678053855896, |
| "kl": 0.3121062908321619, |
| "learning_rate": 4.9436139678306335e-05, |
| "loss": -0.2261, |
| "num_tokens": 8318532.0, |
| "reward": 0.15859833359718323, |
| "reward_std": 0.755577027797699, |
| "rewards/rollout_reward_func/mean": 0.15859833359718323, |
| "rewards/rollout_reward_func/std": 0.8585173487663269, |
| "sampling/importance_sampling_ratio/max": 2.691387414932251, |
| "sampling/importance_sampling_ratio/mean": 0.9984525442123413, |
| "sampling/importance_sampling_ratio/min": 0.06687777489423752, |
| "sampling/sampling_logp_difference/max": 2.7048885822296143, |
| "sampling/sampling_logp_difference/mean": 0.023697983473539352, |
| "step": 45, |
| "step_time": 164.00571862300148 |
| }, |
| { |
| "clip_ratio/high_max": 0.020086677744984627, |
| "clip_ratio/high_mean": 0.010638576990459114, |
| "clip_ratio/low_mean": 0.013478812892572023, |
| "clip_ratio/low_min": 0.005088503879960626, |
| "clip_ratio/region_mean": 0.0241173897520639, |
| "entropy": 0.09132221271283925, |
| "epoch": 0.00046, |
| "grad_norm": 4.049323081970215, |
| "kl": 0.3125941874459386, |
| "learning_rate": 4.930469913777124e-05, |
| "loss": -0.2404, |
| "step": 46, |
| "step_time": 75.26833040499969 |
| }, |
| { |
| "clip_ratio/high_max": 0.008710403984878212, |
| "clip_ratio/high_mean": 0.006181505916174501, |
| "clip_ratio/low_mean": 0.006149827502667904, |
| "clip_ratio/low_min": 0.0017361111240461469, |
| "clip_ratio/region_mean": 0.012331333418842405, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13339.0, |
| "completions/max_terminated_length": 13339.0, |
| "completions/mean_length": 9838.40625, |
| "completions/mean_terminated_length": 9838.40625, |
| "completions/min_length": 1577.0, |
| "completions/min_terminated_length": 1577.0, |
| "entropy": 0.0639520538970828, |
| "epoch": 0.00047, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.6693246364593506, |
| "kl": 0.28815168514847755, |
| "learning_rate": 4.91597855041184e-05, |
| "loss": -0.4524, |
| "num_tokens": 8660124.0, |
| "reward": 0.18700893223285675, |
| "reward_std": 0.8370898365974426, |
| "rewards/rollout_reward_func/mean": 0.18700893223285675, |
| "rewards/rollout_reward_func/std": 0.9505638480186462, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9988111853599548, |
| "sampling/importance_sampling_ratio/min": 0.20165325701236725, |
| "sampling/sampling_logp_difference/max": 1.601205587387085, |
| "sampling/sampling_logp_difference/mean": 0.01677616499364376, |
| "step": 47, |
| "step_time": 156.50041884999882 |
| }, |
| { |
| "clip_ratio/high_max": 0.010966176458168775, |
| "clip_ratio/high_mean": 0.008428345259744674, |
| "clip_ratio/low_mean": 0.009785499976715073, |
| "clip_ratio/low_min": 0.0022047124803066254, |
| "clip_ratio/region_mean": 0.018213845294667408, |
| "entropy": 0.06090119876898825, |
| "epoch": 0.00048, |
| "grad_norm": 2.817453145980835, |
| "kl": 0.2784329962451011, |
| "learning_rate": 4.900150691733207e-05, |
| "loss": -0.4609, |
| "step": 48, |
| "step_time": 71.43669509799929 |
| }, |
| { |
| "clip_ratio/high_max": 0.012393949960824102, |
| "clip_ratio/high_mean": 0.007038811716483906, |
| "clip_ratio/low_mean": 0.004084319676621817, |
| "clip_ratio/low_min": 0.00041666667675599456, |
| "clip_ratio/region_mean": 0.011123131407657638, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13544.0, |
| "completions/max_terminated_length": 13544.0, |
| "completions/mean_length": 10274.09375, |
| "completions/mean_terminated_length": 10274.09375, |
| "completions/min_length": 2238.0, |
| "completions/min_terminated_length": 2238.0, |
| "entropy": 0.06463533453643322, |
| "epoch": 0.00049, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.4794275760650635, |
| "kl": 0.23157021962106228, |
| "learning_rate": 4.8829981490825384e-05, |
| "loss": 0.6766, |
| "num_tokens": 9015251.0, |
| "reward": 0.05801337957382202, |
| "reward_std": 0.8594080209732056, |
| "rewards/rollout_reward_func/mean": 0.05801337957382202, |
| "rewards/rollout_reward_func/std": 0.9448277354240417, |
| "sampling/importance_sampling_ratio/max": 2.314483165740967, |
| "sampling/importance_sampling_ratio/mean": 0.998992919921875, |
| "sampling/importance_sampling_ratio/min": 0.08525566011667252, |
| "sampling/sampling_logp_difference/max": 2.4621007442474365, |
| "sampling/sampling_logp_difference/mean": 0.01729561574757099, |
| "step": 49, |
| "step_time": 160.99621369300075 |
| }, |
| { |
| "clip_ratio/high_max": 0.017639295605476946, |
| "clip_ratio/high_mean": 0.009471276862313971, |
| "clip_ratio/low_mean": 0.008798229115200229, |
| "clip_ratio/low_min": 0.002916666679084301, |
| "clip_ratio/region_mean": 0.018269505802891217, |
| "entropy": 0.06586812436580658, |
| "epoch": 0.0005, |
| "grad_norm": 3.246238946914673, |
| "kl": 0.23268628818914294, |
| "learning_rate": 4.864533722329971e-05, |
| "loss": 0.6675, |
| "step": 50, |
| "step_time": 74.08785445800095 |
| }, |
| { |
| "clip_ratio/high_max": 0.008645561116281897, |
| "clip_ratio/high_mean": 0.004785452736541629, |
| "clip_ratio/low_mean": 0.003304112848127261, |
| "clip_ratio/low_min": 0.00048449612222611904, |
| "clip_ratio/region_mean": 0.008089565526461229, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13357.0, |
| "completions/max_terminated_length": 13357.0, |
| "completions/mean_length": 9449.75, |
| "completions/mean_terminated_length": 9449.75, |
| "completions/min_length": 1164.0, |
| "completions/min_terminated_length": 1164.0, |
| "entropy": 0.06446462532039732, |
| "epoch": 0.00051, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.304991245269775, |
| "kl": 0.3380787476198748, |
| "learning_rate": 4.8447711903227245e-05, |
| "loss": -0.0874, |
| "num_tokens": 9343997.0, |
| "reward": 0.031104259192943573, |
| "reward_std": 0.8977135419845581, |
| "rewards/rollout_reward_func/mean": 0.031104259192943573, |
| "rewards/rollout_reward_func/std": 1.0250816345214844, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 1.0004096031188965, |
| "sampling/importance_sampling_ratio/min": 0.18863455951213837, |
| "sampling/sampling_logp_difference/max": 1.6679437160491943, |
| "sampling/sampling_logp_difference/mean": 0.014661731198430061, |
| "step": 51, |
| "step_time": 151.50292257000183 |
| }, |
| { |
| "clip_ratio/high_max": 0.012868363002780825, |
| "clip_ratio/high_mean": 0.007327448722207919, |
| "clip_ratio/low_mean": 0.006595586746698245, |
| "clip_ratio/low_min": 0.0034068059467244893, |
| "clip_ratio/region_mean": 0.013923035527113825, |
| "entropy": 0.06929566781036556, |
| "epoch": 0.00052, |
| "grad_norm": 3.573127031326294, |
| "kl": 0.2613374365027994, |
| "learning_rate": 4.8237253006028074e-05, |
| "loss": -0.0965, |
| "step": 52, |
| "step_time": 70.4309434930019 |
| }, |
| { |
| "clip_ratio/high_max": 0.008529713173629716, |
| "clip_ratio/high_mean": 0.004628228620276786, |
| "clip_ratio/low_mean": 0.004744059449876659, |
| "clip_ratio/low_min": 0.00126980320783332, |
| "clip_ratio/region_mean": 0.0093722880264977, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13527.0, |
| "completions/max_terminated_length": 13527.0, |
| "completions/mean_length": 10749.8125, |
| "completions/mean_terminated_length": 10749.8125, |
| "completions/min_length": 1542.0, |
| "completions/min_terminated_length": 1542.0, |
| "entropy": 0.0799917277181521, |
| "epoch": 0.00053, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.5762784481048584, |
| "kl": 0.2966231992468238, |
| "learning_rate": 4.801411758401846e-05, |
| "loss": 0.9423, |
| "num_tokens": 9714613.0, |
| "reward": 0.14535844326019287, |
| "reward_std": 1.0995876789093018, |
| "rewards/rollout_reward_func/mean": 0.14535844326019287, |
| "rewards/rollout_reward_func/std": 1.0902811288833618, |
| "sampling/importance_sampling_ratio/max": 2.0842576026916504, |
| "sampling/importance_sampling_ratio/mean": 0.9999011754989624, |
| "sampling/importance_sampling_ratio/min": 0.13890385627746582, |
| "sampling/sampling_logp_difference/max": 1.973973274230957, |
| "sampling/sampling_logp_difference/mean": 0.01623544469475746, |
| "step": 53, |
| "step_time": 167.5663473889981 |
| }, |
| { |
| "clip_ratio/high_max": 0.011895997042302042, |
| "clip_ratio/high_mean": 0.00703811485436745, |
| "clip_ratio/low_mean": 0.008775666501605883, |
| "clip_ratio/low_min": 0.0012786609295289963, |
| "clip_ratio/region_mean": 0.015813781414180994, |
| "entropy": 0.08241763606201857, |
| "epoch": 0.00054, |
| "grad_norm": 2.2500929832458496, |
| "kl": 0.2891239356249571, |
| "learning_rate": 4.777847214921259e-05, |
| "loss": 0.9246, |
| "step": 54, |
| "step_time": 75.45897021599922 |
| }, |
| { |
| "clip_ratio/high_max": 0.009276975266402587, |
| "clip_ratio/high_mean": 0.004638487633201294, |
| "clip_ratio/low_mean": 0.005842676997417584, |
| "clip_ratio/low_min": 0.0008503401186317205, |
| "clip_ratio/region_mean": 0.010481164674274623, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13624.0, |
| "completions/max_terminated_length": 13624.0, |
| "completions/mean_length": 11374.6875, |
| "completions/mean_terminated_length": 11374.6875, |
| "completions/min_length": 2137.0, |
| "completions/min_terminated_length": 2137.0, |
| "entropy": 0.08610659511759877, |
| "epoch": 0.00055, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.0837926864624023, |
| "kl": 0.28733005002141, |
| "learning_rate": 4.753049254906501e-05, |
| "loss": 0.21, |
| "num_tokens": 10104624.0, |
| "reward": -0.09346111118793488, |
| "reward_std": 0.821108877658844, |
| "rewards/rollout_reward_func/mean": -0.09346111118793488, |
| "rewards/rollout_reward_func/std": 0.9060606956481934, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 1.0002198219299316, |
| "sampling/importance_sampling_ratio/min": 0.3293769061565399, |
| "sampling/sampling_logp_difference/max": 1.3009889125823975, |
| "sampling/sampling_logp_difference/mean": 0.01777353696525097, |
| "step": 55, |
| "step_time": 169.56354937799915 |
| }, |
| { |
| "clip_ratio/high_max": 0.016487823944771662, |
| "clip_ratio/high_mean": 0.009136769062024541, |
| "clip_ratio/low_mean": 0.0069089080061530694, |
| "clip_ratio/low_min": 0.0030126339406706393, |
| "clip_ratio/region_mean": 0.01604567709728144, |
| "entropy": 0.09009328670799732, |
| "epoch": 0.00056, |
| "grad_norm": 3.528965473175049, |
| "kl": 0.28088642843067646, |
| "learning_rate": 4.727036383524666e-05, |
| "loss": 0.1973, |
| "step": 56, |
| "step_time": 75.04982400900099 |
| }, |
| { |
| "clip_ratio/high_max": 0.007419974484946579, |
| "clip_ratio/high_mean": 0.004121171456063166, |
| "clip_ratio/low_mean": 0.0033842776028905064, |
| "clip_ratio/low_min": 0.0012699189246632159, |
| "clip_ratio/region_mean": 0.007505449088057503, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13511.0, |
| "completions/max_terminated_length": 13511.0, |
| "completions/mean_length": 10176.125, |
| "completions/mean_terminated_length": 10176.125, |
| "completions/min_length": 1202.0, |
| "completions/min_terminated_length": 1202.0, |
| "entropy": 0.0732960153836757, |
| "epoch": 0.00057, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.3770620822906494, |
| "kl": 0.3082230528816581, |
| "learning_rate": 4.699828012555243e-05, |
| "loss": 0.5982, |
| "num_tokens": 10456269.0, |
| "reward": -0.12018954753875732, |
| "reward_std": 0.9542105793952942, |
| "rewards/rollout_reward_func/mean": -0.12018954753875732, |
| "rewards/rollout_reward_func/std": 1.0915628671646118, |
| "sampling/importance_sampling_ratio/max": 2.9350130558013916, |
| "sampling/importance_sampling_ratio/mean": 0.9999834299087524, |
| "sampling/importance_sampling_ratio/min": 0.362204909324646, |
| "sampling/sampling_logp_difference/max": 1.076711893081665, |
| "sampling/sampling_logp_difference/mean": 0.015124820172786713, |
| "step": 57, |
| "step_time": 160.69087591700008 |
| }, |
| { |
| "clip_ratio/high_max": 0.017830504453741014, |
| "clip_ratio/high_mean": 0.009982116374885663, |
| "clip_ratio/low_mean": 0.009146305441390723, |
| "clip_ratio/low_min": 0.0012932273093611002, |
| "clip_ratio/region_mean": 0.019128421699861065, |
| "entropy": 0.0739774244138971, |
| "epoch": 0.00058, |
| "grad_norm": 7.0747246742248535, |
| "kl": 0.3264038683846593, |
| "learning_rate": 4.671444445904316e-05, |
| "loss": 0.5779, |
| "step": 58, |
| "step_time": 73.05348470599984 |
| }, |
| { |
| "clip_ratio/high_max": 0.013040319143328816, |
| "clip_ratio/high_mean": 0.007098863337887451, |
| "clip_ratio/low_mean": 0.003575551469111815, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.010674414719687775, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13197.0, |
| "completions/max_terminated_length": 13197.0, |
| "completions/mean_length": 8837.90625, |
| "completions/mean_terminated_length": 8837.90625, |
| "completions/min_length": 678.0, |
| "completions/min_terminated_length": 678.0, |
| "entropy": 0.09057109360583127, |
| "epoch": 0.00059, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 7.666227340698242, |
| "kl": 0.4448018502444029, |
| "learning_rate": 4.641906864453027e-05, |
| "loss": -0.265, |
| "num_tokens": 10765241.0, |
| "reward": 0.296211302280426, |
| "reward_std": 1.1419236660003662, |
| "rewards/rollout_reward_func/mean": 0.296211302280426, |
| "rewards/rollout_reward_func/std": 1.1032841205596924, |
| "sampling/importance_sampling_ratio/max": 2.8205833435058594, |
| "sampling/importance_sampling_ratio/mean": 0.9991936683654785, |
| "sampling/importance_sampling_ratio/min": 0.14112254977226257, |
| "sampling/sampling_logp_difference/max": 1.9581265449523926, |
| "sampling/sampling_logp_difference/mean": 0.021620940417051315, |
| "step": 59, |
| "step_time": 155.945755681003 |
| }, |
| { |
| "clip_ratio/high_max": 0.02123168739490211, |
| "clip_ratio/high_mean": 0.011910196451935917, |
| "clip_ratio/low_mean": 0.015003619948402047, |
| "clip_ratio/low_min": 0.003669736732263118, |
| "clip_ratio/region_mean": 0.026913816400337964, |
| "entropy": 0.09111437713727355, |
| "epoch": 0.0006, |
| "grad_norm": 4.09487771987915, |
| "kl": 0.34631976671516895, |
| "learning_rate": 4.6112373102516095e-05, |
| "loss": -0.286, |
| "step": 60, |
| "step_time": 71.43645765800102 |
| }, |
| { |
| "clip_ratio/high_max": 0.0070640986377839, |
| "clip_ratio/high_mean": 0.00353204931889195, |
| "clip_ratio/low_mean": 0.004103203638806008, |
| "clip_ratio/low_min": 0.00041666667675599456, |
| "clip_ratio/region_mean": 0.007635252914042212, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13493.0, |
| "completions/max_terminated_length": 13493.0, |
| "completions/mean_length": 9962.53125, |
| "completions/mean_terminated_length": 9962.53125, |
| "completions/min_length": 1611.0, |
| "completions/min_terminated_length": 1611.0, |
| "entropy": 0.08869605418294668, |
| "epoch": 0.00061, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.003176689147949, |
| "kl": 0.32290424313396215, |
| "learning_rate": 4.5794586700707875e-05, |
| "loss": 0.4087, |
| "num_tokens": 11110298.0, |
| "reward": 0.19760021567344666, |
| "reward_std": 1.0259727239608765, |
| "rewards/rollout_reward_func/mean": 0.19760021567344666, |
| "rewards/rollout_reward_func/std": 1.0379148721694946, |
| "sampling/importance_sampling_ratio/max": 2.7306482791900635, |
| "sampling/importance_sampling_ratio/mean": 1.0000582933425903, |
| "sampling/importance_sampling_ratio/min": 0.3961387276649475, |
| "sampling/sampling_logp_difference/max": 1.0045390129089355, |
| "sampling/sampling_logp_difference/mean": 0.0211578831076622, |
| "step": 61, |
| "step_time": 160.88398975100063 |
| }, |
| { |
| "clip_ratio/high_max": 0.018062620365526527, |
| "clip_ratio/high_mean": 0.009733557322761044, |
| "clip_ratio/low_mean": 0.013738173875026405, |
| "clip_ratio/low_min": 0.0035855557944159955, |
| "clip_ratio/region_mean": 0.02347173122689128, |
| "entropy": 0.09201134112663567, |
| "epoch": 0.00062, |
| "grad_norm": 3.820011615753174, |
| "kl": 0.32759621646255255, |
| "learning_rate": 4.546594658322805e-05, |
| "loss": 0.3893, |
| "step": 62, |
| "step_time": 72.43900915299491 |
| }, |
| { |
| "clip_ratio/high_max": 0.004652095143683255, |
| "clip_ratio/high_mean": 0.0023260475718416274, |
| "clip_ratio/low_mean": 0.005644983262754977, |
| "clip_ratio/low_min": 0.0017199248541146517, |
| "clip_ratio/region_mean": 0.007971030776388943, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13066.0, |
| "completions/max_terminated_length": 13066.0, |
| "completions/mean_length": 10042.84375, |
| "completions/mean_terminated_length": 10042.84375, |
| "completions/min_length": 1710.0, |
| "completions/min_terminated_length": 1710.0, |
| "entropy": 0.1067513944581151, |
| "epoch": 0.00063, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.7682888507843018, |
| "kl": 0.45440504513680935, |
| "learning_rate": 4.512669799364848e-05, |
| "loss": 0.3975, |
| "num_tokens": 11457778.0, |
| "reward": 0.3108959197998047, |
| "reward_std": 0.9355272054672241, |
| "rewards/rollout_reward_func/mean": 0.3108959197998047, |
| "rewards/rollout_reward_func/std": 0.9091988205909729, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 1.0004922151565552, |
| "sampling/importance_sampling_ratio/min": 0.25997379422187805, |
| "sampling/sampling_logp_difference/max": 1.9117159843444824, |
| "sampling/sampling_logp_difference/mean": 0.02266036719083786, |
| "step": 63, |
| "step_time": 158.2137243500074 |
| }, |
| { |
| "clip_ratio/high_max": 0.011193088546860963, |
| "clip_ratio/high_mean": 0.0058913555985782295, |
| "clip_ratio/low_mean": 0.013004933163756505, |
| "clip_ratio/low_min": 0.005248830129858106, |
| "clip_ratio/region_mean": 0.018896288704127073, |
| "entropy": 0.10795356682501733, |
| "epoch": 0.00064, |
| "grad_norm": 3.645940065383911, |
| "kl": 0.470172350294888, |
| "learning_rate": 4.477709409198042e-05, |
| "loss": 0.3908, |
| "step": 64, |
| "step_time": 71.35263364000275 |
| }, |
| { |
| "clip_ratio/high_max": 0.008740827877772972, |
| "clip_ratio/high_mean": 0.004891247299383394, |
| "clip_ratio/low_mean": 0.004758857699926011, |
| "clip_ratio/low_min": 0.0008361297659575939, |
| "clip_ratio/region_mean": 0.009650105028413236, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13721.0, |
| "completions/max_terminated_length": 13721.0, |
| "completions/mean_length": 10885.40625, |
| "completions/mean_terminated_length": 10885.40625, |
| "completions/min_length": 2306.0, |
| "completions/min_terminated_length": 2306.0, |
| "entropy": 0.1310987388715148, |
| "epoch": 0.00065, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.6566226482391357, |
| "kl": 0.5131763480603695, |
| "learning_rate": 4.441739576575714e-05, |
| "loss": 0.7262, |
| "num_tokens": 11832210.0, |
| "reward": 0.37453246116638184, |
| "reward_std": 1.1100566387176514, |
| "rewards/rollout_reward_func/mean": 0.37453246116638184, |
| "rewards/rollout_reward_func/std": 1.122207760810852, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9988963603973389, |
| "sampling/importance_sampling_ratio/min": 0.25997379422187805, |
| "sampling/sampling_logp_difference/max": 1.7353017330169678, |
| "sampling/sampling_logp_difference/mean": 0.02562958560883999, |
| "step": 65, |
| "step_time": 163.06677165699693 |
| }, |
| { |
| "clip_ratio/high_max": 0.021744761732406914, |
| "clip_ratio/high_mean": 0.012366945913527161, |
| "clip_ratio/low_mean": 0.009314247145084664, |
| "clip_ratio/low_min": 0.002372722141444683, |
| "clip_ratio/region_mean": 0.021681193175027147, |
| "entropy": 0.13085555145516992, |
| "epoch": 0.00066, |
| "grad_norm": 3.274498224258423, |
| "kl": 0.48789327405393124, |
| "learning_rate": 4.404787143534977e-05, |
| "loss": 0.7077, |
| "step": 66, |
| "step_time": 75.6290548800007 |
| }, |
| { |
| "clip_ratio/high_max": 0.005601790326181799, |
| "clip_ratio/high_mean": 0.0028008951630908996, |
| "clip_ratio/low_mean": 0.003591711341869086, |
| "clip_ratio/low_min": 0.0016466806118842214, |
| "clip_ratio/region_mean": 0.006392606504959986, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13492.0, |
| "completions/max_terminated_length": 13492.0, |
| "completions/mean_length": 10049.25, |
| "completions/mean_terminated_length": 10049.25, |
| "completions/min_length": 684.0, |
| "completions/min_terminated_length": 684.0, |
| "entropy": 0.12767728650942445, |
| "epoch": 0.00067, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.9584059715270996, |
| "kl": 0.43974771071225405, |
| "learning_rate": 4.366879685366202e-05, |
| "loss": 0.3535, |
| "num_tokens": 12180022.0, |
| "reward": 0.1577407419681549, |
| "reward_std": 1.1268913745880127, |
| "rewards/rollout_reward_func/mean": 0.1577407419681549, |
| "rewards/rollout_reward_func/std": 1.1074063777923584, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 1.001326560974121, |
| "sampling/importance_sampling_ratio/min": 0.42815104126930237, |
| "sampling/sampling_logp_difference/max": 1.152717113494873, |
| "sampling/sampling_logp_difference/mean": 0.021073060110211372, |
| "step": 67, |
| "step_time": 160.9693315609984 |
| }, |
| { |
| "clip_ratio/high_max": 0.017967351304832846, |
| "clip_ratio/high_mean": 0.008983675652416423, |
| "clip_ratio/low_mean": 0.017658226686762646, |
| "clip_ratio/low_min": 0.004194744222331792, |
| "clip_ratio/region_mean": 0.026641902572009712, |
| "entropy": 0.12733671674504876, |
| "epoch": 0.00068, |
| "grad_norm": 3.018509864807129, |
| "kl": 0.4440254373475909, |
| "learning_rate": 4.3280454900353015e-05, |
| "loss": 0.3265, |
| "step": 68, |
| "step_time": 75.01162057099282 |
| }, |
| { |
| "clip_ratio/high_max": 0.010869733727304265, |
| "clip_ratio/high_mean": 0.005718957792851143, |
| "clip_ratio/low_mean": 0.0034849131188821048, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.009203870882629417, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13515.0, |
| "completions/max_terminated_length": 13515.0, |
| "completions/mean_length": 8650.84375, |
| "completions/mean_terminated_length": 8650.84375, |
| "completions/min_length": 387.0, |
| "completions/min_terminated_length": 387.0, |
| "entropy": 0.11021804087795317, |
| "epoch": 0.00069, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.1527934074401855, |
| "kl": 0.37845719885081053, |
| "learning_rate": 4.288313537074191e-05, |
| "loss": 0.5884, |
| "num_tokens": 12483099.0, |
| "reward": 0.2558709979057312, |
| "reward_std": 0.9413133859634399, |
| "rewards/rollout_reward_func/mean": 0.2558709979057312, |
| "rewards/rollout_reward_func/std": 1.0643043518066406, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.998905599117279, |
| "sampling/importance_sampling_ratio/min": 0.3961387276649475, |
| "sampling/sampling_logp_difference/max": 1.179542064666748, |
| "sampling/sampling_logp_difference/mean": 0.021235648542642593, |
| "step": 69, |
| "step_time": 162.1463760960014 |
| }, |
| { |
| "clip_ratio/high_max": 0.02583102328935638, |
| "clip_ratio/high_mean": 0.015480009169550613, |
| "clip_ratio/low_mean": 0.013606639229692519, |
| "clip_ratio/low_min": 0.002621016465127468, |
| "clip_ratio/region_mean": 0.029086648602969944, |
| "entropy": 0.10582013777457178, |
| "epoch": 0.0007, |
| "grad_norm": 3.362034320831299, |
| "kl": 0.3859092304483056, |
| "learning_rate": 4.2477134759551676e-05, |
| "loss": 0.5836, |
| "step": 70, |
| "step_time": 73.605654605999 |
| }, |
| { |
| "clip_ratio/high_max": 0.003980476642027497, |
| "clip_ratio/high_mean": 0.0021999698801664636, |
| "clip_ratio/low_mean": 0.0036015229270560667, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00580149280722253, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13375.0, |
| "completions/max_terminated_length": 13375.0, |
| "completions/mean_length": 9448.03125, |
| "completions/mean_terminated_length": 9448.03125, |
| "completions/min_length": 1765.0, |
| "completions/min_terminated_length": 1765.0, |
| "entropy": 0.08842878974974155, |
| "epoch": 0.00071, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.0361220836639404, |
| "kl": 0.3907409608364105, |
| "learning_rate": 4.206275603965376e-05, |
| "loss": 0.4317, |
| "num_tokens": 12811546.0, |
| "reward": 0.10175018012523651, |
| "reward_std": 1.2597105503082275, |
| "rewards/rollout_reward_func/mean": 0.10175018012523651, |
| "rewards/rollout_reward_func/std": 1.224672555923462, |
| "sampling/importance_sampling_ratio/max": 2.153721809387207, |
| "sampling/importance_sampling_ratio/mean": 0.9977552890777588, |
| "sampling/importance_sampling_ratio/min": 0.25212857127189636, |
| "sampling/sampling_logp_difference/max": 1.377816081047058, |
| "sampling/sampling_logp_difference/mean": 0.017261814326047897, |
| "step": 71, |
| "step_time": 158.47965298599775 |
| }, |
| { |
| "clip_ratio/high_max": 0.012883998395409435, |
| "clip_ratio/high_mean": 0.0064419991977047175, |
| "clip_ratio/low_mean": 0.004444706108188257, |
| "clip_ratio/low_min": 0.0017248203221242875, |
| "clip_ratio/region_mean": 0.010886705276789144, |
| "entropy": 0.09079739707522094, |
| "epoch": 0.00072, |
| "grad_norm": 2.404242753982544, |
| "kl": 0.3891353765502572, |
| "learning_rate": 4.1640308435978284e-05, |
| "loss": 0.4232, |
| "step": 72, |
| "step_time": 73.1000140619999 |
| }, |
| { |
| "clip_ratio/high_max": 0.00914364744676277, |
| "clip_ratio/high_mean": 0.005490152951097116, |
| "clip_ratio/low_mean": 0.004159306932706386, |
| "clip_ratio/low_min": 0.0016835585120134056, |
| "clip_ratio/region_mean": 0.009649459869251586, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13773.0, |
| "completions/max_terminated_length": 13773.0, |
| "completions/mean_length": 9316.125, |
| "completions/mean_terminated_length": 9316.125, |
| "completions/min_length": 1200.0, |
| "completions/min_terminated_length": 1200.0, |
| "entropy": 0.09313291660510004, |
| "epoch": 0.00073, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.408350944519043, |
| "kl": 0.34883329924196005, |
| "learning_rate": 4.121010719475882e-05, |
| "loss": 0.0489, |
| "num_tokens": 13135643.0, |
| "reward": 0.2852872610092163, |
| "reward_std": 1.1488020420074463, |
| "rewards/rollout_reward_func/mean": 0.2852872610092163, |
| "rewards/rollout_reward_func/std": 1.1279585361480713, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 1.0009342432022095, |
| "sampling/importance_sampling_ratio/min": 0.2820616066455841, |
| "sampling/sampling_logp_difference/max": 1.31003737449646, |
| "sampling/sampling_logp_difference/mean": 0.01842338591814041, |
| "step": 73, |
| "step_time": 154.65388823600188 |
| }, |
| { |
| "clip_ratio/high_max": 0.013598717050626874, |
| "clip_ratio/high_mean": 0.0074385893531143665, |
| "clip_ratio/low_mean": 0.011851943374495022, |
| "clip_ratio/low_min": 0.005019015457946807, |
| "clip_ratio/region_mean": 0.019290532698505558, |
| "entropy": 0.09284176700748503, |
| "epoch": 0.00074, |
| "grad_norm": 2.88838267326355, |
| "kl": 0.3572141509503126, |
| "learning_rate": 4.077247334828387e-05, |
| "loss": 0.0384, |
| "step": 74, |
| "step_time": 73.51275038800486 |
| }, |
| { |
| "clip_ratio/high_max": 0.008469170046737418, |
| "clip_ratio/high_mean": 0.0051088373438687995, |
| "clip_ratio/low_mean": 0.004827196316909976, |
| "clip_ratio/low_min": 0.00041946308920159936, |
| "clip_ratio/region_mean": 0.00993603361712303, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13591.0, |
| "completions/max_terminated_length": 13591.0, |
| "completions/mean_length": 10868.78125, |
| "completions/mean_terminated_length": 10868.78125, |
| "completions/min_length": 2652.0, |
| "completions/min_terminated_length": 2652.0, |
| "entropy": 0.11976136197336018, |
| "epoch": 0.00075, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.328458070755005, |
| "kl": 0.47066329792141914, |
| "learning_rate": 4.032773347533051e-05, |
| "loss": -0.0744, |
| "num_tokens": 13509593.0, |
| "reward": 0.33224326372146606, |
| "reward_std": 1.1032413244247437, |
| "rewards/rollout_reward_func/mean": 0.33224326372146606, |
| "rewards/rollout_reward_func/std": 1.0542694330215454, |
| "sampling/importance_sampling_ratio/max": 2.090331792831421, |
| "sampling/importance_sampling_ratio/mean": 0.9978272318840027, |
| "sampling/importance_sampling_ratio/min": 0.3244995176792145, |
| "sampling/sampling_logp_difference/max": 1.1254712343215942, |
| "sampling/sampling_logp_difference/mean": 0.01953265815973282, |
| "step": 75, |
| "step_time": 162.99350261499603 |
| }, |
| { |
| "clip_ratio/high_max": 0.013260606676340103, |
| "clip_ratio/high_mean": 0.007932637934572995, |
| "clip_ratio/low_mean": 0.010386872600065544, |
| "clip_ratio/low_min": 0.003419152199057862, |
| "clip_ratio/region_mean": 0.01831951050553471, |
| "entropy": 0.12087863381020725, |
| "epoch": 0.00076, |
| "grad_norm": 2.9509475231170654, |
| "kl": 0.47699476033449173, |
| "learning_rate": 3.9876219457459105e-05, |
| "loss": -0.0953, |
| "step": 76, |
| "step_time": 74.80923694300145 |
| }, |
| { |
| "clip_ratio/high_max": 0.0033680717169772834, |
| "clip_ratio/high_mean": 0.0019300988496979699, |
| "clip_ratio/low_mean": 0.004023112080176361, |
| "clip_ratio/low_min": 0.001683558599324897, |
| "clip_ratio/region_mean": 0.005953210958978161, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13043.0, |
| "completions/max_terminated_length": 13043.0, |
| "completions/mean_length": 10891.125, |
| "completions/mean_terminated_length": 10891.125, |
| "completions/min_length": 2403.0, |
| "completions/min_terminated_length": 2403.0, |
| "entropy": 0.11249894462525845, |
| "epoch": 0.00077, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.528323173522949, |
| "kl": 0.45352290105074644, |
| "learning_rate": 3.9418268231350794e-05, |
| "loss": 0.4185, |
| "num_tokens": 13884272.0, |
| "reward": 0.25572478771209717, |
| "reward_std": 1.1789345741271973, |
| "rewards/rollout_reward_func/mean": 0.25572478771209717, |
| "rewards/rollout_reward_func/std": 1.1531264781951904, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9990130662918091, |
| "sampling/importance_sampling_ratio/min": 0.20195981860160828, |
| "sampling/sampling_logp_difference/max": 1.5996865034103394, |
| "sampling/sampling_logp_difference/mean": 0.018811069428920746, |
| "step": 77, |
| "step_time": 164.1396708480006 |
| }, |
| { |
| "clip_ratio/high_max": 0.009418375324457884, |
| "clip_ratio/high_mean": 0.006461102922912687, |
| "clip_ratio/low_mean": 0.011049802458728664, |
| "clip_ratio/low_min": 0.004669325251597911, |
| "clip_ratio/region_mean": 0.01751090532343369, |
| "entropy": 0.11076454306021333, |
| "epoch": 0.00078, |
| "grad_norm": 2.8760058879852295, |
| "kl": 0.4641501298174262, |
| "learning_rate": 3.8954221537372784e-05, |
| "loss": 0.4054, |
| "step": 78, |
| "step_time": 72.70563589599624 |
| }, |
| { |
| "clip_ratio/high_max": 0.005599236872512847, |
| "clip_ratio/high_mean": 0.0027996184362564236, |
| "clip_ratio/low_mean": 0.004529917219770141, |
| "clip_ratio/low_min": 0.00042517005931586027, |
| "clip_ratio/region_mean": 0.007329535641474649, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13665.0, |
| "completions/max_terminated_length": 13665.0, |
| "completions/mean_length": 10477.25, |
| "completions/mean_terminated_length": 10477.25, |
| "completions/min_length": 3171.0, |
| "completions/min_terminated_length": 3171.0, |
| "entropy": 0.10928369383327663, |
| "epoch": 0.00079, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.4090378284454346, |
| "kl": 0.504779901355505, |
| "learning_rate": 3.848442566455879e-05, |
| "loss": 0.2142, |
| "num_tokens": 14245453.0, |
| "reward": 0.39725062251091003, |
| "reward_std": 1.1237173080444336, |
| "rewards/rollout_reward_func/mean": 0.39725062251091003, |
| "rewards/rollout_reward_func/std": 1.0794570446014404, |
| "sampling/importance_sampling_ratio/max": 2.5544402599334717, |
| "sampling/importance_sampling_ratio/mean": 1.0012173652648926, |
| "sampling/importance_sampling_ratio/min": 0.12977543473243713, |
| "sampling/sampling_logp_difference/max": 2.04194974899292, |
| "sampling/sampling_logp_difference/mean": 0.01983896642923355, |
| "step": 79, |
| "step_time": 164.4674953970025 |
| }, |
| { |
| "clip_ratio/high_max": 0.00996807124465704, |
| "clip_ratio/high_mean": 0.005246640677796677, |
| "clip_ratio/low_mean": 0.01202807053050492, |
| "clip_ratio/low_min": 0.006143381004221737, |
| "clip_ratio/region_mean": 0.017274711281061172, |
| "entropy": 0.10772595112212002, |
| "epoch": 0.0008, |
| "grad_norm": 3.7778007984161377, |
| "kl": 0.5062860492616892, |
| "learning_rate": 3.800923119219528e-05, |
| "loss": 0.1939, |
| "step": 80, |
| "step_time": 74.33145354599947 |
| }, |
| { |
| "clip_ratio/high_max": 0.009137832559645176, |
| "clip_ratio/high_mean": 0.00524506566580385, |
| "clip_ratio/low_mean": 0.007272154442034662, |
| "clip_ratio/low_min": 0.002546385396271944, |
| "clip_ratio/region_mean": 0.012517220136942342, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13498.0, |
| "completions/max_terminated_length": 13498.0, |
| "completions/mean_length": 9313.21875, |
| "completions/mean_terminated_length": 9313.21875, |
| "completions/min_length": 1181.0, |
| "completions/min_terminated_length": 1181.0, |
| "entropy": 0.08699527697172016, |
| "epoch": 0.00081, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.986673593521118, |
| "kl": 0.5458228345960379, |
| "learning_rate": 3.752899272820599e-05, |
| "loss": -0.1808, |
| "num_tokens": 14569692.0, |
| "reward": 0.2744262218475342, |
| "reward_std": 1.180228352546692, |
| "rewards/rollout_reward_func/mean": 0.2744262218475342, |
| "rewards/rollout_reward_func/std": 1.1685067415237427, |
| "sampling/importance_sampling_ratio/max": 2.5544402599334717, |
| "sampling/importance_sampling_ratio/mean": 1.0001392364501953, |
| "sampling/importance_sampling_ratio/min": 0.21354326605796814, |
| "sampling/sampling_logp_difference/max": 1.543915867805481, |
| "sampling/sampling_logp_difference/mean": 0.017606310546398163, |
| "step": 81, |
| "step_time": 156.98116288799793 |
| }, |
| { |
| "clip_ratio/high_max": 0.0205975886201486, |
| "clip_ratio/high_mean": 0.011662664590403438, |
| "clip_ratio/low_mean": 0.006897521496284753, |
| "clip_ratio/low_min": 0.0013111887965351343, |
| "clip_ratio/region_mean": 0.01856018614489585, |
| "entropy": 0.08844376017805189, |
| "epoch": 0.00082, |
| "grad_norm": 3.179837465286255, |
| "kl": 0.49759334325790405, |
| "learning_rate": 3.7044068644530266e-05, |
| "loss": -0.1825, |
| "step": 82, |
| "step_time": 72.62646461700206 |
| }, |
| { |
| "clip_ratio/high_max": 0.00906756124459207, |
| "clip_ratio/high_mean": 0.00509181636152789, |
| "clip_ratio/low_mean": 0.0038351798575604334, |
| "clip_ratio/low_min": 0.0012641340435948223, |
| "clip_ratio/region_mean": 0.008926996189984493, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13279.0, |
| "completions/max_terminated_length": 13279.0, |
| "completions/mean_length": 9933.28125, |
| "completions/mean_terminated_length": 9933.28125, |
| "completions/min_length": 661.0, |
| "completions/min_terminated_length": 661.0, |
| "entropy": 0.09674874134361744, |
| "epoch": 0.00083, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.155754566192627, |
| "kl": 0.5548635525628924, |
| "learning_rate": 3.6554820809692434e-05, |
| "loss": 0.5622, |
| "num_tokens": 14913694.0, |
| "reward": 0.4627847671508789, |
| "reward_std": 1.0783663988113403, |
| "rewards/rollout_reward_func/mean": 0.4627847671508789, |
| "rewards/rollout_reward_func/std": 1.0711275339126587, |
| "sampling/importance_sampling_ratio/max": 2.7722387313842773, |
| "sampling/importance_sampling_ratio/mean": 1.0004757642745972, |
| "sampling/importance_sampling_ratio/min": 0.5378848314285278, |
| "sampling/sampling_logp_difference/max": 1.0196552276611328, |
| "sampling/sampling_logp_difference/mean": 0.017515596002340317, |
| "step": 83, |
| "step_time": 157.01505748699674 |
| }, |
| { |
| "clip_ratio/high_max": 0.014731630391906947, |
| "clip_ratio/high_mean": 0.0073658151959534734, |
| "clip_ratio/low_mean": 0.0047907959669828415, |
| "clip_ratio/low_min": 0.0017347846878692508, |
| "clip_ratio/region_mean": 0.012156611250247806, |
| "entropy": 0.09983515413478017, |
| "epoch": 0.00084, |
| "grad_norm": 2.53806471824646, |
| "kl": 0.5409666234627366, |
| "learning_rate": 3.606161431876201e-05, |
| "loss": 0.5411, |
| "step": 84, |
| "step_time": 71.1588070430007 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 150, |
| "num_input_tokens_seen": 14913694, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|