gin_rummy-v1.0.18 / trainer_state.json
iamPi's picture
Upload folder using huggingface_hub
ffea1bc verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.00084,
"eval_steps": 500,
"global_step": 84,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13167.0,
"completions/max_terminated_length": 13167.0,
"completions/mean_length": 9931.84375,
"completions/mean_terminated_length": 9931.84375,
"completions/min_length": 204.0,
"completions/min_terminated_length": 204.0,
"entropy": 0.02259049008716829,
"epoch": 1e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.972987651824951,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.4677,
"num_tokens": 344589.0,
"reward": -0.5509895086288452,
"reward_std": 0.49180489778518677,
"rewards/rollout_reward_func/mean": -0.5509895086288452,
"rewards/rollout_reward_func/std": 0.5359447598457336,
"sampling/importance_sampling_ratio/max": 2.058147430419922,
"sampling/importance_sampling_ratio/mean": 0.9989001750946045,
"sampling/importance_sampling_ratio/min": 0.06982824206352234,
"sampling/sampling_logp_difference/max": 2.6617166996002197,
"sampling/sampling_logp_difference/mean": 0.007194924633949995,
"step": 1,
"step_time": 156.04581609299998
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.02259049008716829,
"epoch": 2e-05,
"grad_norm": 6.000457286834717,
"kl": 0.0,
"learning_rate": 1.4285714285714286e-06,
"loss": 0.4677,
"step": 2,
"step_time": 72.52690972699997
},
{
"clip_ratio/high_max": 0.0052298564405646175,
"clip_ratio/high_mean": 0.0026149282202823088,
"clip_ratio/low_mean": 0.0035879433271475136,
"clip_ratio/low_min": 0.00041666667675599456,
"clip_ratio/region_mean": 0.006202871503774077,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13398.0,
"completions/max_terminated_length": 13398.0,
"completions/mean_length": 11894.3125,
"completions/mean_terminated_length": 11894.3125,
"completions/min_length": 7109.0,
"completions/min_terminated_length": 7109.0,
"entropy": 0.023217253852635622,
"epoch": 3e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.642491340637207,
"kl": 0.0027546791861823294,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.2037,
"num_tokens": 752299.0,
"reward": -0.48321714997291565,
"reward_std": 0.7304716110229492,
"rewards/rollout_reward_func/mean": -0.48321714997291565,
"rewards/rollout_reward_func/std": 0.7494373917579651,
"sampling/importance_sampling_ratio/max": 3.0,
"sampling/importance_sampling_ratio/mean": 0.9993765354156494,
"sampling/importance_sampling_ratio/min": 0.08995848894119263,
"sampling/sampling_logp_difference/max": 2.408406972885132,
"sampling/sampling_logp_difference/mean": 0.009509067051112652,
"step": 3,
"step_time": 167.4341570240008
},
{
"clip_ratio/high_max": 0.0048466095759067684,
"clip_ratio/high_mean": 0.0024233047879533842,
"clip_ratio/low_mean": 0.0035862942750100046,
"clip_ratio/low_min": 0.0010411091789137572,
"clip_ratio/region_mean": 0.006009599004755728,
"entropy": 0.02273811263148673,
"epoch": 4e-05,
"grad_norm": 6.468554496765137,
"kl": 0.0035955551825281873,
"learning_rate": 4.285714285714286e-06,
"loss": 0.2014,
"step": 4,
"step_time": 75.431375376
},
{
"clip_ratio/high_max": 0.0030947362538427114,
"clip_ratio/high_mean": 0.0017841105291154236,
"clip_ratio/low_mean": 0.002839852197212167,
"clip_ratio/low_min": 0.0010658372775651515,
"clip_ratio/region_mean": 0.004623962740879506,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12874.0,
"completions/max_terminated_length": 12874.0,
"completions/mean_length": 11251.96875,
"completions/mean_terminated_length": 11251.96875,
"completions/min_length": 3238.0,
"completions/min_terminated_length": 3238.0,
"entropy": 0.024516460485756397,
"epoch": 5e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.106947422027588,
"kl": 0.0033718565209710505,
"learning_rate": 5.7142857142857145e-06,
"loss": -0.2472,
"num_tokens": 1138956.0,
"reward": -0.43200355768203735,
"reward_std": 0.6725019812583923,
"rewards/rollout_reward_func/mean": -0.43200355768203735,
"rewards/rollout_reward_func/std": 0.7308401465415955,
"sampling/importance_sampling_ratio/max": 2.691387414932251,
"sampling/importance_sampling_ratio/mean": 1.000849723815918,
"sampling/importance_sampling_ratio/min": 0.28908851742744446,
"sampling/sampling_logp_difference/max": 1.2410223484039307,
"sampling/sampling_logp_difference/mean": 0.00824270211160183,
"step": 5,
"step_time": 160.70129029100008
},
{
"clip_ratio/high_max": 0.0022010681277606636,
"clip_ratio/high_mean": 0.0015363333659479395,
"clip_ratio/low_mean": 0.002654661060660146,
"clip_ratio/low_min": 0.0010658372775651515,
"clip_ratio/region_mean": 0.004190994426608086,
"entropy": 0.024331653432454914,
"epoch": 6e-05,
"grad_norm": 5.248446941375732,
"kl": 0.003470558443950722,
"learning_rate": 7.142857142857143e-06,
"loss": -0.2503,
"step": 6,
"step_time": 70.39162059900036
},
{
"clip_ratio/high_max": 0.004718718817457557,
"clip_ratio/high_mean": 0.0023593594087287784,
"clip_ratio/low_mean": 0.00323545208084397,
"clip_ratio/low_min": 0.0008474673668388277,
"clip_ratio/region_mean": 0.0055948115477804095,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13424.0,
"completions/max_terminated_length": 13424.0,
"completions/mean_length": 10642.4375,
"completions/mean_terminated_length": 10642.4375,
"completions/min_length": 1097.0,
"completions/min_terminated_length": 1097.0,
"entropy": 0.02454580759513192,
"epoch": 7e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.2174973487854,
"kl": 0.002597570359284873,
"learning_rate": 8.571428571428573e-06,
"loss": 0.4074,
"num_tokens": 1506776.0,
"reward": -0.4508303999900818,
"reward_std": 0.856971263885498,
"rewards/rollout_reward_func/mean": -0.4508303999900818,
"rewards/rollout_reward_func/std": 0.875674307346344,
"sampling/importance_sampling_ratio/max": 2.2130932807922363,
"sampling/importance_sampling_ratio/mean": 0.9991341829299927,
"sampling/importance_sampling_ratio/min": 0.09515064209699631,
"sampling/sampling_logp_difference/max": 2.3522939682006836,
"sampling/sampling_logp_difference/mean": 0.007671562489122152,
"step": 7,
"step_time": 167.14791396400005
},
{
"clip_ratio/high_max": 0.005202832107897848,
"clip_ratio/high_mean": 0.002601416053948924,
"clip_ratio/low_mean": 0.0041243805608246475,
"clip_ratio/low_min": 0.0012697646743617952,
"clip_ratio/region_mean": 0.006725796643877402,
"entropy": 0.02552157419268042,
"epoch": 8e-05,
"grad_norm": 3.9264698028564453,
"kl": 0.0049671942251734436,
"learning_rate": 1e-05,
"loss": 0.402,
"step": 8,
"step_time": 75.2512055050006
},
{
"clip_ratio/high_max": 0.006363416585372761,
"clip_ratio/high_mean": 0.004113822695217095,
"clip_ratio/low_mean": 0.0022823161561973393,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006396138880518265,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13250.0,
"completions/max_terminated_length": 13250.0,
"completions/mean_length": 11233.6875,
"completions/mean_terminated_length": 11233.6875,
"completions/min_length": 1185.0,
"completions/min_terminated_length": 1185.0,
"entropy": 0.027631424833089113,
"epoch": 9e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.5448150634765625,
"kl": 0.00364598065152677,
"learning_rate": 1.1428571428571429e-05,
"loss": -0.0485,
"num_tokens": 1893166.0,
"reward": -0.5741807222366333,
"reward_std": 0.40697890520095825,
"rewards/rollout_reward_func/mean": -0.5741807222366333,
"rewards/rollout_reward_func/std": 0.4363042116165161,
"sampling/importance_sampling_ratio/max": 3.0,
"sampling/importance_sampling_ratio/mean": 1.0008200407028198,
"sampling/importance_sampling_ratio/min": 0.09258205443620682,
"sampling/sampling_logp_difference/max": 2.37965989112854,
"sampling/sampling_logp_difference/mean": 0.009641825221478939,
"step": 9,
"step_time": 165.40504330000158
},
{
"clip_ratio/high_max": 0.005221198371145874,
"clip_ratio/high_mean": 0.0032498103246325627,
"clip_ratio/low_mean": 0.0037138201732886955,
"clip_ratio/low_min": 0.00042517005931586027,
"clip_ratio/region_mean": 0.006963630527025089,
"entropy": 0.028140071080997586,
"epoch": 0.0001,
"grad_norm": 5.636337757110596,
"kl": 0.005874367059732322,
"learning_rate": 1.2857142857142857e-05,
"loss": -0.0496,
"step": 10,
"step_time": 74.03838988500002
},
{
"clip_ratio/high_max": 0.009417513123480603,
"clip_ratio/high_mean": 0.0053452335559995845,
"clip_ratio/low_mean": 0.002856071077985689,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008201304633985274,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13467.0,
"completions/max_terminated_length": 13467.0,
"completions/mean_length": 10857.9375,
"completions/mean_terminated_length": 10857.9375,
"completions/min_length": 1234.0,
"completions/min_terminated_length": 1234.0,
"entropy": 0.025376274134032428,
"epoch": 0.00011,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.4508001804351807,
"kl": 0.007294285507668974,
"learning_rate": 1.4285714285714285e-05,
"loss": 0.4251,
"num_tokens": 2267468.0,
"reward": -0.41874319314956665,
"reward_std": 0.7331498861312866,
"rewards/rollout_reward_func/mean": -0.41874319314956665,
"rewards/rollout_reward_func/std": 0.7618294954299927,
"sampling/importance_sampling_ratio/max": 2.2565839290618896,
"sampling/importance_sampling_ratio/mean": 0.9988585114479065,
"sampling/importance_sampling_ratio/min": 0.19103538990020752,
"sampling/sampling_logp_difference/max": 1.6552965641021729,
"sampling/sampling_logp_difference/mean": 0.007842643186450005,
"step": 11,
"step_time": 168.32780298800117
},
{
"clip_ratio/high_max": 0.010110442468430847,
"clip_ratio/high_mean": 0.005691698199370876,
"clip_ratio/low_mean": 0.0022150053700897843,
"clip_ratio/low_min": 0.00042808218859136105,
"clip_ratio/region_mean": 0.007906703525804915,
"entropy": 0.0246156333014369,
"epoch": 0.00012,
"grad_norm": 3.3792054653167725,
"kl": 0.00864831962826429,
"learning_rate": 1.5714285714285715e-05,
"loss": 0.4308,
"step": 12,
"step_time": 75.09417690599867
},
{
"clip_ratio/high_max": 0.007237050449475646,
"clip_ratio/high_mean": 0.003618525224737823,
"clip_ratio/low_mean": 0.004828559438465163,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008447084634099156,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12770.0,
"completions/max_terminated_length": 12770.0,
"completions/mean_length": 9700.59375,
"completions/mean_terminated_length": 9700.59375,
"completions/min_length": 678.0,
"completions/min_terminated_length": 678.0,
"entropy": 0.02448252754402347,
"epoch": 0.00013,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.033958435058594,
"kl": 0.008902751103960327,
"learning_rate": 1.7142857142857145e-05,
"loss": 0.9859,
"num_tokens": 2605064.0,
"reward": -0.3038055896759033,
"reward_std": 0.6740490198135376,
"rewards/rollout_reward_func/mean": -0.3038055896759033,
"rewards/rollout_reward_func/std": 0.7864021062850952,
"sampling/importance_sampling_ratio/max": 2.7903988361358643,
"sampling/importance_sampling_ratio/mean": 1.0011733770370483,
"sampling/importance_sampling_ratio/min": 0.015171236358582973,
"sampling/sampling_logp_difference/max": 4.188354015350342,
"sampling/sampling_logp_difference/mean": 0.007430948317050934,
"step": 13,
"step_time": 154.12419534999935
},
{
"clip_ratio/high_max": 0.002730079897446558,
"clip_ratio/high_mean": 0.001365039948723279,
"clip_ratio/low_mean": 0.006960608443478122,
"clip_ratio/low_min": 0.00041666667675599456,
"clip_ratio/region_mean": 0.008325648421305232,
"entropy": 0.024137669446645305,
"epoch": 0.00014,
"grad_norm": 3.100346565246582,
"kl": 0.008414470611569413,
"learning_rate": 1.8571428571428572e-05,
"loss": 0.9784,
"step": 14,
"step_time": 69.95948773200143
},
{
"clip_ratio/high_max": 0.006266110052820295,
"clip_ratio/high_mean": 0.003558225085726008,
"clip_ratio/low_mean": 0.004969277448253706,
"clip_ratio/low_min": 0.00042517005931586027,
"clip_ratio/region_mean": 0.008527502533979714,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13632.0,
"completions/max_terminated_length": 13632.0,
"completions/mean_length": 10339.125,
"completions/mean_terminated_length": 10339.125,
"completions/min_length": 1158.0,
"completions/min_terminated_length": 1158.0,
"entropy": 0.024794226861558855,
"epoch": 0.00015,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.3704771995544434,
"kl": 0.015760348596813856,
"learning_rate": 2e-05,
"loss": -0.1,
"num_tokens": 2963008.0,
"reward": -0.6195909976959229,
"reward_std": 0.6979167461395264,
"rewards/rollout_reward_func/mean": -0.6195909976959229,
"rewards/rollout_reward_func/std": 0.7056158781051636,
"sampling/importance_sampling_ratio/max": 2.3136019706726074,
"sampling/importance_sampling_ratio/mean": 1.000252366065979,
"sampling/importance_sampling_ratio/min": 0.14486655592918396,
"sampling/sampling_logp_difference/max": 1.9319422245025635,
"sampling/sampling_logp_difference/mean": 0.0070663755759596825,
"step": 15,
"step_time": 162.4636791780008
},
{
"clip_ratio/high_max": 0.008671565796248615,
"clip_ratio/high_mean": 0.005187579081393778,
"clip_ratio/low_mean": 0.0059419748722575605,
"clip_ratio/low_min": 0.00042517005931586027,
"clip_ratio/region_mean": 0.011129553953651339,
"entropy": 0.023938709287904203,
"epoch": 0.00016,
"grad_norm": 3.0196239948272705,
"kl": 0.016887567695448524,
"learning_rate": 2.1428571428571428e-05,
"loss": -0.1029,
"step": 16,
"step_time": 75.08079870300026
},
{
"clip_ratio/high_max": 0.00510084442794323,
"clip_ratio/high_mean": 0.002550422213971615,
"clip_ratio/low_mean": 0.004087600493221544,
"clip_ratio/low_min": 0.0016365568444598466,
"clip_ratio/region_mean": 0.006638022648985498,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13272.0,
"completions/max_terminated_length": 13272.0,
"completions/mean_length": 10441.21875,
"completions/mean_terminated_length": 10441.21875,
"completions/min_length": 2067.0,
"completions/min_terminated_length": 2067.0,
"entropy": 0.027783271041698754,
"epoch": 0.00017,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.9961190223693848,
"kl": 0.018047706194920465,
"learning_rate": 2.2857142857142858e-05,
"loss": 1.118,
"num_tokens": 3324273.0,
"reward": -0.22455403208732605,
"reward_std": 0.9343163371086121,
"rewards/rollout_reward_func/mean": -0.22455403208732605,
"rewards/rollout_reward_func/std": 0.9539775252342224,
"sampling/importance_sampling_ratio/max": 2.838672637939453,
"sampling/importance_sampling_ratio/mean": 0.9985122680664062,
"sampling/importance_sampling_ratio/min": 0.2129591554403305,
"sampling/sampling_logp_difference/max": 1.5466549396514893,
"sampling/sampling_logp_difference/mean": 0.011087952181696892,
"step": 17,
"step_time": 162.3905202420001
},
{
"clip_ratio/high_max": 0.005912428721785545,
"clip_ratio/high_mean": 0.0029562143608927727,
"clip_ratio/low_mean": 0.0026932653854601085,
"clip_ratio/low_min": 0.0005040322430431843,
"clip_ratio/region_mean": 0.005649479775456712,
"entropy": 0.028106234036386013,
"epoch": 0.00018,
"grad_norm": 3.900911569595337,
"kl": 0.03091182082789601,
"learning_rate": 2.4285714285714288e-05,
"loss": 1.1189,
"step": 18,
"step_time": 74.16337497800077
},
{
"clip_ratio/high_max": 0.01214791223173961,
"clip_ratio/high_mean": 0.006297170388279483,
"clip_ratio/low_mean": 0.0026435064210090786,
"clip_ratio/low_min": 0.00042517005931586027,
"clip_ratio/region_mean": 0.008940676809288561,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12931.0,
"completions/max_terminated_length": 12931.0,
"completions/mean_length": 10159.4375,
"completions/mean_terminated_length": 10159.4375,
"completions/min_length": 1245.0,
"completions/min_terminated_length": 1245.0,
"entropy": 0.028414718341082335,
"epoch": 0.00019,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.980042934417725,
"kl": 0.029700799204874784,
"learning_rate": 2.5714285714285714e-05,
"loss": 0.2205,
"num_tokens": 3676437.0,
"reward": -0.5079509615898132,
"reward_std": 0.6904375553131104,
"rewards/rollout_reward_func/mean": -0.5079509615898132,
"rewards/rollout_reward_func/std": 0.7216994762420654,
"sampling/importance_sampling_ratio/max": 2.166841745376587,
"sampling/importance_sampling_ratio/mean": 0.9985523223876953,
"sampling/importance_sampling_ratio/min": 0.19324928522109985,
"sampling/sampling_logp_difference/max": 1.6437742710113525,
"sampling/sampling_logp_difference/mean": 0.008595403283834457,
"step": 19,
"step_time": 156.51792565699907
},
{
"clip_ratio/high_max": 0.018410747725283727,
"clip_ratio/high_mean": 0.009882654994726181,
"clip_ratio/low_mean": 0.00612688584078569,
"clip_ratio/low_min": 0.0012755101779475808,
"clip_ratio/region_mean": 0.016009540820959955,
"entropy": 0.02830331851146184,
"epoch": 0.0002,
"grad_norm": 4.322617530822754,
"kl": 0.044774680165573955,
"learning_rate": 2.714285714285714e-05,
"loss": 0.22,
"step": 20,
"step_time": 71.70477191100008
},
{
"clip_ratio/high_max": 0.003913487191312015,
"clip_ratio/high_mean": 0.0021989916858728975,
"clip_ratio/low_mean": 0.0029591854254249483,
"clip_ratio/low_min": 0.0004562043759506196,
"clip_ratio/region_mean": 0.005158177111297846,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13476.0,
"completions/max_terminated_length": 13476.0,
"completions/mean_length": 9885.6875,
"completions/mean_terminated_length": 9885.6875,
"completions/min_length": 2196.0,
"completions/min_terminated_length": 2196.0,
"entropy": 0.02020650013582781,
"epoch": 0.00021,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.4713399410247803,
"kl": 0.037866046448471025,
"learning_rate": 2.857142857142857e-05,
"loss": 1.0734,
"num_tokens": 4019800.0,
"reward": -0.24059510231018066,
"reward_std": 0.9731942415237427,
"rewards/rollout_reward_func/mean": -0.24059510231018066,
"rewards/rollout_reward_func/std": 1.004063367843628,
"sampling/importance_sampling_ratio/max": 2.5626845359802246,
"sampling/importance_sampling_ratio/mean": 0.999998927116394,
"sampling/importance_sampling_ratio/min": 0.34062984585762024,
"sampling/sampling_logp_difference/max": 1.0769588947296143,
"sampling/sampling_logp_difference/mean": 0.005168822128325701,
"step": 21,
"step_time": 156.87602913699993
},
{
"clip_ratio/high_max": 0.005465811875183135,
"clip_ratio/high_mean": 0.0027329059375915676,
"clip_ratio/low_mean": 0.003968855453422293,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006701761391013861,
"entropy": 0.020286828803364187,
"epoch": 0.00022,
"grad_norm": 2.8511836528778076,
"kl": 0.05319512345158728,
"learning_rate": 3e-05,
"loss": 1.0666,
"step": 22,
"step_time": 73.16974372700179
},
{
"clip_ratio/high_max": 0.00847262708703056,
"clip_ratio/high_mean": 0.00423631354351528,
"clip_ratio/low_mean": 0.002743131757597439,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006979445330216549,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12993.0,
"completions/max_terminated_length": 12993.0,
"completions/mean_length": 9312.96875,
"completions/mean_terminated_length": 9312.96875,
"completions/min_length": 1575.0,
"completions/min_terminated_length": 1575.0,
"entropy": 0.023286236566491425,
"epoch": 0.00023,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.416781902313232,
"kl": 0.08850510988850147,
"learning_rate": 3.142857142857143e-05,
"loss": 0.8796,
"num_tokens": 4344597.0,
"reward": -0.15241163969039917,
"reward_std": 0.7749617099761963,
"rewards/rollout_reward_func/mean": -0.15241163969039917,
"rewards/rollout_reward_func/std": 0.971662700176239,
"sampling/importance_sampling_ratio/max": 3.0,
"sampling/importance_sampling_ratio/mean": 0.9986792802810669,
"sampling/importance_sampling_ratio/min": 0.13984200358390808,
"sampling/sampling_logp_difference/max": 1.9672420024871826,
"sampling/sampling_logp_difference/mean": 0.008968186564743519,
"step": 23,
"step_time": 149.11503398000104
},
{
"clip_ratio/high_max": 0.008956918376497924,
"clip_ratio/high_mean": 0.004696990654338151,
"clip_ratio/low_mean": 0.002034541597822681,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006731532179401256,
"entropy": 0.02346635900903493,
"epoch": 0.00024,
"grad_norm": 2.7925429344177246,
"kl": 0.11299954203423113,
"learning_rate": 3.285714285714286e-05,
"loss": 0.8791,
"step": 24,
"step_time": 68.35307336800088
},
{
"clip_ratio/high_max": 0.007403911498840898,
"clip_ratio/high_mean": 0.003701955749420449,
"clip_ratio/low_mean": 0.0032710890081943944,
"clip_ratio/low_min": 0.00048449612222611904,
"clip_ratio/region_mean": 0.006973044728511013,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13149.0,
"completions/max_terminated_length": 13149.0,
"completions/mean_length": 10183.96875,
"completions/mean_terminated_length": 10183.96875,
"completions/min_length": 1690.0,
"completions/min_terminated_length": 1690.0,
"entropy": 0.02520149474730715,
"epoch": 0.00025,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.3123133182525635,
"kl": 0.08004874712787569,
"learning_rate": 3.428571428571429e-05,
"loss": -0.2265,
"num_tokens": 4696978.0,
"reward": -0.39890268445014954,
"reward_std": 0.4381568431854248,
"rewards/rollout_reward_func/mean": -0.39890268445014954,
"rewards/rollout_reward_func/std": 0.5122197270393372,
"sampling/importance_sampling_ratio/max": 2.404775381088257,
"sampling/importance_sampling_ratio/mean": 0.9991356730461121,
"sampling/importance_sampling_ratio/min": 0.11307276040315628,
"sampling/sampling_logp_difference/max": 2.1797237396240234,
"sampling/sampling_logp_difference/mean": 0.008621575310826302,
"step": 25,
"step_time": 153.41606010800206
},
{
"clip_ratio/high_max": 0.003569191030692309,
"clip_ratio/high_mean": 0.002021337946644053,
"clip_ratio/low_mean": 0.005834054827573709,
"clip_ratio/low_min": 0.000972777372226119,
"clip_ratio/region_mean": 0.007855392774217762,
"entropy": 0.02575664728647098,
"epoch": 0.00026,
"grad_norm": 3.546144962310791,
"kl": 0.07968595699639991,
"learning_rate": 3.571428571428572e-05,
"loss": -0.2349,
"step": 26,
"step_time": 71.74039238800015
},
{
"clip_ratio/high_max": 0.011878328310558572,
"clip_ratio/high_mean": 0.007235260869492777,
"clip_ratio/low_mean": 0.002440269570797682,
"clip_ratio/low_min": 0.0017878417274914682,
"clip_ratio/region_mean": 0.009675530440290459,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13152.0,
"completions/max_terminated_length": 13152.0,
"completions/mean_length": 9718.9375,
"completions/mean_terminated_length": 9718.9375,
"completions/min_length": 1111.0,
"completions/min_terminated_length": 1111.0,
"entropy": 0.04362651810515672,
"epoch": 0.00027,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.255147933959961,
"kl": 0.1335945016471669,
"learning_rate": 3.7142857142857143e-05,
"loss": 0.4813,
"num_tokens": 5034383.0,
"reward": -0.2570054233074188,
"reward_std": 0.6717403531074524,
"rewards/rollout_reward_func/mean": -0.2570054233074188,
"rewards/rollout_reward_func/std": 0.800593376159668,
"sampling/importance_sampling_ratio/max": 3.0,
"sampling/importance_sampling_ratio/mean": 0.9988204836845398,
"sampling/importance_sampling_ratio/min": 0.2177802473306656,
"sampling/sampling_logp_difference/max": 1.971644401550293,
"sampling/sampling_logp_difference/mean": 0.013164675794541836,
"step": 27,
"step_time": 154.97156673099835
},
{
"clip_ratio/high_max": 0.014473556191660464,
"clip_ratio/high_mean": 0.008749909291509539,
"clip_ratio/low_mean": 0.004799299247679301,
"clip_ratio/low_min": 0.001802365412004292,
"clip_ratio/region_mean": 0.013549208524636924,
"entropy": 0.04518118337728083,
"epoch": 0.00028,
"grad_norm": 5.101240158081055,
"kl": 0.12378271645866334,
"learning_rate": 3.857142857142858e-05,
"loss": 0.4689,
"step": 28,
"step_time": 70.95561770900167
},
{
"clip_ratio/high_max": 0.005707955948309973,
"clip_ratio/high_mean": 0.0035324228374520317,
"clip_ratio/low_mean": 0.002015564765315503,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0055479876027675346,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13635.0,
"completions/max_terminated_length": 13635.0,
"completions/mean_length": 10821.4375,
"completions/mean_terminated_length": 10821.4375,
"completions/min_length": 3160.0,
"completions/min_terminated_length": 3160.0,
"entropy": 0.03376710624434054,
"epoch": 0.00029,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.056061267852783,
"kl": 0.10205801925621927,
"learning_rate": 4e-05,
"loss": 0.217,
"num_tokens": 5407036.0,
"reward": -0.5185161828994751,
"reward_std": 0.492318719625473,
"rewards/rollout_reward_func/mean": -0.5185161828994751,
"rewards/rollout_reward_func/std": 0.5280240774154663,
"sampling/importance_sampling_ratio/max": 3.0,
"sampling/importance_sampling_ratio/mean": 1.0005357265472412,
"sampling/importance_sampling_ratio/min": 0.12561221420764923,
"sampling/sampling_logp_difference/max": 2.0745558738708496,
"sampling/sampling_logp_difference/mean": 0.010849224403500557,
"step": 29,
"step_time": 163.0567665669996
},
{
"clip_ratio/high_max": 0.01110520790098235,
"clip_ratio/high_mean": 0.006027282943250611,
"clip_ratio/low_mean": 0.003446056245593354,
"clip_ratio/low_min": 0.00042808218859136105,
"clip_ratio/region_mean": 0.009473339232499711,
"entropy": 0.03455971780931577,
"epoch": 0.0003,
"grad_norm": 3.1213033199310303,
"kl": 0.09978742833482102,
"learning_rate": 4.1428571428571437e-05,
"loss": 0.1998,
"step": 30,
"step_time": 75.22149042100045
},
{
"clip_ratio/high_max": 0.007910042797448114,
"clip_ratio/high_mean": 0.004427617866895162,
"clip_ratio/low_mean": 0.004107993547222577,
"clip_ratio/low_min": 0.0005040322430431843,
"clip_ratio/region_mean": 0.008535611428669654,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13526.0,
"completions/max_terminated_length": 13526.0,
"completions/mean_length": 10820.34375,
"completions/mean_terminated_length": 10820.34375,
"completions/min_length": 1826.0,
"completions/min_terminated_length": 1826.0,
"entropy": 0.03883296772255562,
"epoch": 0.00031,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.4302663803100586,
"kl": 0.0946993782708887,
"learning_rate": 4.2857142857142856e-05,
"loss": 0.1392,
"num_tokens": 5779856.0,
"reward": -0.35248619318008423,
"reward_std": 0.7979208827018738,
"rewards/rollout_reward_func/mean": -0.35248619318008423,
"rewards/rollout_reward_func/std": 0.8563511967658997,
"sampling/importance_sampling_ratio/max": 3.0,
"sampling/importance_sampling_ratio/mean": 0.9991589188575745,
"sampling/importance_sampling_ratio/min": 0.1591530442237854,
"sampling/sampling_logp_difference/max": 1.8378889560699463,
"sampling/sampling_logp_difference/mean": 0.010356503538787365,
"step": 31,
"step_time": 167.77319714499845
},
{
"clip_ratio/high_max": 0.013770086516160518,
"clip_ratio/high_mean": 0.0080132340954151,
"clip_ratio/low_mean": 0.006177089671837166,
"clip_ratio/low_min": 0.0004960317746736109,
"clip_ratio/region_mean": 0.014190323767252266,
"entropy": 0.04003481334075332,
"epoch": 0.00032,
"grad_norm": 2.200430154800415,
"kl": 0.0942922827671282,
"learning_rate": 4.428571428571428e-05,
"loss": 0.1266,
"step": 32,
"step_time": 75.98298019100093
},
{
"clip_ratio/high_max": 0.0056163399131037295,
"clip_ratio/high_mean": 0.0028081699565518647,
"clip_ratio/low_mean": 0.007905823076725937,
"clip_ratio/low_min": 0.004023919755127281,
"clip_ratio/region_mean": 0.010713993047829717,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13762.0,
"completions/max_terminated_length": 13762.0,
"completions/mean_length": 10885.375,
"completions/mean_terminated_length": 10885.375,
"completions/min_length": 803.0,
"completions/min_terminated_length": 803.0,
"entropy": 0.0625891622621566,
"epoch": 0.00033,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.534822463989258,
"kl": 0.1454108317848295,
"learning_rate": 4.5714285714285716e-05,
"loss": 0.6602,
"num_tokens": 6154484.0,
"reward": -0.13570240139961243,
"reward_std": 0.933979332447052,
"rewards/rollout_reward_func/mean": -0.13570240139961243,
"rewards/rollout_reward_func/std": 1.0047199726104736,
"sampling/importance_sampling_ratio/max": 3.0,
"sampling/importance_sampling_ratio/mean": 1.001332402229309,
"sampling/importance_sampling_ratio/min": 0.18328945338726044,
"sampling/sampling_logp_difference/max": 1.9671940803527832,
"sampling/sampling_logp_difference/mean": 0.01589260809123516,
"step": 33,
"step_time": 164.3425249209995
},
{
"clip_ratio/high_max": 0.012784746824763715,
"clip_ratio/high_mean": 0.0071671667392365634,
"clip_ratio/low_mean": 0.011323995684506372,
"clip_ratio/low_min": 0.00381069362629205,
"clip_ratio/region_mean": 0.018491162394639105,
"entropy": 0.06436851329635829,
"epoch": 0.00034,
"grad_norm": 4.530355453491211,
"kl": 0.15744604653446004,
"learning_rate": 4.714285714285714e-05,
"loss": 0.6463,
"step": 34,
"step_time": 75.79230114999973
},
{
"clip_ratio/high_max": 0.0122655353625305,
"clip_ratio/high_mean": 0.006788362079532817,
"clip_ratio/low_mean": 0.0035715404665097594,
"clip_ratio/low_min": 0.00042808218859136105,
"clip_ratio/region_mean": 0.010359902604250237,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13321.0,
"completions/max_terminated_length": 13321.0,
"completions/mean_length": 10277.34375,
"completions/mean_terminated_length": 10277.34375,
"completions/min_length": 2255.0,
"completions/min_terminated_length": 2255.0,
"entropy": 0.06130359717644751,
"epoch": 0.00035,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.084246635437012,
"kl": 0.20213585742749274,
"learning_rate": 4.8571428571428576e-05,
"loss": 0.491,
"num_tokens": 6509987.0,
"reward": -0.39044952392578125,
"reward_std": 0.7217355370521545,
"rewards/rollout_reward_func/mean": -0.39044952392578125,
"rewards/rollout_reward_func/std": 0.7610857486724854,
"sampling/importance_sampling_ratio/max": 2.5243656635284424,
"sampling/importance_sampling_ratio/mean": 0.998782217502594,
"sampling/importance_sampling_ratio/min": 0.02197389304637909,
"sampling/sampling_logp_difference/max": 3.8179001808166504,
"sampling/sampling_logp_difference/mean": 0.01720621809363365,
"step": 35,
"step_time": 161.66476932600017
},
{
"clip_ratio/high_max": 0.014183318184223026,
"clip_ratio/high_mean": 0.008639149455120787,
"clip_ratio/low_mean": 0.007928328239358962,
"clip_ratio/low_min": 0.002639831742271781,
"clip_ratio/region_mean": 0.01656747775268741,
"entropy": 0.06414063868578523,
"epoch": 0.00036,
"grad_norm": 4.390661239624023,
"kl": 0.22910623659845442,
"learning_rate": 5e-05,
"loss": 0.4805,
"step": 36,
"step_time": 73.4045656210019
},
{
"clip_ratio/high_max": 0.008312889549415559,
"clip_ratio/high_mean": 0.004156444774707779,
"clip_ratio/low_mean": 0.005125935596879572,
"clip_ratio/low_min": 0.0012918247375637293,
"clip_ratio/region_mean": 0.00928238031337969,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13365.0,
"completions/max_terminated_length": 13365.0,
"completions/mean_length": 10076.125,
"completions/mean_terminated_length": 10076.125,
"completions/min_length": 1117.0,
"completions/min_terminated_length": 1117.0,
"entropy": 0.0663802761118859,
"epoch": 0.00037,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.7034361362457275,
"kl": 0.23728731309529394,
"learning_rate": 4.999300402366083e-05,
"loss": 0.5465,
"num_tokens": 6858892.0,
"reward": -0.17172592878341675,
"reward_std": 0.7274531126022339,
"rewards/rollout_reward_func/mean": -0.17172592878341675,
"rewards/rollout_reward_func/std": 0.7500751614570618,
"sampling/importance_sampling_ratio/max": 2.6516499519348145,
"sampling/importance_sampling_ratio/mean": 1.0006123781204224,
"sampling/importance_sampling_ratio/min": 0.22508670389652252,
"sampling/sampling_logp_difference/max": 1.491269588470459,
"sampling/sampling_logp_difference/mean": 0.01602303236722946,
"step": 37,
"step_time": 164.07891472499978
},
{
"clip_ratio/high_max": 0.01003516762284562,
"clip_ratio/high_mean": 0.005539513367693871,
"clip_ratio/low_mean": 0.014460239675827324,
"clip_ratio/low_min": 0.006212757696630433,
"clip_ratio/region_mean": 0.019999753101728857,
"entropy": 0.06495539681054652,
"epoch": 0.00038,
"grad_norm": 2.917160749435425,
"kl": 0.23570971423760056,
"learning_rate": 4.997202131530303e-05,
"loss": 0.5318,
"step": 38,
"step_time": 74.72924379499818
},
{
"clip_ratio/high_max": 0.006990043446421623,
"clip_ratio/high_mean": 0.0038956627249717712,
"clip_ratio/low_mean": 0.005693367682397366,
"clip_ratio/low_min": 0.00042229730752296746,
"clip_ratio/region_mean": 0.009589030363713391,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12719.0,
"completions/max_terminated_length": 12719.0,
"completions/mean_length": 10167.65625,
"completions/mean_terminated_length": 10167.65625,
"completions/min_length": 2475.0,
"completions/min_terminated_length": 2475.0,
"entropy": 0.06094362598378211,
"epoch": 0.00039,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.3789939880371094,
"kl": 0.19482448839698918,
"learning_rate": 4.993706753300993e-05,
"loss": 0.2761,
"num_tokens": 7211045.0,
"reward": -0.20416001975536346,
"reward_std": 0.7475550174713135,
"rewards/rollout_reward_func/mean": -0.20416001975536346,
"rewards/rollout_reward_func/std": 0.7947777509689331,
"sampling/importance_sampling_ratio/max": 3.0,
"sampling/importance_sampling_ratio/mean": 1.0014694929122925,
"sampling/importance_sampling_ratio/min": 0.28650444746017456,
"sampling/sampling_logp_difference/max": 1.9272756576538086,
"sampling/sampling_logp_difference/mean": 0.013411741703748703,
"step": 39,
"step_time": 159.4788131209989
},
{
"clip_ratio/high_max": 0.012886389653431252,
"clip_ratio/high_mean": 0.007645118064829148,
"clip_ratio/low_mean": 0.009694800595752895,
"clip_ratio/low_min": 0.00044326239731162786,
"clip_ratio/region_mean": 0.017339918646030128,
"entropy": 0.06182372476905584,
"epoch": 0.0004,
"grad_norm": 2.5175423622131348,
"kl": 0.22034673113375902,
"learning_rate": 4.988816876060381e-05,
"loss": 0.2518,
"step": 40,
"step_time": 70.61948872499943
},
{
"clip_ratio/high_max": 0.010759571479866281,
"clip_ratio/high_mean": 0.005379785739933141,
"clip_ratio/low_mean": 0.004713524816907011,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010093310542288236,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12860.0,
"completions/max_terminated_length": 12860.0,
"completions/mean_length": 10191.6875,
"completions/mean_terminated_length": 10191.6875,
"completions/min_length": 1034.0,
"completions/min_terminated_length": 1034.0,
"entropy": 0.0719709824770689,
"epoch": 0.00041,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.783087968826294,
"kl": 0.23890396440401673,
"learning_rate": 4.98253614881812e-05,
"loss": 0.8387,
"num_tokens": 7563650.0,
"reward": 0.09715636074542999,
"reward_std": 0.861005425453186,
"rewards/rollout_reward_func/mean": 0.09715636074542999,
"rewards/rollout_reward_func/std": 0.8409163355827332,
"sampling/importance_sampling_ratio/max": 2.1479194164276123,
"sampling/importance_sampling_ratio/mean": 0.9980385303497314,
"sampling/importance_sampling_ratio/min": 0.25166046619415283,
"sampling/sampling_logp_difference/max": 1.3796745538711548,
"sampling/sampling_logp_difference/mean": 0.01620703563094139,
"step": 41,
"step_time": 156.14588745600213
},
{
"clip_ratio/high_max": 0.015028952562715858,
"clip_ratio/high_mean": 0.007514476281357929,
"clip_ratio/low_mean": 0.013302808598382398,
"clip_ratio/low_min": 0.0027654054574668407,
"clip_ratio/region_mean": 0.020817284763325006,
"entropy": 0.07558500277809799,
"epoch": 0.00042,
"grad_norm": 2.8221476078033447,
"kl": 0.2473655454814434,
"learning_rate": 4.974869258488254e-05,
"loss": 0.8332,
"step": 42,
"step_time": 71.34366872400005
},
{
"clip_ratio/high_max": 0.00897236549644731,
"clip_ratio/high_mean": 0.00470022382796742,
"clip_ratio/low_mean": 0.002213932399172336,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006914156285347417,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13893.0,
"completions/max_terminated_length": 13893.0,
"completions/mean_length": 11105.28125,
"completions/mean_terminated_length": 11105.28125,
"completions/min_length": 514.0,
"completions/min_terminated_length": 514.0,
"entropy": 0.07579021051060408,
"epoch": 0.00043,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.432410717010498,
"kl": 0.3065204853191972,
"learning_rate": 4.965821926391673e-05,
"loss": -0.6028,
"num_tokens": 7945588.0,
"reward": -0.1402868777513504,
"reward_std": 0.6740955114364624,
"rewards/rollout_reward_func/mean": -0.1402868777513504,
"rewards/rollout_reward_func/std": 0.7691654562950134,
"sampling/importance_sampling_ratio/max": 3.0,
"sampling/importance_sampling_ratio/mean": 0.9994540214538574,
"sampling/importance_sampling_ratio/min": 0.2775813639163971,
"sampling/sampling_logp_difference/max": 2.002950668334961,
"sampling/sampling_logp_difference/mean": 0.016589157283306122,
"step": 43,
"step_time": 166.56439664799746
},
{
"clip_ratio/high_max": 0.02129412887734361,
"clip_ratio/high_mean": 0.011415275061153807,
"clip_ratio/low_mean": 0.007396826345939189,
"clip_ratio/low_min": 0.0017404509417247027,
"clip_ratio/region_mean": 0.01881210133433342,
"entropy": 0.07551638572476804,
"epoch": 0.00044,
"grad_norm": 4.626571178436279,
"kl": 0.34492466133087873,
"learning_rate": 4.9554009039866464e-05,
"loss": -0.6141,
"step": 44,
"step_time": 76.74440019599933
},
{
"clip_ratio/high_max": 0.010172237700317055,
"clip_ratio/high_mean": 0.0050861188501585275,
"clip_ratio/low_mean": 0.00778736115898937,
"clip_ratio/low_min": 0.0032280217565130442,
"clip_ratio/region_mean": 0.012873480009147897,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13578.0,
"completions/max_terminated_length": 13578.0,
"completions/mean_length": 10818.84375,
"completions/mean_terminated_length": 10818.84375,
"completions/min_length": 1938.0,
"completions/min_terminated_length": 1938.0,
"entropy": 0.0941541725769639,
"epoch": 0.00045,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.4678053855896,
"kl": 0.3121062908321619,
"learning_rate": 4.9436139678306335e-05,
"loss": -0.2261,
"num_tokens": 8318532.0,
"reward": 0.15859833359718323,
"reward_std": 0.755577027797699,
"rewards/rollout_reward_func/mean": 0.15859833359718323,
"rewards/rollout_reward_func/std": 0.8585173487663269,
"sampling/importance_sampling_ratio/max": 2.691387414932251,
"sampling/importance_sampling_ratio/mean": 0.9984525442123413,
"sampling/importance_sampling_ratio/min": 0.06687777489423752,
"sampling/sampling_logp_difference/max": 2.7048885822296143,
"sampling/sampling_logp_difference/mean": 0.023697983473539352,
"step": 45,
"step_time": 164.00571862300148
},
{
"clip_ratio/high_max": 0.020086677744984627,
"clip_ratio/high_mean": 0.010638576990459114,
"clip_ratio/low_mean": 0.013478812892572023,
"clip_ratio/low_min": 0.005088503879960626,
"clip_ratio/region_mean": 0.0241173897520639,
"entropy": 0.09132221271283925,
"epoch": 0.00046,
"grad_norm": 4.049323081970215,
"kl": 0.3125941874459386,
"learning_rate": 4.930469913777124e-05,
"loss": -0.2404,
"step": 46,
"step_time": 75.26833040499969
},
{
"clip_ratio/high_max": 0.008710403984878212,
"clip_ratio/high_mean": 0.006181505916174501,
"clip_ratio/low_mean": 0.006149827502667904,
"clip_ratio/low_min": 0.0017361111240461469,
"clip_ratio/region_mean": 0.012331333418842405,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13339.0,
"completions/max_terminated_length": 13339.0,
"completions/mean_length": 9838.40625,
"completions/mean_terminated_length": 9838.40625,
"completions/min_length": 1577.0,
"completions/min_terminated_length": 1577.0,
"entropy": 0.0639520538970828,
"epoch": 0.00047,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.6693246364593506,
"kl": 0.28815168514847755,
"learning_rate": 4.91597855041184e-05,
"loss": -0.4524,
"num_tokens": 8660124.0,
"reward": 0.18700893223285675,
"reward_std": 0.8370898365974426,
"rewards/rollout_reward_func/mean": 0.18700893223285675,
"rewards/rollout_reward_func/std": 0.9505638480186462,
"sampling/importance_sampling_ratio/max": 3.0,
"sampling/importance_sampling_ratio/mean": 0.9988111853599548,
"sampling/importance_sampling_ratio/min": 0.20165325701236725,
"sampling/sampling_logp_difference/max": 1.601205587387085,
"sampling/sampling_logp_difference/mean": 0.01677616499364376,
"step": 47,
"step_time": 156.50041884999882
},
{
"clip_ratio/high_max": 0.010966176458168775,
"clip_ratio/high_mean": 0.008428345259744674,
"clip_ratio/low_mean": 0.009785499976715073,
"clip_ratio/low_min": 0.0022047124803066254,
"clip_ratio/region_mean": 0.018213845294667408,
"entropy": 0.06090119876898825,
"epoch": 0.00048,
"grad_norm": 2.817453145980835,
"kl": 0.2784329962451011,
"learning_rate": 4.900150691733207e-05,
"loss": -0.4609,
"step": 48,
"step_time": 71.43669509799929
},
{
"clip_ratio/high_max": 0.012393949960824102,
"clip_ratio/high_mean": 0.007038811716483906,
"clip_ratio/low_mean": 0.004084319676621817,
"clip_ratio/low_min": 0.00041666667675599456,
"clip_ratio/region_mean": 0.011123131407657638,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13544.0,
"completions/max_terminated_length": 13544.0,
"completions/mean_length": 10274.09375,
"completions/mean_terminated_length": 10274.09375,
"completions/min_length": 2238.0,
"completions/min_terminated_length": 2238.0,
"entropy": 0.06463533453643322,
"epoch": 0.00049,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.4794275760650635,
"kl": 0.23157021962106228,
"learning_rate": 4.8829981490825384e-05,
"loss": 0.6766,
"num_tokens": 9015251.0,
"reward": 0.05801337957382202,
"reward_std": 0.8594080209732056,
"rewards/rollout_reward_func/mean": 0.05801337957382202,
"rewards/rollout_reward_func/std": 0.9448277354240417,
"sampling/importance_sampling_ratio/max": 2.314483165740967,
"sampling/importance_sampling_ratio/mean": 0.998992919921875,
"sampling/importance_sampling_ratio/min": 0.08525566011667252,
"sampling/sampling_logp_difference/max": 2.4621007442474365,
"sampling/sampling_logp_difference/mean": 0.01729561574757099,
"step": 49,
"step_time": 160.99621369300075
},
{
"clip_ratio/high_max": 0.017639295605476946,
"clip_ratio/high_mean": 0.009471276862313971,
"clip_ratio/low_mean": 0.008798229115200229,
"clip_ratio/low_min": 0.002916666679084301,
"clip_ratio/region_mean": 0.018269505802891217,
"entropy": 0.06586812436580658,
"epoch": 0.0005,
"grad_norm": 3.246238946914673,
"kl": 0.23268628818914294,
"learning_rate": 4.864533722329971e-05,
"loss": 0.6675,
"step": 50,
"step_time": 74.08785445800095
},
{
"clip_ratio/high_max": 0.008645561116281897,
"clip_ratio/high_mean": 0.004785452736541629,
"clip_ratio/low_mean": 0.003304112848127261,
"clip_ratio/low_min": 0.00048449612222611904,
"clip_ratio/region_mean": 0.008089565526461229,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13357.0,
"completions/max_terminated_length": 13357.0,
"completions/mean_length": 9449.75,
"completions/mean_terminated_length": 9449.75,
"completions/min_length": 1164.0,
"completions/min_terminated_length": 1164.0,
"entropy": 0.06446462532039732,
"epoch": 0.00051,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.304991245269775,
"kl": 0.3380787476198748,
"learning_rate": 4.8447711903227245e-05,
"loss": -0.0874,
"num_tokens": 9343997.0,
"reward": 0.031104259192943573,
"reward_std": 0.8977135419845581,
"rewards/rollout_reward_func/mean": 0.031104259192943573,
"rewards/rollout_reward_func/std": 1.0250816345214844,
"sampling/importance_sampling_ratio/max": 3.0,
"sampling/importance_sampling_ratio/mean": 1.0004096031188965,
"sampling/importance_sampling_ratio/min": 0.18863455951213837,
"sampling/sampling_logp_difference/max": 1.6679437160491943,
"sampling/sampling_logp_difference/mean": 0.014661731198430061,
"step": 51,
"step_time": 151.50292257000183
},
{
"clip_ratio/high_max": 0.012868363002780825,
"clip_ratio/high_mean": 0.007327448722207919,
"clip_ratio/low_mean": 0.006595586746698245,
"clip_ratio/low_min": 0.0034068059467244893,
"clip_ratio/region_mean": 0.013923035527113825,
"entropy": 0.06929566781036556,
"epoch": 0.00052,
"grad_norm": 3.573127031326294,
"kl": 0.2613374365027994,
"learning_rate": 4.8237253006028074e-05,
"loss": -0.0965,
"step": 52,
"step_time": 70.4309434930019
},
{
"clip_ratio/high_max": 0.008529713173629716,
"clip_ratio/high_mean": 0.004628228620276786,
"clip_ratio/low_mean": 0.004744059449876659,
"clip_ratio/low_min": 0.00126980320783332,
"clip_ratio/region_mean": 0.0093722880264977,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13527.0,
"completions/max_terminated_length": 13527.0,
"completions/mean_length": 10749.8125,
"completions/mean_terminated_length": 10749.8125,
"completions/min_length": 1542.0,
"completions/min_terminated_length": 1542.0,
"entropy": 0.0799917277181521,
"epoch": 0.00053,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.5762784481048584,
"kl": 0.2966231992468238,
"learning_rate": 4.801411758401846e-05,
"loss": 0.9423,
"num_tokens": 9714613.0,
"reward": 0.14535844326019287,
"reward_std": 1.0995876789093018,
"rewards/rollout_reward_func/mean": 0.14535844326019287,
"rewards/rollout_reward_func/std": 1.0902811288833618,
"sampling/importance_sampling_ratio/max": 2.0842576026916504,
"sampling/importance_sampling_ratio/mean": 0.9999011754989624,
"sampling/importance_sampling_ratio/min": 0.13890385627746582,
"sampling/sampling_logp_difference/max": 1.973973274230957,
"sampling/sampling_logp_difference/mean": 0.01623544469475746,
"step": 53,
"step_time": 167.5663473889981
},
{
"clip_ratio/high_max": 0.011895997042302042,
"clip_ratio/high_mean": 0.00703811485436745,
"clip_ratio/low_mean": 0.008775666501605883,
"clip_ratio/low_min": 0.0012786609295289963,
"clip_ratio/region_mean": 0.015813781414180994,
"entropy": 0.08241763606201857,
"epoch": 0.00054,
"grad_norm": 2.2500929832458496,
"kl": 0.2891239356249571,
"learning_rate": 4.777847214921259e-05,
"loss": 0.9246,
"step": 54,
"step_time": 75.45897021599922
},
{
"clip_ratio/high_max": 0.009276975266402587,
"clip_ratio/high_mean": 0.004638487633201294,
"clip_ratio/low_mean": 0.005842676997417584,
"clip_ratio/low_min": 0.0008503401186317205,
"clip_ratio/region_mean": 0.010481164674274623,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13624.0,
"completions/max_terminated_length": 13624.0,
"completions/mean_length": 11374.6875,
"completions/mean_terminated_length": 11374.6875,
"completions/min_length": 2137.0,
"completions/min_terminated_length": 2137.0,
"entropy": 0.08610659511759877,
"epoch": 0.00055,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.0837926864624023,
"kl": 0.28733005002141,
"learning_rate": 4.753049254906501e-05,
"loss": 0.21,
"num_tokens": 10104624.0,
"reward": -0.09346111118793488,
"reward_std": 0.821108877658844,
"rewards/rollout_reward_func/mean": -0.09346111118793488,
"rewards/rollout_reward_func/std": 0.9060606956481934,
"sampling/importance_sampling_ratio/max": 3.0,
"sampling/importance_sampling_ratio/mean": 1.0002198219299316,
"sampling/importance_sampling_ratio/min": 0.3293769061565399,
"sampling/sampling_logp_difference/max": 1.3009889125823975,
"sampling/sampling_logp_difference/mean": 0.01777353696525097,
"step": 55,
"step_time": 169.56354937799915
},
{
"clip_ratio/high_max": 0.016487823944771662,
"clip_ratio/high_mean": 0.009136769062024541,
"clip_ratio/low_mean": 0.0069089080061530694,
"clip_ratio/low_min": 0.0030126339406706393,
"clip_ratio/region_mean": 0.01604567709728144,
"entropy": 0.09009328670799732,
"epoch": 0.00056,
"grad_norm": 3.528965473175049,
"kl": 0.28088642843067646,
"learning_rate": 4.727036383524666e-05,
"loss": 0.1973,
"step": 56,
"step_time": 75.04982400900099
},
{
"clip_ratio/high_max": 0.007419974484946579,
"clip_ratio/high_mean": 0.004121171456063166,
"clip_ratio/low_mean": 0.0033842776028905064,
"clip_ratio/low_min": 0.0012699189246632159,
"clip_ratio/region_mean": 0.007505449088057503,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13511.0,
"completions/max_terminated_length": 13511.0,
"completions/mean_length": 10176.125,
"completions/mean_terminated_length": 10176.125,
"completions/min_length": 1202.0,
"completions/min_terminated_length": 1202.0,
"entropy": 0.0732960153836757,
"epoch": 0.00057,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.3770620822906494,
"kl": 0.3082230528816581,
"learning_rate": 4.699828012555243e-05,
"loss": 0.5982,
"num_tokens": 10456269.0,
"reward": -0.12018954753875732,
"reward_std": 0.9542105793952942,
"rewards/rollout_reward_func/mean": -0.12018954753875732,
"rewards/rollout_reward_func/std": 1.0915628671646118,
"sampling/importance_sampling_ratio/max": 2.9350130558013916,
"sampling/importance_sampling_ratio/mean": 0.9999834299087524,
"sampling/importance_sampling_ratio/min": 0.362204909324646,
"sampling/sampling_logp_difference/max": 1.076711893081665,
"sampling/sampling_logp_difference/mean": 0.015124820172786713,
"step": 57,
"step_time": 160.69087591700008
},
{
"clip_ratio/high_max": 0.017830504453741014,
"clip_ratio/high_mean": 0.009982116374885663,
"clip_ratio/low_mean": 0.009146305441390723,
"clip_ratio/low_min": 0.0012932273093611002,
"clip_ratio/region_mean": 0.019128421699861065,
"entropy": 0.0739774244138971,
"epoch": 0.00058,
"grad_norm": 7.0747246742248535,
"kl": 0.3264038683846593,
"learning_rate": 4.671444445904316e-05,
"loss": 0.5779,
"step": 58,
"step_time": 73.05348470599984
},
{
"clip_ratio/high_max": 0.013040319143328816,
"clip_ratio/high_mean": 0.007098863337887451,
"clip_ratio/low_mean": 0.003575551469111815,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010674414719687775,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13197.0,
"completions/max_terminated_length": 13197.0,
"completions/mean_length": 8837.90625,
"completions/mean_terminated_length": 8837.90625,
"completions/min_length": 678.0,
"completions/min_terminated_length": 678.0,
"entropy": 0.09057109360583127,
"epoch": 0.00059,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.666227340698242,
"kl": 0.4448018502444029,
"learning_rate": 4.641906864453027e-05,
"loss": -0.265,
"num_tokens": 10765241.0,
"reward": 0.296211302280426,
"reward_std": 1.1419236660003662,
"rewards/rollout_reward_func/mean": 0.296211302280426,
"rewards/rollout_reward_func/std": 1.1032841205596924,
"sampling/importance_sampling_ratio/max": 2.8205833435058594,
"sampling/importance_sampling_ratio/mean": 0.9991936683654785,
"sampling/importance_sampling_ratio/min": 0.14112254977226257,
"sampling/sampling_logp_difference/max": 1.9581265449523926,
"sampling/sampling_logp_difference/mean": 0.021620940417051315,
"step": 59,
"step_time": 155.945755681003
},
{
"clip_ratio/high_max": 0.02123168739490211,
"clip_ratio/high_mean": 0.011910196451935917,
"clip_ratio/low_mean": 0.015003619948402047,
"clip_ratio/low_min": 0.003669736732263118,
"clip_ratio/region_mean": 0.026913816400337964,
"entropy": 0.09111437713727355,
"epoch": 0.0006,
"grad_norm": 4.09487771987915,
"kl": 0.34631976671516895,
"learning_rate": 4.6112373102516095e-05,
"loss": -0.286,
"step": 60,
"step_time": 71.43645765800102
},
{
"clip_ratio/high_max": 0.0070640986377839,
"clip_ratio/high_mean": 0.00353204931889195,
"clip_ratio/low_mean": 0.004103203638806008,
"clip_ratio/low_min": 0.00041666667675599456,
"clip_ratio/region_mean": 0.007635252914042212,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13493.0,
"completions/max_terminated_length": 13493.0,
"completions/mean_length": 9962.53125,
"completions/mean_terminated_length": 9962.53125,
"completions/min_length": 1611.0,
"completions/min_terminated_length": 1611.0,
"entropy": 0.08869605418294668,
"epoch": 0.00061,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.003176689147949,
"kl": 0.32290424313396215,
"learning_rate": 4.5794586700707875e-05,
"loss": 0.4087,
"num_tokens": 11110298.0,
"reward": 0.19760021567344666,
"reward_std": 1.0259727239608765,
"rewards/rollout_reward_func/mean": 0.19760021567344666,
"rewards/rollout_reward_func/std": 1.0379148721694946,
"sampling/importance_sampling_ratio/max": 2.7306482791900635,
"sampling/importance_sampling_ratio/mean": 1.0000582933425903,
"sampling/importance_sampling_ratio/min": 0.3961387276649475,
"sampling/sampling_logp_difference/max": 1.0045390129089355,
"sampling/sampling_logp_difference/mean": 0.0211578831076622,
"step": 61,
"step_time": 160.88398975100063
},
{
"clip_ratio/high_max": 0.018062620365526527,
"clip_ratio/high_mean": 0.009733557322761044,
"clip_ratio/low_mean": 0.013738173875026405,
"clip_ratio/low_min": 0.0035855557944159955,
"clip_ratio/region_mean": 0.02347173122689128,
"entropy": 0.09201134112663567,
"epoch": 0.00062,
"grad_norm": 3.820011615753174,
"kl": 0.32759621646255255,
"learning_rate": 4.546594658322805e-05,
"loss": 0.3893,
"step": 62,
"step_time": 72.43900915299491
},
{
"clip_ratio/high_max": 0.004652095143683255,
"clip_ratio/high_mean": 0.0023260475718416274,
"clip_ratio/low_mean": 0.005644983262754977,
"clip_ratio/low_min": 0.0017199248541146517,
"clip_ratio/region_mean": 0.007971030776388943,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13066.0,
"completions/max_terminated_length": 13066.0,
"completions/mean_length": 10042.84375,
"completions/mean_terminated_length": 10042.84375,
"completions/min_length": 1710.0,
"completions/min_terminated_length": 1710.0,
"entropy": 0.1067513944581151,
"epoch": 0.00063,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.7682888507843018,
"kl": 0.45440504513680935,
"learning_rate": 4.512669799364848e-05,
"loss": 0.3975,
"num_tokens": 11457778.0,
"reward": 0.3108959197998047,
"reward_std": 0.9355272054672241,
"rewards/rollout_reward_func/mean": 0.3108959197998047,
"rewards/rollout_reward_func/std": 0.9091988205909729,
"sampling/importance_sampling_ratio/max": 3.0,
"sampling/importance_sampling_ratio/mean": 1.0004922151565552,
"sampling/importance_sampling_ratio/min": 0.25997379422187805,
"sampling/sampling_logp_difference/max": 1.9117159843444824,
"sampling/sampling_logp_difference/mean": 0.02266036719083786,
"step": 63,
"step_time": 158.2137243500074
},
{
"clip_ratio/high_max": 0.011193088546860963,
"clip_ratio/high_mean": 0.0058913555985782295,
"clip_ratio/low_mean": 0.013004933163756505,
"clip_ratio/low_min": 0.005248830129858106,
"clip_ratio/region_mean": 0.018896288704127073,
"entropy": 0.10795356682501733,
"epoch": 0.00064,
"grad_norm": 3.645940065383911,
"kl": 0.470172350294888,
"learning_rate": 4.477709409198042e-05,
"loss": 0.3908,
"step": 64,
"step_time": 71.35263364000275
},
{
"clip_ratio/high_max": 0.008740827877772972,
"clip_ratio/high_mean": 0.004891247299383394,
"clip_ratio/low_mean": 0.004758857699926011,
"clip_ratio/low_min": 0.0008361297659575939,
"clip_ratio/region_mean": 0.009650105028413236,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13721.0,
"completions/max_terminated_length": 13721.0,
"completions/mean_length": 10885.40625,
"completions/mean_terminated_length": 10885.40625,
"completions/min_length": 2306.0,
"completions/min_terminated_length": 2306.0,
"entropy": 0.1310987388715148,
"epoch": 0.00065,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.6566226482391357,
"kl": 0.5131763480603695,
"learning_rate": 4.441739576575714e-05,
"loss": 0.7262,
"num_tokens": 11832210.0,
"reward": 0.37453246116638184,
"reward_std": 1.1100566387176514,
"rewards/rollout_reward_func/mean": 0.37453246116638184,
"rewards/rollout_reward_func/std": 1.122207760810852,
"sampling/importance_sampling_ratio/max": 3.0,
"sampling/importance_sampling_ratio/mean": 0.9988963603973389,
"sampling/importance_sampling_ratio/min": 0.25997379422187805,
"sampling/sampling_logp_difference/max": 1.7353017330169678,
"sampling/sampling_logp_difference/mean": 0.02562958560883999,
"step": 65,
"step_time": 163.06677165699693
},
{
"clip_ratio/high_max": 0.021744761732406914,
"clip_ratio/high_mean": 0.012366945913527161,
"clip_ratio/low_mean": 0.009314247145084664,
"clip_ratio/low_min": 0.002372722141444683,
"clip_ratio/region_mean": 0.021681193175027147,
"entropy": 0.13085555145516992,
"epoch": 0.00066,
"grad_norm": 3.274498224258423,
"kl": 0.48789327405393124,
"learning_rate": 4.404787143534977e-05,
"loss": 0.7077,
"step": 66,
"step_time": 75.6290548800007
},
{
"clip_ratio/high_max": 0.005601790326181799,
"clip_ratio/high_mean": 0.0028008951630908996,
"clip_ratio/low_mean": 0.003591711341869086,
"clip_ratio/low_min": 0.0016466806118842214,
"clip_ratio/region_mean": 0.006392606504959986,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13492.0,
"completions/max_terminated_length": 13492.0,
"completions/mean_length": 10049.25,
"completions/mean_terminated_length": 10049.25,
"completions/min_length": 684.0,
"completions/min_terminated_length": 684.0,
"entropy": 0.12767728650942445,
"epoch": 0.00067,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.9584059715270996,
"kl": 0.43974771071225405,
"learning_rate": 4.366879685366202e-05,
"loss": 0.3535,
"num_tokens": 12180022.0,
"reward": 0.1577407419681549,
"reward_std": 1.1268913745880127,
"rewards/rollout_reward_func/mean": 0.1577407419681549,
"rewards/rollout_reward_func/std": 1.1074063777923584,
"sampling/importance_sampling_ratio/max": 3.0,
"sampling/importance_sampling_ratio/mean": 1.001326560974121,
"sampling/importance_sampling_ratio/min": 0.42815104126930237,
"sampling/sampling_logp_difference/max": 1.152717113494873,
"sampling/sampling_logp_difference/mean": 0.021073060110211372,
"step": 67,
"step_time": 160.9693315609984
},
{
"clip_ratio/high_max": 0.017967351304832846,
"clip_ratio/high_mean": 0.008983675652416423,
"clip_ratio/low_mean": 0.017658226686762646,
"clip_ratio/low_min": 0.004194744222331792,
"clip_ratio/region_mean": 0.026641902572009712,
"entropy": 0.12733671674504876,
"epoch": 0.00068,
"grad_norm": 3.018509864807129,
"kl": 0.4440254373475909,
"learning_rate": 4.3280454900353015e-05,
"loss": 0.3265,
"step": 68,
"step_time": 75.01162057099282
},
{
"clip_ratio/high_max": 0.010869733727304265,
"clip_ratio/high_mean": 0.005718957792851143,
"clip_ratio/low_mean": 0.0034849131188821048,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009203870882629417,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13515.0,
"completions/max_terminated_length": 13515.0,
"completions/mean_length": 8650.84375,
"completions/mean_terminated_length": 8650.84375,
"completions/min_length": 387.0,
"completions/min_terminated_length": 387.0,
"entropy": 0.11021804087795317,
"epoch": 0.00069,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.1527934074401855,
"kl": 0.37845719885081053,
"learning_rate": 4.288313537074191e-05,
"loss": 0.5884,
"num_tokens": 12483099.0,
"reward": 0.2558709979057312,
"reward_std": 0.9413133859634399,
"rewards/rollout_reward_func/mean": 0.2558709979057312,
"rewards/rollout_reward_func/std": 1.0643043518066406,
"sampling/importance_sampling_ratio/max": 3.0,
"sampling/importance_sampling_ratio/mean": 0.998905599117279,
"sampling/importance_sampling_ratio/min": 0.3961387276649475,
"sampling/sampling_logp_difference/max": 1.179542064666748,
"sampling/sampling_logp_difference/mean": 0.021235648542642593,
"step": 69,
"step_time": 162.1463760960014
},
{
"clip_ratio/high_max": 0.02583102328935638,
"clip_ratio/high_mean": 0.015480009169550613,
"clip_ratio/low_mean": 0.013606639229692519,
"clip_ratio/low_min": 0.002621016465127468,
"clip_ratio/region_mean": 0.029086648602969944,
"entropy": 0.10582013777457178,
"epoch": 0.0007,
"grad_norm": 3.362034320831299,
"kl": 0.3859092304483056,
"learning_rate": 4.2477134759551676e-05,
"loss": 0.5836,
"step": 70,
"step_time": 73.605654605999
},
{
"clip_ratio/high_max": 0.003980476642027497,
"clip_ratio/high_mean": 0.0021999698801664636,
"clip_ratio/low_mean": 0.0036015229270560667,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00580149280722253,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13375.0,
"completions/max_terminated_length": 13375.0,
"completions/mean_length": 9448.03125,
"completions/mean_terminated_length": 9448.03125,
"completions/min_length": 1765.0,
"completions/min_terminated_length": 1765.0,
"entropy": 0.08842878974974155,
"epoch": 0.00071,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.0361220836639404,
"kl": 0.3907409608364105,
"learning_rate": 4.206275603965376e-05,
"loss": 0.4317,
"num_tokens": 12811546.0,
"reward": 0.10175018012523651,
"reward_std": 1.2597105503082275,
"rewards/rollout_reward_func/mean": 0.10175018012523651,
"rewards/rollout_reward_func/std": 1.224672555923462,
"sampling/importance_sampling_ratio/max": 2.153721809387207,
"sampling/importance_sampling_ratio/mean": 0.9977552890777588,
"sampling/importance_sampling_ratio/min": 0.25212857127189636,
"sampling/sampling_logp_difference/max": 1.377816081047058,
"sampling/sampling_logp_difference/mean": 0.017261814326047897,
"step": 71,
"step_time": 158.47965298599775
},
{
"clip_ratio/high_max": 0.012883998395409435,
"clip_ratio/high_mean": 0.0064419991977047175,
"clip_ratio/low_mean": 0.004444706108188257,
"clip_ratio/low_min": 0.0017248203221242875,
"clip_ratio/region_mean": 0.010886705276789144,
"entropy": 0.09079739707522094,
"epoch": 0.00072,
"grad_norm": 2.404242753982544,
"kl": 0.3891353765502572,
"learning_rate": 4.1640308435978284e-05,
"loss": 0.4232,
"step": 72,
"step_time": 73.1000140619999
},
{
"clip_ratio/high_max": 0.00914364744676277,
"clip_ratio/high_mean": 0.005490152951097116,
"clip_ratio/low_mean": 0.004159306932706386,
"clip_ratio/low_min": 0.0016835585120134056,
"clip_ratio/region_mean": 0.009649459869251586,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13773.0,
"completions/max_terminated_length": 13773.0,
"completions/mean_length": 9316.125,
"completions/mean_terminated_length": 9316.125,
"completions/min_length": 1200.0,
"completions/min_terminated_length": 1200.0,
"entropy": 0.09313291660510004,
"epoch": 0.00073,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.408350944519043,
"kl": 0.34883329924196005,
"learning_rate": 4.121010719475882e-05,
"loss": 0.0489,
"num_tokens": 13135643.0,
"reward": 0.2852872610092163,
"reward_std": 1.1488020420074463,
"rewards/rollout_reward_func/mean": 0.2852872610092163,
"rewards/rollout_reward_func/std": 1.1279585361480713,
"sampling/importance_sampling_ratio/max": 3.0,
"sampling/importance_sampling_ratio/mean": 1.0009342432022095,
"sampling/importance_sampling_ratio/min": 0.2820616066455841,
"sampling/sampling_logp_difference/max": 1.31003737449646,
"sampling/sampling_logp_difference/mean": 0.01842338591814041,
"step": 73,
"step_time": 154.65388823600188
},
{
"clip_ratio/high_max": 0.013598717050626874,
"clip_ratio/high_mean": 0.0074385893531143665,
"clip_ratio/low_mean": 0.011851943374495022,
"clip_ratio/low_min": 0.005019015457946807,
"clip_ratio/region_mean": 0.019290532698505558,
"entropy": 0.09284176700748503,
"epoch": 0.00074,
"grad_norm": 2.88838267326355,
"kl": 0.3572141509503126,
"learning_rate": 4.077247334828387e-05,
"loss": 0.0384,
"step": 74,
"step_time": 73.51275038800486
},
{
"clip_ratio/high_max": 0.008469170046737418,
"clip_ratio/high_mean": 0.0051088373438687995,
"clip_ratio/low_mean": 0.004827196316909976,
"clip_ratio/low_min": 0.00041946308920159936,
"clip_ratio/region_mean": 0.00993603361712303,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13591.0,
"completions/max_terminated_length": 13591.0,
"completions/mean_length": 10868.78125,
"completions/mean_terminated_length": 10868.78125,
"completions/min_length": 2652.0,
"completions/min_terminated_length": 2652.0,
"entropy": 0.11976136197336018,
"epoch": 0.00075,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.328458070755005,
"kl": 0.47066329792141914,
"learning_rate": 4.032773347533051e-05,
"loss": -0.0744,
"num_tokens": 13509593.0,
"reward": 0.33224326372146606,
"reward_std": 1.1032413244247437,
"rewards/rollout_reward_func/mean": 0.33224326372146606,
"rewards/rollout_reward_func/std": 1.0542694330215454,
"sampling/importance_sampling_ratio/max": 2.090331792831421,
"sampling/importance_sampling_ratio/mean": 0.9978272318840027,
"sampling/importance_sampling_ratio/min": 0.3244995176792145,
"sampling/sampling_logp_difference/max": 1.1254712343215942,
"sampling/sampling_logp_difference/mean": 0.01953265815973282,
"step": 75,
"step_time": 162.99350261499603
},
{
"clip_ratio/high_max": 0.013260606676340103,
"clip_ratio/high_mean": 0.007932637934572995,
"clip_ratio/low_mean": 0.010386872600065544,
"clip_ratio/low_min": 0.003419152199057862,
"clip_ratio/region_mean": 0.01831951050553471,
"entropy": 0.12087863381020725,
"epoch": 0.00076,
"grad_norm": 2.9509475231170654,
"kl": 0.47699476033449173,
"learning_rate": 3.9876219457459105e-05,
"loss": -0.0953,
"step": 76,
"step_time": 74.80923694300145
},
{
"clip_ratio/high_max": 0.0033680717169772834,
"clip_ratio/high_mean": 0.0019300988496979699,
"clip_ratio/low_mean": 0.004023112080176361,
"clip_ratio/low_min": 0.001683558599324897,
"clip_ratio/region_mean": 0.005953210958978161,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13043.0,
"completions/max_terminated_length": 13043.0,
"completions/mean_length": 10891.125,
"completions/mean_terminated_length": 10891.125,
"completions/min_length": 2403.0,
"completions/min_terminated_length": 2403.0,
"entropy": 0.11249894462525845,
"epoch": 0.00077,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.528323173522949,
"kl": 0.45352290105074644,
"learning_rate": 3.9418268231350794e-05,
"loss": 0.4185,
"num_tokens": 13884272.0,
"reward": 0.25572478771209717,
"reward_std": 1.1789345741271973,
"rewards/rollout_reward_func/mean": 0.25572478771209717,
"rewards/rollout_reward_func/std": 1.1531264781951904,
"sampling/importance_sampling_ratio/max": 3.0,
"sampling/importance_sampling_ratio/mean": 0.9990130662918091,
"sampling/importance_sampling_ratio/min": 0.20195981860160828,
"sampling/sampling_logp_difference/max": 1.5996865034103394,
"sampling/sampling_logp_difference/mean": 0.018811069428920746,
"step": 77,
"step_time": 164.1396708480006
},
{
"clip_ratio/high_max": 0.009418375324457884,
"clip_ratio/high_mean": 0.006461102922912687,
"clip_ratio/low_mean": 0.011049802458728664,
"clip_ratio/low_min": 0.004669325251597911,
"clip_ratio/region_mean": 0.01751090532343369,
"entropy": 0.11076454306021333,
"epoch": 0.00078,
"grad_norm": 2.8760058879852295,
"kl": 0.4641501298174262,
"learning_rate": 3.8954221537372784e-05,
"loss": 0.4054,
"step": 78,
"step_time": 72.70563589599624
},
{
"clip_ratio/high_max": 0.005599236872512847,
"clip_ratio/high_mean": 0.0027996184362564236,
"clip_ratio/low_mean": 0.004529917219770141,
"clip_ratio/low_min": 0.00042517005931586027,
"clip_ratio/region_mean": 0.007329535641474649,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13665.0,
"completions/max_terminated_length": 13665.0,
"completions/mean_length": 10477.25,
"completions/mean_terminated_length": 10477.25,
"completions/min_length": 3171.0,
"completions/min_terminated_length": 3171.0,
"entropy": 0.10928369383327663,
"epoch": 0.00079,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.4090378284454346,
"kl": 0.504779901355505,
"learning_rate": 3.848442566455879e-05,
"loss": 0.2142,
"num_tokens": 14245453.0,
"reward": 0.39725062251091003,
"reward_std": 1.1237173080444336,
"rewards/rollout_reward_func/mean": 0.39725062251091003,
"rewards/rollout_reward_func/std": 1.0794570446014404,
"sampling/importance_sampling_ratio/max": 2.5544402599334717,
"sampling/importance_sampling_ratio/mean": 1.0012173652648926,
"sampling/importance_sampling_ratio/min": 0.12977543473243713,
"sampling/sampling_logp_difference/max": 2.04194974899292,
"sampling/sampling_logp_difference/mean": 0.01983896642923355,
"step": 79,
"step_time": 164.4674953970025
},
{
"clip_ratio/high_max": 0.00996807124465704,
"clip_ratio/high_mean": 0.005246640677796677,
"clip_ratio/low_mean": 0.01202807053050492,
"clip_ratio/low_min": 0.006143381004221737,
"clip_ratio/region_mean": 0.017274711281061172,
"entropy": 0.10772595112212002,
"epoch": 0.0008,
"grad_norm": 3.7778007984161377,
"kl": 0.5062860492616892,
"learning_rate": 3.800923119219528e-05,
"loss": 0.1939,
"step": 80,
"step_time": 74.33145354599947
},
{
"clip_ratio/high_max": 0.009137832559645176,
"clip_ratio/high_mean": 0.00524506566580385,
"clip_ratio/low_mean": 0.007272154442034662,
"clip_ratio/low_min": 0.002546385396271944,
"clip_ratio/region_mean": 0.012517220136942342,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13498.0,
"completions/max_terminated_length": 13498.0,
"completions/mean_length": 9313.21875,
"completions/mean_terminated_length": 9313.21875,
"completions/min_length": 1181.0,
"completions/min_terminated_length": 1181.0,
"entropy": 0.08699527697172016,
"epoch": 0.00081,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.986673593521118,
"kl": 0.5458228345960379,
"learning_rate": 3.752899272820599e-05,
"loss": -0.1808,
"num_tokens": 14569692.0,
"reward": 0.2744262218475342,
"reward_std": 1.180228352546692,
"rewards/rollout_reward_func/mean": 0.2744262218475342,
"rewards/rollout_reward_func/std": 1.1685067415237427,
"sampling/importance_sampling_ratio/max": 2.5544402599334717,
"sampling/importance_sampling_ratio/mean": 1.0001392364501953,
"sampling/importance_sampling_ratio/min": 0.21354326605796814,
"sampling/sampling_logp_difference/max": 1.543915867805481,
"sampling/sampling_logp_difference/mean": 0.017606310546398163,
"step": 81,
"step_time": 156.98116288799793
},
{
"clip_ratio/high_max": 0.0205975886201486,
"clip_ratio/high_mean": 0.011662664590403438,
"clip_ratio/low_mean": 0.006897521496284753,
"clip_ratio/low_min": 0.0013111887965351343,
"clip_ratio/region_mean": 0.01856018614489585,
"entropy": 0.08844376017805189,
"epoch": 0.00082,
"grad_norm": 3.179837465286255,
"kl": 0.49759334325790405,
"learning_rate": 3.7044068644530266e-05,
"loss": -0.1825,
"step": 82,
"step_time": 72.62646461700206
},
{
"clip_ratio/high_max": 0.00906756124459207,
"clip_ratio/high_mean": 0.00509181636152789,
"clip_ratio/low_mean": 0.0038351798575604334,
"clip_ratio/low_min": 0.0012641340435948223,
"clip_ratio/region_mean": 0.008926996189984493,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13279.0,
"completions/max_terminated_length": 13279.0,
"completions/mean_length": 9933.28125,
"completions/mean_terminated_length": 9933.28125,
"completions/min_length": 661.0,
"completions/min_terminated_length": 661.0,
"entropy": 0.09674874134361744,
"epoch": 0.00083,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.155754566192627,
"kl": 0.5548635525628924,
"learning_rate": 3.6554820809692434e-05,
"loss": 0.5622,
"num_tokens": 14913694.0,
"reward": 0.4627847671508789,
"reward_std": 1.0783663988113403,
"rewards/rollout_reward_func/mean": 0.4627847671508789,
"rewards/rollout_reward_func/std": 1.0711275339126587,
"sampling/importance_sampling_ratio/max": 2.7722387313842773,
"sampling/importance_sampling_ratio/mean": 1.0004757642745972,
"sampling/importance_sampling_ratio/min": 0.5378848314285278,
"sampling/sampling_logp_difference/max": 1.0196552276611328,
"sampling/sampling_logp_difference/mean": 0.017515596002340317,
"step": 83,
"step_time": 157.01505748699674
},
{
"clip_ratio/high_max": 0.014731630391906947,
"clip_ratio/high_mean": 0.0073658151959534734,
"clip_ratio/low_mean": 0.0047907959669828415,
"clip_ratio/low_min": 0.0017347846878692508,
"clip_ratio/region_mean": 0.012156611250247806,
"entropy": 0.09983515413478017,
"epoch": 0.00084,
"grad_norm": 2.53806471824646,
"kl": 0.5409666234627366,
"learning_rate": 3.606161431876201e-05,
"loss": 0.5411,
"step": 84,
"step_time": 71.1588070430007
}
],
"logging_steps": 1.0,
"max_steps": 150,
"num_input_tokens_seen": 14913694,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}