bir-real-gin / trainer_state.json
Gege24's picture
Upload task output 1
98740f8 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.00075,
"eval_steps": 500,
"global_step": 75,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1843.0,
"completions/max_terminated_length": 1843.0,
"completions/mean_length": 1586.6875,
"completions/mean_terminated_length": 1586.6875,
"completions/min_length": 274.0,
"completions/min_terminated_length": 274.0,
"entropy": 2.3952305614948273,
"epoch": 1e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.051290396600961685,
"kl": 0.0,
"learning_rate": 0.0,
"loss": -0.0056,
"num_tokens": 71951.0,
"reward": -8.338340759277344,
"reward_std": 11.816058158874512,
"rewards/rollout_reward_func/mean": -8.338340759277344,
"rewards/rollout_reward_func/std": 12.670792579650879,
"sampling/importance_sampling_ratio/max": 0.24942533671855927,
"sampling/importance_sampling_ratio/mean": 0.020489878952503204,
"sampling/importance_sampling_ratio/min": 3.589864120350601e-15,
"sampling/sampling_logp_difference/max": 13.334996223449707,
"sampling/sampling_logp_difference/mean": 0.37166523933410645,
"step": 1,
"step_time": 40.07221162000002
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 2.3952305614948273,
"epoch": 2e-05,
"grad_norm": 0.051755864173173904,
"kl": 0.0,
"learning_rate": 2.8571428571428575e-07,
"loss": -0.0056,
"step": 2,
"step_time": 6.337008879999928
},
{
"clip_ratio/high_max": 0.005321558099240065,
"clip_ratio/high_mean": 0.003962862421758473,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003962862421758473,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1870.0,
"completions/max_terminated_length": 1870.0,
"completions/mean_length": 1689.21875,
"completions/mean_terminated_length": 1689.21875,
"completions/min_length": 1392.0,
"completions/min_terminated_length": 1392.0,
"entropy": 2.241749197244644,
"epoch": 3e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.032931093126535416,
"kl": 0.0010736112235463224,
"learning_rate": 5.714285714285715e-07,
"loss": 0.0001,
"num_tokens": 147616.0,
"reward": -8.11627197265625,
"reward_std": 9.453106880187988,
"rewards/rollout_reward_func/mean": -8.11627197265625,
"rewards/rollout_reward_func/std": 10.086986541748047,
"sampling/importance_sampling_ratio/max": 0.038908205926418304,
"sampling/importance_sampling_ratio/mean": 0.013574006035923958,
"sampling/importance_sampling_ratio/min": 9.307732536101287e-13,
"sampling/sampling_logp_difference/max": 8.446062088012695,
"sampling/sampling_logp_difference/mean": 0.23850713670253754,
"step": 3,
"step_time": 42.26439957300016
},
{
"clip_ratio/high_max": 0.010532152839004993,
"clip_ratio/high_mean": 0.005266076419502497,
"clip_ratio/low_mean": 0.0013297871919348836,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00659586361143738,
"entropy": 2.2402877509593964,
"epoch": 4e-05,
"grad_norm": 0.030311495065689087,
"kl": 0.0013166169228497893,
"learning_rate": 8.571428571428572e-07,
"loss": 0.0001,
"step": 4,
"step_time": 6.949824775000025
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0014204545877873898,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0014204545877873898,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1775.0,
"completions/max_terminated_length": 1775.0,
"completions/mean_length": 1671.5,
"completions/mean_terminated_length": 1671.5,
"completions/min_length": 758.0,
"completions/min_terminated_length": 758.0,
"entropy": 2.280731201171875,
"epoch": 5e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.035965703427791595,
"kl": 0.0009596906966180541,
"learning_rate": 1.142857142857143e-06,
"loss": -0.0034,
"num_tokens": 222393.0,
"reward": -2.6346850395202637,
"reward_std": 16.439598083496094,
"rewards/rollout_reward_func/mean": -2.6346850395202637,
"rewards/rollout_reward_func/std": 17.489192962646484,
"sampling/importance_sampling_ratio/max": 0.07573997974395752,
"sampling/importance_sampling_ratio/mean": 0.014259650371968746,
"sampling/importance_sampling_ratio/min": 0.00022367587371263653,
"sampling/sampling_logp_difference/max": 1.2608689069747925,
"sampling/sampling_logp_difference/mean": 0.21580657362937927,
"step": 5,
"step_time": 42.164671801000054
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0012499999720603228,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0012499999720603228,
"entropy": 2.279596298933029,
"epoch": 6e-05,
"grad_norm": 0.038707196712493896,
"kl": 0.0009941701391653623,
"learning_rate": 1.4285714285714286e-06,
"loss": -0.0034,
"step": 6,
"step_time": 6.197612690000028
},
{
"clip_ratio/high_max": 0.008154743583872914,
"clip_ratio/high_mean": 0.004077371791936457,
"clip_ratio/low_mean": 0.0013888889225199819,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005466260714456439,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1863.0,
"completions/max_terminated_length": 1863.0,
"completions/mean_length": 1571.40625,
"completions/mean_terminated_length": 1571.40625,
"completions/min_length": 311.0,
"completions/min_terminated_length": 311.0,
"entropy": 2.281369060277939,
"epoch": 7e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.043208107352256775,
"kl": 0.0007937230257084593,
"learning_rate": 1.7142857142857145e-06,
"loss": 0.0023,
"num_tokens": 294768.0,
"reward": -12.273736953735352,
"reward_std": 11.2011137008667,
"rewards/rollout_reward_func/mean": -12.273736953735352,
"rewards/rollout_reward_func/std": 10.951950073242188,
"sampling/importance_sampling_ratio/max": 0.2913915514945984,
"sampling/importance_sampling_ratio/mean": 0.029564352706074715,
"sampling/importance_sampling_ratio/min": 1.11443969016186e-13,
"sampling/sampling_logp_difference/max": 11.392870903015137,
"sampling/sampling_logp_difference/mean": 0.24621161818504333,
"step": 7,
"step_time": 41.23813219300007
},
{
"clip_ratio/high_max": 0.006393861956894398,
"clip_ratio/high_mean": 0.003196930978447199,
"clip_ratio/low_mean": 0.0026959646493196487,
"clip_ratio/low_min": 0.0025510203558951616,
"clip_ratio/region_mean": 0.005892895627766848,
"entropy": 2.281323105096817,
"epoch": 8e-05,
"grad_norm": 0.037575479596853256,
"kl": 0.0011317383105051704,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0023,
"step": 8,
"step_time": 6.953839896999966
},
{
"clip_ratio/high_max": 0.009784075664356351,
"clip_ratio/high_mean": 0.006221824907697737,
"clip_ratio/low_mean": 0.0013297871919348836,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007551612099632621,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1839.0,
"completions/max_terminated_length": 1839.0,
"completions/mean_length": 1631.96875,
"completions/mean_terminated_length": 1631.96875,
"completions/min_length": 758.0,
"completions/min_terminated_length": 758.0,
"entropy": 2.2518777698278427,
"epoch": 9e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.024869710206985474,
"kl": 0.0008677325677126646,
"learning_rate": 2.285714285714286e-06,
"loss": 0.002,
"num_tokens": 368485.0,
"reward": -5.027488708496094,
"reward_std": 9.159595489501953,
"rewards/rollout_reward_func/mean": -5.027488708496094,
"rewards/rollout_reward_func/std": 10.158669471740723,
"sampling/importance_sampling_ratio/max": 0.0989551916718483,
"sampling/importance_sampling_ratio/mean": 0.01718847081065178,
"sampling/importance_sampling_ratio/min": 3.3642640756559317e-11,
"sampling/sampling_logp_difference/max": 9.060235977172852,
"sampling/sampling_logp_difference/mean": 0.25719159841537476,
"step": 9,
"step_time": 45.75393526599987
},
{
"clip_ratio/high_max": 0.004852556856349111,
"clip_ratio/high_mean": 0.0024262784281745553,
"clip_ratio/low_mean": 0.0013297871919348836,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003756065620109439,
"entropy": 2.247532531619072,
"epoch": 0.0001,
"grad_norm": 0.025674326345324516,
"kl": 0.0010098924503836315,
"learning_rate": 2.571428571428571e-06,
"loss": 0.002,
"step": 10,
"step_time": 6.291163318000031
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0013020833721384406,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0013020833721384406,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1848.0,
"completions/max_terminated_length": 1848.0,
"completions/mean_length": 1693.3125,
"completions/mean_terminated_length": 1702.54833984375,
"completions/min_length": 1389.0,
"completions/min_terminated_length": 1389.0,
"entropy": 2.2023140490055084,
"epoch": 0.00011,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.025482358410954475,
"kl": 0.0009734490522532724,
"learning_rate": 2.8571428571428573e-06,
"loss": -0.0018,
"num_tokens": 444310.0,
"reward": -6.636538028717041,
"reward_std": 13.808101654052734,
"rewards/rollout_reward_func/mean": -6.636538028717041,
"rewards/rollout_reward_func/std": 13.634783744812012,
"sampling/importance_sampling_ratio/max": 0.04252217337489128,
"sampling/importance_sampling_ratio/mean": 0.015543580055236816,
"sampling/importance_sampling_ratio/min": 8.082223128483037e-25,
"sampling/sampling_logp_difference/max": 17.322786331176758,
"sampling/sampling_logp_difference/mean": 0.3324906826019287,
"step": 11,
"step_time": 49.50245009800017
},
{
"clip_ratio/high_max": 0.005110554862767458,
"clip_ratio/high_mean": 0.002555277431383729,
"clip_ratio/low_mean": 0.0014204545877873898,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003975732019171119,
"entropy": 2.2029761970043182,
"epoch": 0.00012,
"grad_norm": 0.030912674963474274,
"kl": 0.0009688141508377157,
"learning_rate": 3.142857142857143e-06,
"loss": -0.0018,
"step": 12,
"step_time": 7.69119761200011
},
{
"clip_ratio/high_max": 0.013187056640163064,
"clip_ratio/high_mean": 0.006593528320081532,
"clip_ratio/low_mean": 0.0013586956774815917,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007952223997563124,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1780.0,
"completions/max_terminated_length": 1780.0,
"completions/mean_length": 1663.0,
"completions/mean_terminated_length": 1663.0,
"completions/min_length": 1240.0,
"completions/min_terminated_length": 1240.0,
"entropy": 2.258611023426056,
"epoch": 0.00013,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.026398880407214165,
"kl": 0.0010122761159436777,
"learning_rate": 3.428571428571429e-06,
"loss": 0.0013,
"num_tokens": 519593.0,
"reward": -8.102012634277344,
"reward_std": 10.601648330688477,
"rewards/rollout_reward_func/mean": -8.102012634277344,
"rewards/rollout_reward_func/std": 11.444524765014648,
"sampling/importance_sampling_ratio/max": 0.0403943695127964,
"sampling/importance_sampling_ratio/mean": 0.01241688709706068,
"sampling/importance_sampling_ratio/min": 6.642031217564404e-14,
"sampling/sampling_logp_difference/max": 11.826959609985352,
"sampling/sampling_logp_difference/mean": 0.2676909267902374,
"step": 13,
"step_time": 51.398302784000066
},
{
"clip_ratio/high_max": 0.005434782709926367,
"clip_ratio/high_mean": 0.0027173913549631834,
"clip_ratio/low_mean": 0.002418017713353038,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005135408951900899,
"entropy": 2.251807004213333,
"epoch": 0.00014,
"grad_norm": 0.027899302542209625,
"kl": 0.001143605462857522,
"learning_rate": 3.7142857142857146e-06,
"loss": 0.0013,
"step": 14,
"step_time": 6.214198535999913
},
{
"clip_ratio/high_max": 0.010549242608249187,
"clip_ratio/high_mean": 0.0066044083796441555,
"clip_ratio/low_mean": 0.002659574383869767,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009263982763513923,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1846.0,
"completions/max_terminated_length": 1846.0,
"completions/mean_length": 1673.4375,
"completions/mean_terminated_length": 1673.4375,
"completions/min_length": 1376.0,
"completions/min_terminated_length": 1376.0,
"entropy": 2.269966244697571,
"epoch": 0.00015,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.028693798929452896,
"kl": 0.0012732810355373658,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0002,
"num_tokens": 594885.0,
"reward": -9.916947364807129,
"reward_std": 10.726478576660156,
"rewards/rollout_reward_func/mean": -9.916947364807129,
"rewards/rollout_reward_func/std": 11.980547904968262,
"sampling/importance_sampling_ratio/max": 0.02854122966527939,
"sampling/importance_sampling_ratio/mean": 0.01421053521335125,
"sampling/importance_sampling_ratio/min": 2.1058147088061363e-13,
"sampling/sampling_logp_difference/max": 9.987288475036621,
"sampling/sampling_logp_difference/mean": 0.2753928303718567,
"step": 15,
"step_time": 52.00545187799992
},
{
"clip_ratio/high_max": 0.007995169144123793,
"clip_ratio/high_mean": 0.003997584572061896,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003997584572061896,
"entropy": 2.2705667912960052,
"epoch": 0.00016,
"grad_norm": 0.021315351128578186,
"kl": 0.0012536912836367264,
"learning_rate": 4.2857142857142855e-06,
"loss": -0.0002,
"step": 16,
"step_time": 7.3722107160000405
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.00638433254789561,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009856554795987904,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2086.0,
"completions/max_terminated_length": 2086.0,
"completions/mean_length": 1863.21875,
"completions/mean_terminated_length": 1863.21875,
"completions/min_length": 740.0,
"completions/min_terminated_length": 740.0,
"entropy": 2.299890086054802,
"epoch": 0.00017,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01769721694290638,
"kl": 0.0012724917105515487,
"learning_rate": 4.571428571428572e-06,
"loss": 0.0017,
"num_tokens": 676591.0,
"reward": -8.684309959411621,
"reward_std": 9.59238338470459,
"rewards/rollout_reward_func/mean": -8.684309959411621,
"rewards/rollout_reward_func/std": 10.695920944213867,
"sampling/importance_sampling_ratio/max": 0.041902750730514526,
"sampling/importance_sampling_ratio/mean": 0.009395781904459,
"sampling/importance_sampling_ratio/min": 3.6452324480957535e-20,
"sampling/sampling_logp_difference/max": 12.486509323120117,
"sampling/sampling_logp_difference/mean": 0.3327711820602417,
"step": 17,
"step_time": 58.969859735
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.004615322104655206,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0080875443527475,
"entropy": 2.297407776117325,
"epoch": 0.00018,
"grad_norm": 0.01824762113392353,
"kl": 0.001823120612243656,
"learning_rate": 4.857142857142858e-06,
"loss": 0.0017,
"step": 18,
"step_time": 6.952743392999764
},
{
"clip_ratio/high_max": 0.009362624026834965,
"clip_ratio/high_mean": 0.005838719545863569,
"clip_ratio/low_mean": 0.00231799460016191,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008156714029610157,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2051.0,
"completions/max_terminated_length": 2051.0,
"completions/mean_length": 1801.53125,
"completions/mean_terminated_length": 1801.53125,
"completions/min_length": 527.0,
"completions/min_terminated_length": 527.0,
"entropy": 2.322386711835861,
"epoch": 0.00019,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.02586973085999489,
"kl": 0.0017871049421955831,
"learning_rate": 5.142857142857142e-06,
"loss": -0.0024,
"num_tokens": 756167.0,
"reward": -7.00605583190918,
"reward_std": 16.651020050048828,
"rewards/rollout_reward_func/mean": -7.00605583190918,
"rewards/rollout_reward_func/std": 18.44582176208496,
"sampling/importance_sampling_ratio/max": 0.07095864415168762,
"sampling/importance_sampling_ratio/mean": 0.011077712289988995,
"sampling/importance_sampling_ratio/min": 2.652188067117779e-20,
"sampling/sampling_logp_difference/max": 15.625991821289062,
"sampling/sampling_logp_difference/mean": 0.3300427198410034,
"step": 19,
"step_time": 58.73655633899966
},
{
"clip_ratio/high_max": 0.0068662879057228565,
"clip_ratio/high_mean": 0.0034331439528614283,
"clip_ratio/low_mean": 0.0032655425602570176,
"clip_ratio/low_min": 0.0019841270986944437,
"clip_ratio/region_mean": 0.006698686513118446,
"entropy": 2.3259487748146057,
"epoch": 0.0002,
"grad_norm": 0.030740659683942795,
"kl": 0.0018880682764574885,
"learning_rate": 5.428571428571429e-06,
"loss": -0.0023,
"step": 20,
"step_time": 7.368399762000195
},
{
"clip_ratio/high_max": 0.0018939394503831863,
"clip_ratio/high_mean": 0.0009469697251915932,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0009469697251915932,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2070.0,
"completions/max_terminated_length": 2070.0,
"completions/mean_length": 1879.1875,
"completions/mean_terminated_length": 1879.1875,
"completions/min_length": 700.0,
"completions/min_terminated_length": 700.0,
"entropy": 2.2097203731536865,
"epoch": 0.00021,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.02687995322048664,
"kl": 0.001916136905492749,
"learning_rate": 5.7142857142857145e-06,
"loss": -0.0002,
"num_tokens": 838107.0,
"reward": -2.38564395904541,
"reward_std": 11.885160446166992,
"rewards/rollout_reward_func/mean": -2.38564395904541,
"rewards/rollout_reward_func/std": 15.615160942077637,
"sampling/importance_sampling_ratio/max": 0.08406510949134827,
"sampling/importance_sampling_ratio/mean": 0.012403802014887333,
"sampling/importance_sampling_ratio/min": 7.145396301283091e-16,
"sampling/sampling_logp_difference/max": 12.1469087600708,
"sampling/sampling_logp_difference/mean": 0.2651059329509735,
"step": 21,
"step_time": 59.30298631099993
},
{
"clip_ratio/high_max": 0.0018939394503831863,
"clip_ratio/high_mean": 0.0009469697251915932,
"clip_ratio/low_mean": 0.0012019231216982007,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002148892846889794,
"entropy": 2.212069094181061,
"epoch": 0.00022,
"grad_norm": 0.02957003004848957,
"kl": 0.0021696449111914262,
"learning_rate": 6e-06,
"loss": -0.0002,
"step": 22,
"step_time": 6.887106450000374
},
{
"clip_ratio/high_max": 0.002358490601181984,
"clip_ratio/high_mean": 0.001179245300590992,
"clip_ratio/low_mean": 0.0033517052652314305,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0045309505658224225,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2068.0,
"completions/max_terminated_length": 2068.0,
"completions/mean_length": 1889.875,
"completions/mean_terminated_length": 1889.875,
"completions/min_length": 1525.0,
"completions/min_terminated_length": 1525.0,
"entropy": 2.1723521649837494,
"epoch": 0.00023,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.022851742804050446,
"kl": 0.003631043611676432,
"learning_rate": 6.285714285714286e-06,
"loss": -0.0045,
"num_tokens": 919971.0,
"reward": -6.60746955871582,
"reward_std": 11.911310195922852,
"rewards/rollout_reward_func/mean": -6.60746955871582,
"rewards/rollout_reward_func/std": 12.598552703857422,
"sampling/importance_sampling_ratio/max": 0.022619599476456642,
"sampling/importance_sampling_ratio/mean": 0.00866013765335083,
"sampling/importance_sampling_ratio/min": 1.8946926625573762e-16,
"sampling/sampling_logp_difference/max": 12.770915031433105,
"sampling/sampling_logp_difference/mean": 0.2685585618019104,
"step": 23,
"step_time": 66.98576966499968
},
{
"clip_ratio/high_max": 0.004673305433243513,
"clip_ratio/high_mean": 0.0023366527166217566,
"clip_ratio/low_mean": 0.0012254902394488454,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003562142956070602,
"entropy": 2.1722511053085327,
"epoch": 0.00024,
"grad_norm": 0.026614658534526825,
"kl": 0.004136091796681285,
"learning_rate": 6.571428571428572e-06,
"loss": -0.0045,
"step": 24,
"step_time": 6.895327041999735
},
{
"clip_ratio/high_max": 0.005816170945763588,
"clip_ratio/high_mean": 0.002908085472881794,
"clip_ratio/low_mean": 0.002100988756865263,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005009074229747057,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2360.0,
"completions/max_terminated_length": 2360.0,
"completions/mean_length": 2112.0625,
"completions/mean_terminated_length": 2112.0625,
"completions/min_length": 386.0,
"completions/min_terminated_length": 386.0,
"entropy": 2.1737034767866135,
"epoch": 0.00025,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.013818159699440002,
"kl": 0.0030502460795105435,
"learning_rate": 6.857142857142858e-06,
"loss": 0.0031,
"num_tokens": 1008470.0,
"reward": -4.425507068634033,
"reward_std": 14.518598556518555,
"rewards/rollout_reward_func/mean": -4.425507068634033,
"rewards/rollout_reward_func/std": 15.62421989440918,
"sampling/importance_sampling_ratio/max": 0.2350578010082245,
"sampling/importance_sampling_ratio/mean": 0.013511145487427711,
"sampling/importance_sampling_ratio/min": 2.6986267040271933e-18,
"sampling/sampling_logp_difference/max": 18.317167282104492,
"sampling/sampling_logp_difference/mean": 0.2883983850479126,
"step": 25,
"step_time": 75.20574624999995
},
{
"clip_ratio/high_max": 0.0036231884732842445,
"clip_ratio/high_mean": 0.0018115942366421223,
"clip_ratio/low_mean": 0.003012447035871446,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004824041272513568,
"entropy": 2.1722775399684906,
"epoch": 0.00026,
"grad_norm": 0.013290848582983017,
"kl": 0.0030411222542170435,
"learning_rate": 7.1428571428571436e-06,
"loss": 0.0031,
"step": 26,
"step_time": 7.616344220999963
},
{
"clip_ratio/high_max": 0.006330503383651376,
"clip_ratio/high_mean": 0.003165251691825688,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003165251691825688,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2344.0,
"completions/max_terminated_length": 2344.0,
"completions/mean_length": 2142.9375,
"completions/mean_terminated_length": 2142.9375,
"completions/min_length": 634.0,
"completions/min_terminated_length": 634.0,
"entropy": 2.320455104112625,
"epoch": 0.00027,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01896420307457447,
"kl": 0.002832541649695486,
"learning_rate": 7.428571428571429e-06,
"loss": 0.0027,
"num_tokens": 1098577.0,
"reward": -13.397226333618164,
"reward_std": 12.59660816192627,
"rewards/rollout_reward_func/mean": -13.397226333618164,
"rewards/rollout_reward_func/std": 14.914674758911133,
"sampling/importance_sampling_ratio/max": 0.1061810627579689,
"sampling/importance_sampling_ratio/mean": 0.008775782771408558,
"sampling/importance_sampling_ratio/min": 7.311078333300841e-31,
"sampling/sampling_logp_difference/max": 18.154491424560547,
"sampling/sampling_logp_difference/mean": 0.3568466603755951,
"step": 27,
"step_time": 76.95750552300024
},
{
"clip_ratio/high_max": 0.00859679956920445,
"clip_ratio/high_mean": 0.004298399784602225,
"clip_ratio/low_mean": 0.0019767729099839926,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006275172694586217,
"entropy": 2.319840043783188,
"epoch": 0.00028,
"grad_norm": 0.01972121186554432,
"kl": 0.003210447379387915,
"learning_rate": 7.714285714285716e-06,
"loss": 0.0027,
"step": 28,
"step_time": 7.581604469000013
},
{
"clip_ratio/high_max": 0.004324290552176535,
"clip_ratio/high_mean": 0.0021621452760882676,
"clip_ratio/low_mean": 0.0010593220358714461,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0032214673119597137,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2252.0,
"completions/max_terminated_length": 2252.0,
"completions/mean_length": 2059.75,
"completions/mean_terminated_length": 2059.258056640625,
"completions/min_length": 654.0,
"completions/min_terminated_length": 654.0,
"entropy": 2.2485389709472656,
"epoch": 0.00029,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.017093749716877937,
"kl": 0.0025801300653256476,
"learning_rate": 8.000000000000001e-06,
"loss": -0.0023,
"num_tokens": 1185175.0,
"reward": -10.598124504089355,
"reward_std": 11.341144561767578,
"rewards/rollout_reward_func/mean": -10.598124504089355,
"rewards/rollout_reward_func/std": 12.110883712768555,
"sampling/importance_sampling_ratio/max": 0.04109551012516022,
"sampling/importance_sampling_ratio/mean": 0.006022544577717781,
"sampling/importance_sampling_ratio/min": 2.8597005269629276e-29,
"sampling/sampling_logp_difference/max": 13.06977367401123,
"sampling/sampling_logp_difference/mean": 0.28142935037612915,
"step": 29,
"step_time": 77.39934379400052
},
{
"clip_ratio/high_max": 0.010767225176095963,
"clip_ratio/high_mean": 0.005383612588047981,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005383612588047981,
"entropy": 2.2509743869304657,
"epoch": 0.0003,
"grad_norm": 0.01782657578587532,
"kl": 0.002700227312743664,
"learning_rate": 8.285714285714287e-06,
"loss": -0.0023,
"step": 30,
"step_time": 7.306394946999944
},
{
"clip_ratio/high_max": 0.008653939235955477,
"clip_ratio/high_mean": 0.004326969617977738,
"clip_ratio/low_mean": 0.0040421567391604185,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008369126415345818,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2400.0,
"completions/max_terminated_length": 2400.0,
"completions/mean_length": 2102.53125,
"completions/mean_terminated_length": 2102.53125,
"completions/min_length": 718.0,
"completions/min_terminated_length": 718.0,
"entropy": 2.3924789130687714,
"epoch": 0.00031,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.02806187979876995,
"kl": 0.0026975919608958066,
"learning_rate": 8.571428571428571e-06,
"loss": 0.0024,
"num_tokens": 1273982.0,
"reward": -8.974254608154297,
"reward_std": 14.775591850280762,
"rewards/rollout_reward_func/mean": -8.974254608154297,
"rewards/rollout_reward_func/std": 14.762958526611328,
"sampling/importance_sampling_ratio/max": 0.08338890224695206,
"sampling/importance_sampling_ratio/mean": 0.00720847537741065,
"sampling/importance_sampling_ratio/min": 3.938114484780906e-20,
"sampling/sampling_logp_difference/max": 12.979628562927246,
"sampling/sampling_logp_difference/mean": 0.3850451111793518,
"step": 31,
"step_time": 78.26030889900039
},
{
"clip_ratio/high_max": 0.013500758213922381,
"clip_ratio/high_mean": 0.006750379106961191,
"clip_ratio/low_mean": 0.0008680555620230734,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007618434610776603,
"entropy": 2.3886922001838684,
"epoch": 0.00032,
"grad_norm": 0.02739325352013111,
"kl": 0.0021556210485869087,
"learning_rate": 8.857142857142858e-06,
"loss": 0.0024,
"step": 32,
"step_time": 7.6260315599995465
},
{
"clip_ratio/high_max": 0.005404347903095186,
"clip_ratio/high_mean": 0.002702173951547593,
"clip_ratio/low_mean": 0.001923076924867928,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004625250876415521,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2525.0,
"completions/max_terminated_length": 2525.0,
"completions/mean_length": 2267.0,
"completions/mean_terminated_length": 2267.0,
"completions/min_length": 1508.0,
"completions/min_terminated_length": 1508.0,
"entropy": 2.2084928154945374,
"epoch": 0.00033,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.00787295587360859,
"kl": 0.0015827158640604466,
"learning_rate": 9.142857142857144e-06,
"loss": -0.0001,
"num_tokens": 1367561.0,
"reward": -5.180222988128662,
"reward_std": 17.71061134338379,
"rewards/rollout_reward_func/mean": -5.180222988128662,
"rewards/rollout_reward_func/std": 20.794387817382812,
"sampling/importance_sampling_ratio/max": 0.011497444473206997,
"sampling/importance_sampling_ratio/mean": 0.004206728655844927,
"sampling/importance_sampling_ratio/min": 2.4751660744964696e-22,
"sampling/sampling_logp_difference/max": 21.120386123657227,
"sampling/sampling_logp_difference/mean": 0.29462122917175293,
"step": 33,
"step_time": 80.87171731000012
},
{
"clip_ratio/high_max": 0.003289473708719015,
"clip_ratio/high_mean": 0.0016447368543595076,
"clip_ratio/low_mean": 0.0037393163074739277,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005384053161833435,
"entropy": 2.217702329158783,
"epoch": 0.00034,
"grad_norm": 0.01408409047871828,
"kl": 0.002227816090453416,
"learning_rate": 9.42857142857143e-06,
"loss": -0.0001,
"step": 34,
"step_time": 8.029501602999972
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2603.0,
"completions/max_terminated_length": 2603.0,
"completions/mean_length": 2335.4375,
"completions/mean_terminated_length": 2335.4375,
"completions/min_length": 891.0,
"completions/min_terminated_length": 891.0,
"entropy": 2.327633857727051,
"epoch": 0.00035,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01498799491673708,
"kl": 0.0023266323769348674,
"learning_rate": 9.714285714285715e-06,
"loss": 0.0035,
"num_tokens": 1463740.0,
"reward": -9.247687339782715,
"reward_std": 12.140151977539062,
"rewards/rollout_reward_func/mean": -9.247687339782715,
"rewards/rollout_reward_func/std": 13.351397514343262,
"sampling/importance_sampling_ratio/max": 0.0675792247056961,
"sampling/importance_sampling_ratio/mean": 0.005599465221166611,
"sampling/importance_sampling_ratio/min": 7.004530503774031e-41,
"sampling/sampling_logp_difference/max": 17.645164489746094,
"sampling/sampling_logp_difference/mean": 0.38000231981277466,
"step": 35,
"step_time": 80.66461238600027
},
{
"clip_ratio/high_max": 0.009387426427565515,
"clip_ratio/high_mean": 0.0046937132137827575,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0046937132137827575,
"entropy": 2.3282170593738556,
"epoch": 0.00036,
"grad_norm": 0.016069183126091957,
"kl": 0.0027750690060202032,
"learning_rate": 1e-05,
"loss": 0.0035,
"step": 36,
"step_time": 8.139321300999654
},
{
"clip_ratio/high_max": 0.0018382353009656072,
"clip_ratio/high_mean": 0.0009191176504828036,
"clip_ratio/low_mean": 0.0010080644860863686,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0019271821365691721,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2526.0,
"completions/max_terminated_length": 2526.0,
"completions/mean_length": 2198.28125,
"completions/mean_terminated_length": 2192.419189453125,
"completions/min_length": 582.0,
"completions/min_terminated_length": 582.0,
"entropy": 2.2252254486083984,
"epoch": 0.00037,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0396745391190052,
"kl": 0.0024436857129330747,
"learning_rate": 9.999999999948591e-06,
"loss": 0.0037,
"num_tokens": 1555280.0,
"reward": -10.988698959350586,
"reward_std": 11.484918594360352,
"rewards/rollout_reward_func/mean": -10.988698959350586,
"rewards/rollout_reward_func/std": 13.74112606048584,
"sampling/importance_sampling_ratio/max": 0.1169009730219841,
"sampling/importance_sampling_ratio/mean": 0.009122872725129128,
"sampling/importance_sampling_ratio/min": 6.878056424215178e-17,
"sampling/sampling_logp_difference/max": 17.073537826538086,
"sampling/sampling_logp_difference/mean": 0.2771769165992737,
"step": 37,
"step_time": 79.2830818729999
},
{
"clip_ratio/high_max": 0.001923076924867928,
"clip_ratio/high_mean": 0.000961538462433964,
"clip_ratio/low_mean": 0.0015243901871144772,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002485928649548441,
"entropy": 2.230591207742691,
"epoch": 0.00038,
"grad_norm": 0.0169509444385767,
"kl": 0.002311948119313456,
"learning_rate": 9.999999999794362e-06,
"loss": 0.0037,
"step": 38,
"step_time": 8.018443904999458
},
{
"clip_ratio/high_max": 0.003639505594037473,
"clip_ratio/high_mean": 0.0018197527970187366,
"clip_ratio/low_mean": 0.0008333333535119891,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0026530861505307257,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2566.0,
"completions/max_terminated_length": 2566.0,
"completions/mean_length": 2258.4375,
"completions/mean_terminated_length": 2258.4375,
"completions/min_length": 1440.0,
"completions/min_terminated_length": 1440.0,
"entropy": 2.2081351578235626,
"epoch": 0.00039,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.009336345829069614,
"kl": 0.0021164097925066017,
"learning_rate": 9.999999999537309e-06,
"loss": 0.0004,
"num_tokens": 1649454.0,
"reward": -10.517179489135742,
"reward_std": 9.59807014465332,
"rewards/rollout_reward_func/mean": -10.517179489135742,
"rewards/rollout_reward_func/std": 9.964343070983887,
"sampling/importance_sampling_ratio/max": 0.012029355391860008,
"sampling/importance_sampling_ratio/mean": 0.003381735645234585,
"sampling/importance_sampling_ratio/min": 1.5594409263266153e-17,
"sampling/sampling_logp_difference/max": 18.87306022644043,
"sampling/sampling_logp_difference/mean": 0.362891286611557,
"step": 39,
"step_time": 80.33267479000028
},
{
"clip_ratio/high_max": 0.009275426273234189,
"clip_ratio/high_mean": 0.005543510254938155,
"clip_ratio/low_mean": 0.0009765625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006520072813145816,
"entropy": 2.216031640768051,
"epoch": 0.0004,
"grad_norm": 0.010434896685183048,
"kl": 0.001769829061231576,
"learning_rate": 9.999999999177437e-06,
"loss": 0.0004,
"step": 40,
"step_time": 8.050674242000014
},
{
"clip_ratio/high_max": 0.00331838964484632,
"clip_ratio/high_mean": 0.00165919482242316,
"clip_ratio/low_mean": 0.001736446050927043,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003395640873350203,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2887.0,
"completions/max_terminated_length": 2887.0,
"completions/mean_length": 2683.46875,
"completions/mean_terminated_length": 2683.46875,
"completions/min_length": 2241.0,
"completions/min_terminated_length": 2241.0,
"entropy": 2.259254366159439,
"epoch": 0.00041,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.012253516353666782,
"kl": 0.0026411395665491,
"learning_rate": 9.999999998714745e-06,
"loss": 0.0007,
"num_tokens": 1756670.0,
"reward": -11.294092178344727,
"reward_std": 12.929180145263672,
"rewards/rollout_reward_func/mean": -11.294092178344727,
"rewards/rollout_reward_func/std": 12.838841438293457,
"sampling/importance_sampling_ratio/max": 0.005970899015665054,
"sampling/importance_sampling_ratio/mean": 0.0017320181941613555,
"sampling/importance_sampling_ratio/min": 1.393524575839586e-16,
"sampling/sampling_logp_difference/max": 12.961386680603027,
"sampling/sampling_logp_difference/mean": 0.3150481879711151,
"step": 41,
"step_time": 91.85784664900052
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0017123287543654442,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00344843982020393,
"entropy": 2.2678737342357635,
"epoch": 0.00042,
"grad_norm": 0.009364346042275429,
"kl": 0.002545906128943898,
"learning_rate": 9.999999998149234e-06,
"loss": 0.0007,
"step": 42,
"step_time": 8.943521398999792
},
{
"clip_ratio/high_max": 0.0031066687079146504,
"clip_ratio/high_mean": 0.0015533343539573252,
"clip_ratio/low_mean": 0.0063974635559134185,
"clip_ratio/low_min": 0.0017857142956927419,
"clip_ratio/region_mean": 0.007950797851663083,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2973.0,
"completions/max_terminated_length": 2973.0,
"completions/mean_length": 2525.125,
"completions/mean_terminated_length": 2525.125,
"completions/min_length": 375.0,
"completions/min_terminated_length": 375.0,
"entropy": 2.2919381260871887,
"epoch": 0.00043,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.012219212017953396,
"kl": 0.002040403662249446,
"learning_rate": 9.999999997480901e-06,
"loss": -0.0004,
"num_tokens": 1858672.0,
"reward": -11.096410751342773,
"reward_std": 12.537565231323242,
"rewards/rollout_reward_func/mean": -11.096410751342773,
"rewards/rollout_reward_func/std": 12.965230941772461,
"sampling/importance_sampling_ratio/max": 0.17080777883529663,
"sampling/importance_sampling_ratio/mean": 0.0070959883742034435,
"sampling/importance_sampling_ratio/min": 2.0908910633189203e-31,
"sampling/sampling_logp_difference/max": 18.829500198364258,
"sampling/sampling_logp_difference/mean": 0.3667473793029785,
"step": 43,
"step_time": 91.86805722899953
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0007183908019214869,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0007183908019214869,
"entropy": 2.2995532751083374,
"epoch": 0.00044,
"grad_norm": 0.011969598941504955,
"kl": 0.0019854862766806036,
"learning_rate": 9.999999996709749e-06,
"loss": -0.0004,
"step": 44,
"step_time": 9.18175687799976
},
{
"clip_ratio/high_max": 0.012487898580729961,
"clip_ratio/high_mean": 0.007980060297995806,
"clip_ratio/low_mean": 0.0025955072487704456,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010575567546766251,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2880.0,
"completions/max_terminated_length": 2880.0,
"completions/mean_length": 2663.9375,
"completions/mean_terminated_length": 2665.419189453125,
"completions/min_length": 2200.0,
"completions/min_terminated_length": 2200.0,
"entropy": 2.3340508341789246,
"epoch": 0.00045,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.02003672532737255,
"kl": 0.002692480251425877,
"learning_rate": 9.999999995835775e-06,
"loss": 0.0006,
"num_tokens": 1965272.0,
"reward": -13.600004196166992,
"reward_std": 14.701010704040527,
"rewards/rollout_reward_func/mean": -13.600004196166992,
"rewards/rollout_reward_func/std": 14.582523345947266,
"sampling/importance_sampling_ratio/max": 0.005967509467154741,
"sampling/importance_sampling_ratio/mean": 0.001800880883820355,
"sampling/importance_sampling_ratio/min": 3.8321957037099815e-21,
"sampling/sampling_logp_difference/max": 16.676918029785156,
"sampling/sampling_logp_difference/mean": 0.38897034525871277,
"step": 45,
"step_time": 90.54392121900082
},
{
"clip_ratio/high_max": 0.0064415192464366555,
"clip_ratio/high_mean": 0.00408881512703374,
"clip_ratio/low_mean": 0.0017152255750261247,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005804040702059865,
"entropy": 2.334753155708313,
"epoch": 0.00046,
"grad_norm": 0.009386158548295498,
"kl": 0.0021710961300414056,
"learning_rate": 9.999999994858982e-06,
"loss": 0.0006,
"step": 46,
"step_time": 10.22533744600014
},
{
"clip_ratio/high_max": 0.003598798648454249,
"clip_ratio/high_mean": 0.002667454886250198,
"clip_ratio/low_mean": 0.0009057971183210611,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003573252004571259,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2865.0,
"completions/max_terminated_length": 2865.0,
"completions/mean_length": 2587.71875,
"completions/mean_terminated_length": 2587.71875,
"completions/min_length": 1094.0,
"completions/min_terminated_length": 1094.0,
"entropy": 2.2846151292324066,
"epoch": 0.00047,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01407544780522585,
"kl": 0.003350261024024803,
"learning_rate": 9.999999993779367e-06,
"loss": -0.0001,
"num_tokens": 2069829.0,
"reward": -9.709592819213867,
"reward_std": 8.844062805175781,
"rewards/rollout_reward_func/mean": -9.709592819213867,
"rewards/rollout_reward_func/std": 9.250896453857422,
"sampling/importance_sampling_ratio/max": 0.03837426379323006,
"sampling/importance_sampling_ratio/mean": 0.003228831337764859,
"sampling/importance_sampling_ratio/min": 1.3482284797513267e-27,
"sampling/sampling_logp_difference/max": 19.295942306518555,
"sampling/sampling_logp_difference/mean": 0.35356980562210083,
"step": 47,
"step_time": 92.8158456350011
},
{
"clip_ratio/high_max": 0.008580301189795136,
"clip_ratio/high_mean": 0.004290150594897568,
"clip_ratio/low_mean": 0.001728165545500815,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006018316023983061,
"entropy": 2.2845455408096313,
"epoch": 0.00048,
"grad_norm": 0.013240635395050049,
"kl": 0.003361153867444955,
"learning_rate": 9.999999992596935e-06,
"loss": -0.0001,
"step": 48,
"step_time": 8.908816245000708
},
{
"clip_ratio/high_max": 0.0016025641234591603,
"clip_ratio/high_mean": 0.0008012820617295802,
"clip_ratio/low_mean": 0.0010775862028822303,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0018788682646118104,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 3007.0,
"completions/max_terminated_length": 3007.0,
"completions/mean_length": 2473.59375,
"completions/mean_terminated_length": 2460.774169921875,
"completions/min_length": 185.0,
"completions/min_terminated_length": 185.0,
"entropy": 2.234508216381073,
"epoch": 0.00049,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.013633599504828453,
"kl": 0.0016216770527535118,
"learning_rate": 9.999999991311679e-06,
"loss": -0.0006,
"num_tokens": 2169988.0,
"reward": -11.157156944274902,
"reward_std": 9.950779914855957,
"rewards/rollout_reward_func/mean": -11.157156944274902,
"rewards/rollout_reward_func/std": 11.374809265136719,
"sampling/importance_sampling_ratio/max": 0.28713250160217285,
"sampling/importance_sampling_ratio/mean": 0.018523240461945534,
"sampling/importance_sampling_ratio/min": 2.006449198810831e-20,
"sampling/sampling_logp_difference/max": 18.636167526245117,
"sampling/sampling_logp_difference/mean": 0.3026620149612427,
"step": 49,
"step_time": 87.0453093810006
},
{
"clip_ratio/high_max": 0.009976116823963821,
"clip_ratio/high_mean": 0.006601028784643859,
"clip_ratio/low_mean": 0.002857419603969902,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009458448505029082,
"entropy": 2.2327195405960083,
"epoch": 0.0005,
"grad_norm": 0.013543589971959591,
"kl": 0.001988013260415755,
"learning_rate": 9.999999989923604e-06,
"loss": -0.0006,
"step": 50,
"step_time": 10.626580007000484
},
{
"clip_ratio/high_max": 0.0028511597774922848,
"clip_ratio/high_mean": 0.0014255798887461424,
"clip_ratio/low_mean": 0.0024358974769711494,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003861477307509631,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2977.0,
"completions/max_terminated_length": 2977.0,
"completions/mean_length": 2764.375,
"completions/mean_terminated_length": 2764.375,
"completions/min_length": 2371.0,
"completions/min_terminated_length": 2371.0,
"entropy": 2.372197538614273,
"epoch": 0.00051,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.00828390009701252,
"kl": 0.002092195674777031,
"learning_rate": 9.999999988432709e-06,
"loss": -0.0009,
"num_tokens": 2279279.0,
"reward": -9.928876876831055,
"reward_std": 11.465417861938477,
"rewards/rollout_reward_func/mean": -9.928876876831055,
"rewards/rollout_reward_func/std": 12.258295059204102,
"sampling/importance_sampling_ratio/max": 0.005203698296099901,
"sampling/importance_sampling_ratio/mean": 0.0008916730294004083,
"sampling/importance_sampling_ratio/min": 6.668663636418266e-29,
"sampling/sampling_logp_difference/max": 15.371074676513672,
"sampling/sampling_logp_difference/mean": 0.41038042306900024,
"step": 51,
"step_time": 97.36285335000048
},
{
"clip_ratio/high_max": 0.004987157532013953,
"clip_ratio/high_mean": 0.0024935787660069764,
"clip_ratio/low_mean": 0.002425754675641656,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004919333383440971,
"entropy": 2.3701044023036957,
"epoch": 0.00052,
"grad_norm": 0.009349919855594635,
"kl": 0.0021661788632627577,
"learning_rate": 9.999999986838993e-06,
"loss": -0.001,
"step": 52,
"step_time": 9.234601858999667
},
{
"clip_ratio/high_max": 0.0029559049289673567,
"clip_ratio/high_mean": 0.0014779524644836783,
"clip_ratio/low_mean": 0.0007022471982054412,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0021801996626891196,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3143.0,
"completions/max_terminated_length": 3143.0,
"completions/mean_length": 2911.6875,
"completions/mean_terminated_length": 2911.6875,
"completions/min_length": 1747.0,
"completions/min_terminated_length": 1747.0,
"entropy": 2.095397859811783,
"epoch": 0.00053,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.007544730789959431,
"kl": 0.0014459962039836682,
"learning_rate": 9.999999985142457e-06,
"loss": 0.0005,
"num_tokens": 2393822.0,
"reward": -9.227861404418945,
"reward_std": 13.615285873413086,
"rewards/rollout_reward_func/mean": -9.227861404418945,
"rewards/rollout_reward_func/std": 13.280508995056152,
"sampling/importance_sampling_ratio/max": 0.00818893313407898,
"sampling/importance_sampling_ratio/mean": 0.0016003338387236,
"sampling/importance_sampling_ratio/min": 1.1838428666831955e-38,
"sampling/sampling_logp_difference/max": 19.036222457885742,
"sampling/sampling_logp_difference/mean": 0.27891838550567627,
"step": 53,
"step_time": 94.85144506799998
},
{
"clip_ratio/high_max": 0.007602313125971705,
"clip_ratio/high_mean": 0.004487969708861783,
"clip_ratio/low_mean": 0.0009469697251915932,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005434939434053376,
"entropy": 2.092746779322624,
"epoch": 0.00054,
"grad_norm": 0.007749093230813742,
"kl": 0.001536271462100558,
"learning_rate": 9.999999983343101e-06,
"loss": 0.0005,
"step": 54,
"step_time": 10.820570559000771
},
{
"clip_ratio/high_max": 0.0016666667070239782,
"clip_ratio/high_mean": 0.0008333333535119891,
"clip_ratio/low_mean": 0.0015822785208001733,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0024156119325198233,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 3139.0,
"completions/max_terminated_length": 3139.0,
"completions/mean_length": 2945.40625,
"completions/mean_terminated_length": 2943.9677734375,
"completions/min_length": 2679.0,
"completions/min_terminated_length": 2679.0,
"entropy": 2.1801984012126923,
"epoch": 0.00055,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.006448639556765556,
"kl": 0.0018499534926377237,
"learning_rate": 9.999999981440923e-06,
"loss": -0.0001,
"num_tokens": 2508892.0,
"reward": -13.496590614318848,
"reward_std": 9.91647720336914,
"rewards/rollout_reward_func/mean": -13.496590614318848,
"rewards/rollout_reward_func/std": 10.541536331176758,
"sampling/importance_sampling_ratio/max": 0.004021179396659136,
"sampling/importance_sampling_ratio/mean": 0.0013151702005416155,
"sampling/importance_sampling_ratio/min": 1.5261994142437563e-12,
"sampling/sampling_logp_difference/max": 8.930527687072754,
"sampling/sampling_logp_difference/mean": 0.2391294538974762,
"step": 55,
"step_time": 97.94088426899998
},
{
"clip_ratio/high_max": 0.004934780183248222,
"clip_ratio/high_mean": 0.0032585294102318585,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0032585294102318585,
"entropy": 2.1794978380203247,
"epoch": 0.00056,
"grad_norm": 0.006780738476663828,
"kl": 0.0018694552418310195,
"learning_rate": 9.999999979435926e-06,
"loss": -0.0001,
"step": 56,
"step_time": 9.654078997999477
},
{
"clip_ratio/high_max": 0.005736267426982522,
"clip_ratio/high_mean": 0.002868133713491261,
"clip_ratio/low_mean": 0.002268664597067982,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005136798310559243,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 3438.0,
"completions/max_terminated_length": 3438.0,
"completions/mean_length": 3171.25,
"completions/mean_terminated_length": 3164.322509765625,
"completions/min_length": 2483.0,
"completions/min_terminated_length": 2483.0,
"entropy": 2.1486852020025253,
"epoch": 0.00057,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.005101061426103115,
"kl": 0.001483209984144196,
"learning_rate": 9.999999977328107e-06,
"loss": 0.0002,
"num_tokens": 2631790.0,
"reward": -14.85343074798584,
"reward_std": 14.20746898651123,
"rewards/rollout_reward_func/mean": -14.85343074798584,
"rewards/rollout_reward_func/std": 14.325145721435547,
"sampling/importance_sampling_ratio/max": 0.005566018167883158,
"sampling/importance_sampling_ratio/mean": 0.0012064384063705802,
"sampling/importance_sampling_ratio/min": 5.148599772634619e-16,
"sampling/sampling_logp_difference/max": 12.547440528869629,
"sampling/sampling_logp_difference/mean": 0.24704553186893463,
"step": 57,
"step_time": 104.96052551200137
},
{
"clip_ratio/high_max": 0.004558469052426517,
"clip_ratio/high_mean": 0.0022792345262132585,
"clip_ratio/low_mean": 0.0007716049440205097,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0030508394702337682,
"entropy": 2.1473026871681213,
"epoch": 0.00058,
"grad_norm": 0.004236625507473946,
"kl": 0.001287619597860612,
"learning_rate": 9.99999997511747e-06,
"loss": 0.0002,
"step": 58,
"step_time": 11.613219467000363
},
{
"clip_ratio/high_max": 0.0030487803742289543,
"clip_ratio/high_mean": 0.0015243901871144772,
"clip_ratio/low_mean": 0.005422163347247988,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006946553534362465,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3445.0,
"completions/max_terminated_length": 3445.0,
"completions/mean_length": 3177.1875,
"completions/mean_terminated_length": 3177.1875,
"completions/min_length": 903.0,
"completions/min_terminated_length": 903.0,
"entropy": 2.161461815237999,
"epoch": 0.00059,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.012280529364943504,
"kl": 0.002224226642283611,
"learning_rate": 9.999999972804012e-06,
"loss": -0.0006,
"num_tokens": 2754194.0,
"reward": -13.905904769897461,
"reward_std": 10.438383102416992,
"rewards/rollout_reward_func/mean": -13.905904769897461,
"rewards/rollout_reward_func/std": 11.29019832611084,
"sampling/importance_sampling_ratio/max": 0.02268226072192192,
"sampling/importance_sampling_ratio/mean": 0.0018939973087981343,
"sampling/importance_sampling_ratio/min": 2.697007293984262e-28,
"sampling/sampling_logp_difference/max": 17.155614852905273,
"sampling/sampling_logp_difference/mean": 0.2826978266239166,
"step": 59,
"step_time": 100.61191374800092
},
{
"clip_ratio/high_max": 0.008298700326122344,
"clip_ratio/high_mean": 0.004149350163061172,
"clip_ratio/low_mean": 0.006984907551668584,
"clip_ratio/low_min": 0.0015432098880410194,
"clip_ratio/region_mean": 0.011134257889352739,
"entropy": 2.1591842770576477,
"epoch": 0.0006,
"grad_norm": 0.01143638975918293,
"kl": 0.0023646633271710016,
"learning_rate": 9.999999970387732e-06,
"loss": -0.0006,
"step": 60,
"step_time": 10.382511724000324
},
{
"clip_ratio/high_max": 0.0015432098880410194,
"clip_ratio/high_mean": 0.0007716049440205097,
"clip_ratio/low_mean": 0.002259911096189171,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003031516040209681,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3416.0,
"completions/max_terminated_length": 3416.0,
"completions/mean_length": 3135.84375,
"completions/mean_terminated_length": 3135.84375,
"completions/min_length": 195.0,
"completions/min_terminated_length": 195.0,
"entropy": 2.217515081167221,
"epoch": 0.00061,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.006441728211939335,
"kl": 0.0010476857314642984,
"learning_rate": 9.999999967868633e-06,
"loss": 0.0009,
"num_tokens": 2876044.0,
"reward": -14.489093780517578,
"reward_std": 11.084500312805176,
"rewards/rollout_reward_func/mean": -14.489093780517578,
"rewards/rollout_reward_func/std": 12.003732681274414,
"sampling/importance_sampling_ratio/max": 0.25691941380500793,
"sampling/importance_sampling_ratio/mean": 0.008919022977352142,
"sampling/importance_sampling_ratio/min": 6.2882773049331024e-24,
"sampling/sampling_logp_difference/max": 12.479897499084473,
"sampling/sampling_logp_difference/mean": 0.29724666476249695,
"step": 61,
"step_time": 102.78593670800046
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0007812500116415322,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0007812500116415322,
"entropy": 2.212161585688591,
"epoch": 0.00062,
"grad_norm": 0.005762570537626743,
"kl": 0.001050639031745959,
"learning_rate": 9.999999965246713e-06,
"loss": 0.0009,
"step": 62,
"step_time": 10.780980200999693
},
{
"clip_ratio/high_max": 0.00550881412345916,
"clip_ratio/high_mean": 0.00275440706172958,
"clip_ratio/low_mean": 0.0014889392768964171,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004243346338625997,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3413.0,
"completions/max_terminated_length": 3413.0,
"completions/mean_length": 2979.78125,
"completions/mean_terminated_length": 2979.78125,
"completions/min_length": 280.0,
"completions/min_terminated_length": 280.0,
"entropy": 2.111812949180603,
"epoch": 0.00063,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.03990187868475914,
"kl": 0.001456589438021183,
"learning_rate": 9.999999962521974e-06,
"loss": -0.0043,
"num_tokens": 2992561.0,
"reward": -9.381729125976562,
"reward_std": 10.014938354492188,
"rewards/rollout_reward_func/mean": -9.381729125976562,
"rewards/rollout_reward_func/std": 10.544285774230957,
"sampling/importance_sampling_ratio/max": 0.23116491734981537,
"sampling/importance_sampling_ratio/mean": 0.008492819964885712,
"sampling/importance_sampling_ratio/min": 1.2417058145283537e-26,
"sampling/sampling_logp_difference/max": 17.79161262512207,
"sampling/sampling_logp_difference/mean": 0.31178855895996094,
"step": 63,
"step_time": 105.0415441539999
},
{
"clip_ratio/high_max": 0.004596683429554105,
"clip_ratio/high_mean": 0.0022983417147770524,
"clip_ratio/low_mean": 0.002260544220916927,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004558885877486318,
"entropy": 2.108848959207535,
"epoch": 0.00064,
"grad_norm": 0.039187680929899216,
"kl": 0.00118074486090336,
"learning_rate": 9.999999959694412e-06,
"loss": -0.0042,
"step": 64,
"step_time": 10.22475211300025
},
{
"clip_ratio/high_max": 0.0014204545877873898,
"clip_ratio/high_mean": 0.0014204545877873898,
"clip_ratio/low_mean": 0.0014124744920991361,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002832929079886526,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3596.0,
"completions/max_terminated_length": 3596.0,
"completions/mean_length": 3147.21875,
"completions/mean_terminated_length": 3147.21875,
"completions/min_length": 487.0,
"completions/min_terminated_length": 487.0,
"entropy": 2.150854080915451,
"epoch": 0.00065,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.009450007230043411,
"kl": 0.0011926015722565353,
"learning_rate": 9.999999956764034e-06,
"loss": -0.0003,
"num_tokens": 3114343.0,
"reward": -11.12032413482666,
"reward_std": 18.984477996826172,
"rewards/rollout_reward_func/mean": -11.12032413482666,
"rewards/rollout_reward_func/std": 21.110273361206055,
"sampling/importance_sampling_ratio/max": 0.04695241525769234,
"sampling/importance_sampling_ratio/mean": 0.002132077468559146,
"sampling/importance_sampling_ratio/min": 4.1585366301487205e-15,
"sampling/sampling_logp_difference/max": 12.402358055114746,
"sampling/sampling_logp_difference/mean": 0.2630866467952728,
"step": 65,
"step_time": 106.09026804299992
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0006793478387407959,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0006793478387407959,
"entropy": 2.1468848437070847,
"epoch": 0.00066,
"grad_norm": 0.006407527253031731,
"kl": 0.0014346542666316964,
"learning_rate": 9.999999953730833e-06,
"loss": -0.0003,
"step": 66,
"step_time": 10.735191880000002
},
{
"clip_ratio/high_max": 0.0025255101500079036,
"clip_ratio/high_mean": 0.0012627550750039518,
"clip_ratio/low_mean": 0.0006443298771046102,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001907084952108562,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3504.0,
"completions/max_terminated_length": 3504.0,
"completions/mean_length": 2932.6875,
"completions/mean_terminated_length": 2932.6875,
"completions/min_length": 194.0,
"completions/min_terminated_length": 194.0,
"entropy": 2.0600955486297607,
"epoch": 0.00067,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.018519844859838486,
"kl": 0.001339123715297319,
"learning_rate": 9.99999995059481e-06,
"loss": 0.0045,
"num_tokens": 3229150.0,
"reward": -14.67924690246582,
"reward_std": 13.668952941894531,
"rewards/rollout_reward_func/mean": -14.67924690246582,
"rewards/rollout_reward_func/std": 13.640398979187012,
"sampling/importance_sampling_ratio/max": 0.251010000705719,
"sampling/importance_sampling_ratio/mean": 0.013134480454027653,
"sampling/importance_sampling_ratio/min": 3.752377749352698e-26,
"sampling/sampling_logp_difference/max": 17.058979034423828,
"sampling/sampling_logp_difference/mean": 0.27970272302627563,
"step": 67,
"step_time": 98.07139246399856
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0014369714772328734,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0014369714772328734,
"entropy": 2.056602507829666,
"epoch": 0.00068,
"grad_norm": 0.018268967047333717,
"kl": 0.0017321475461358204,
"learning_rate": 9.99999994735597e-06,
"loss": 0.0045,
"step": 68,
"step_time": 10.463535391000278
},
{
"clip_ratio/high_max": 0.0014367816038429737,
"clip_ratio/high_mean": 0.0013193523045629263,
"clip_ratio/low_mean": 0.00210745120421052,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003426803508773446,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3584.0,
"completions/max_terminated_length": 3584.0,
"completions/mean_length": 2997.875,
"completions/mean_terminated_length": 2997.875,
"completions/min_length": 387.0,
"completions/min_terminated_length": 387.0,
"entropy": 2.179373413324356,
"epoch": 0.00069,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.021575244143605232,
"kl": 0.0025506179299554788,
"learning_rate": 9.999999944014306e-06,
"loss": 0.003,
"num_tokens": 3346039.0,
"reward": -12.414968490600586,
"reward_std": 17.504671096801758,
"rewards/rollout_reward_func/mean": -12.414968490600586,
"rewards/rollout_reward_func/std": 18.227092742919922,
"sampling/importance_sampling_ratio/max": 0.16923412680625916,
"sampling/importance_sampling_ratio/mean": 0.007539688143879175,
"sampling/importance_sampling_ratio/min": 5.152622264460075e-33,
"sampling/sampling_logp_difference/max": 20.113666534423828,
"sampling/sampling_logp_difference/mean": 0.35040098428726196,
"step": 69,
"step_time": 98.19044223200035
},
{
"clip_ratio/high_max": 0.006122596096247435,
"clip_ratio/high_mean": 0.0036622595507651567,
"clip_ratio/low_mean": 0.0029755067662335932,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00663776631699875,
"entropy": 2.1783816814422607,
"epoch": 0.0007,
"grad_norm": 0.014992612414062023,
"kl": 0.0022299339179880917,
"learning_rate": 9.999999940569825e-06,
"loss": 0.003,
"step": 70,
"step_time": 10.582511048000015
},
{
"clip_ratio/high_max": 0.0025389643851667643,
"clip_ratio/high_mean": 0.0012694821925833821,
"clip_ratio/low_mean": 0.0027696280158124864,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004039110150188208,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3695.0,
"completions/max_terminated_length": 3695.0,
"completions/mean_length": 3466.03125,
"completions/mean_terminated_length": 3466.03125,
"completions/min_length": 2833.0,
"completions/min_terminated_length": 2833.0,
"entropy": 2.0841893553733826,
"epoch": 0.00071,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.015231302939355373,
"kl": 0.002882544751628302,
"learning_rate": 9.999999937022522e-06,
"loss": -0.0002,
"num_tokens": 3477790.0,
"reward": -11.157001495361328,
"reward_std": 9.339273452758789,
"rewards/rollout_reward_func/mean": -11.157001495361328,
"rewards/rollout_reward_func/std": 10.505151748657227,
"sampling/importance_sampling_ratio/max": 0.002954554045572877,
"sampling/importance_sampling_ratio/mean": 0.0008290851255878806,
"sampling/importance_sampling_ratio/min": 4.875183740817929e-34,
"sampling/sampling_logp_difference/max": 12.618881225585938,
"sampling/sampling_logp_difference/mean": 0.25001800060272217,
"step": 71,
"step_time": 109.58023633999892
},
{
"clip_ratio/high_max": 0.0010683761211112142,
"clip_ratio/high_mean": 0.0005341880605556071,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0005341880605556071,
"entropy": 2.0846132934093475,
"epoch": 0.00072,
"grad_norm": 0.009933187626302242,
"kl": 0.002236059895949438,
"learning_rate": 9.999999933372398e-06,
"loss": -0.0002,
"step": 72,
"step_time": 10.986284594999688
},
{
"clip_ratio/high_max": 0.010420707054436207,
"clip_ratio/high_mean": 0.006717559706885368,
"clip_ratio/low_mean": 0.006201559328474104,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012919119151774794,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3855.0,
"completions/max_terminated_length": 3855.0,
"completions/mean_length": 3299.0,
"completions/mean_terminated_length": 3299.0,
"completions/min_length": 727.0,
"completions/min_terminated_length": 727.0,
"entropy": 2.229044407606125,
"epoch": 0.00073,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.014358018524944782,
"kl": 0.0021708139684051275,
"learning_rate": 9.999999929619456e-06,
"loss": -0.0008,
"num_tokens": 3603657.0,
"reward": -13.489961624145508,
"reward_std": 14.68436336517334,
"rewards/rollout_reward_func/mean": -13.489961624145508,
"rewards/rollout_reward_func/std": 15.650424003601074,
"sampling/importance_sampling_ratio/max": 0.059434566646814346,
"sampling/importance_sampling_ratio/mean": 0.002726445673033595,
"sampling/importance_sampling_ratio/min": 5.1101291228379537e-39,
"sampling/sampling_logp_difference/max": 17.89449691772461,
"sampling/sampling_logp_difference/mean": 0.3607054650783539,
"step": 73,
"step_time": 111.9296666260002
},
{
"clip_ratio/high_max": 0.010632613790221512,
"clip_ratio/high_mean": 0.0059673485811799765,
"clip_ratio/low_mean": 0.0011278195888735354,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007095168228261173,
"entropy": 2.2271364331245422,
"epoch": 0.00074,
"grad_norm": 0.01777312532067299,
"kl": 0.0019678229436976835,
"learning_rate": 9.99999992576369e-06,
"loss": -0.0008,
"step": 74,
"step_time": 11.365622766999422
},
{
"clip_ratio/high_max": 0.007670001010410488,
"clip_ratio/high_mean": 0.004492895153816789,
"clip_ratio/low_mean": 0.005284888495225459,
"clip_ratio/low_min": 0.0012886597542092204,
"clip_ratio/region_mean": 0.009777783416211605,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 3973.0,
"completions/max_terminated_length": 3973.0,
"completions/mean_length": 3476.15625,
"completions/mean_terminated_length": 3471.67724609375,
"completions/min_length": 1666.0,
"completions/min_terminated_length": 1666.0,
"entropy": 2.1275693476200104,
"epoch": 0.00075,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.00603072764351964,
"kl": 0.0015271206430043094,
"learning_rate": 9.999999921805106e-06,
"loss": -0.0,
"num_tokens": 3735493.0,
"reward": -10.968659400939941,
"reward_std": 9.847198486328125,
"rewards/rollout_reward_func/mean": -10.968659400939941,
"rewards/rollout_reward_func/std": 9.763160705566406,
"sampling/importance_sampling_ratio/max": 0.007295163348317146,
"sampling/importance_sampling_ratio/mean": 0.0008178789867088199,
"sampling/importance_sampling_ratio/min": 3.297161219819182e-30,
"sampling/sampling_logp_difference/max": 16.79660987854004,
"sampling/sampling_logp_difference/mean": 0.2770848870277405,
"step": 75,
"step_time": 116.77516848400046
}
],
"logging_steps": 1.0,
"max_steps": 600000,
"num_input_tokens_seen": 3735493,
"num_train_epochs": 6,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}