LLM_project / checkpoint-4491 /trainer_state.json
narySt's picture
Upload folder using huggingface_hub
a22bb1f verified
Invalid JSON: Unexpected token 'I', ..."ad_norm": Infinity, "... is not valid JSON
{
"best_global_step": 4491,
"best_metric": 0.41118884086608887,
"best_model_checkpoint": "models/grpo_toxic_qwen/checkpoint-4491",
"epoch": 0.9996661101836394,
"eval_steps": 2696,
"global_step": 4491,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.890625,
"completions/max_length": 128.0,
"completions/max_terminated_length": 106.0,
"completions/mean_length": 119.59375,
"completions/mean_terminated_length": 51.142860412597656,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.00022259321090706732,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.030076026916504,
"kl": 3.605344500101637e-05,
"learning_rate": 0.0,
"loss": -0.0286,
"num_tokens": 9462.0,
"reward": -6.696479797363281,
"reward_std": 2.205897808074951,
"rewards/RewardModelWrapper/mean": -6.696479797363281,
"rewards/RewardModelWrapper/std": 2.596616506576538,
"step": 1
},
{
"clip_ratio/high_max": 0.00045590819666228654,
"clip_ratio/high_mean": 0.00045590819666228654,
"clip_ratio/low_mean": 9.893491918848333e-05,
"clip_ratio/low_min": 9.893491918848333e-05,
"clip_ratio/region_mean": 0.0005548431188205485,
"completions/clipped_ratio": 0.91015625,
"completions/max_length": 128.0,
"completions/max_terminated_length": 115.375,
"completions/mean_length": 124.541015625,
"completions/mean_terminated_length": 88.15992164611816,
"completions/min_length": 53.8125,
"completions/min_terminated_length": 53.8125,
"epoch": 0.011129660545353366,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.771069526672363,
"kl": 0.0014390296781742536,
"learning_rate": 7.350000000000001e-07,
"loss": -0.0097,
"num_tokens": 164224.0,
"reward": -6.273432105779648,
"reward_std": 2.3787402510643005,
"rewards/RewardModelWrapper/mean": -6.273432105779648,
"rewards/RewardModelWrapper/std": 3.4789108261466026,
"step": 50
},
{
"clip_ratio/high_max": 0.0075913356387172825,
"clip_ratio/high_mean": 0.0075913356387172825,
"clip_ratio/low_mean": 0.003807623453612905,
"clip_ratio/low_min": 0.003807623453612905,
"clip_ratio/region_mean": 0.011398959086218383,
"completions/clipped_ratio": 0.8915441176470589,
"completions/max_length": 128.0,
"completions/max_terminated_length": 110.17647058823529,
"completions/mean_length": 123.2251838235294,
"completions/mean_terminated_length": 81.49435559441062,
"completions/min_length": 44.470588235294116,
"completions/min_terminated_length": 44.470588235294116,
"epoch": 0.022259321090706732,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.885649681091309,
"kl": 0.019224860495887695,
"learning_rate": 1.485e-06,
"loss": -0.0113,
"num_tokens": 327613.0,
"reward": -5.39674503663007,
"reward_std": 2.7843008882859173,
"rewards/RewardModelWrapper/mean": -5.39674503663007,
"rewards/RewardModelWrapper/std": 3.8948283475988053,
"step": 100
},
{
"clip_ratio/high_max": 0.01675744824227877,
"clip_ratio/high_mean": 0.01675744824227877,
"clip_ratio/low_mean": 0.012073511610215065,
"clip_ratio/low_min": 0.012073511610215065,
"clip_ratio/region_mean": 0.028830959817860276,
"completions/clipped_ratio": 0.9091796875,
"completions/max_length": 128.0,
"completions/max_terminated_length": 107.3125,
"completions/mean_length": 124.3720703125,
"completions/mean_terminated_length": 81.7018609046936,
"completions/min_length": 54.6875,
"completions/min_terminated_length": 46.6875,
"epoch": 0.0333889816360601,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.0026164054870605,
"kl": 0.04671986572444439,
"learning_rate": 2.235e-06,
"loss": 0.0052,
"num_tokens": 482354.0,
"reward": -5.547765076160431,
"reward_std": 2.73693485558033,
"rewards/RewardModelWrapper/mean": -5.547765076160431,
"rewards/RewardModelWrapper/std": 3.441145323216915,
"step": 150
},
{
"clip_ratio/high_max": 0.02414312065928243,
"clip_ratio/high_mean": 0.02414312065928243,
"clip_ratio/low_mean": 0.017463966414215975,
"clip_ratio/low_min": 0.017463966414215975,
"clip_ratio/region_mean": 0.04160708721727133,
"completions/clipped_ratio": 0.9200367647058824,
"completions/max_length": 128.0,
"completions/max_terminated_length": 119.6470588235294,
"completions/mean_length": 125.29503676470588,
"completions/mean_terminated_length": 94.28872680664062,
"completions/min_length": 65.58823529411765,
"completions/min_terminated_length": 65.58823529411765,
"epoch": 0.044518642181413465,
"frac_reward_zero_std": 0.007352941176470588,
"grad_norm": 3.9181442260742188,
"kl": 0.0877579689398408,
"learning_rate": 2.97e-06,
"loss": 0.0105,
"num_tokens": 648123.0,
"reward": -4.304881698944989,
"reward_std": 3.38148234872257,
"rewards/RewardModelWrapper/mean": -4.304881698944989,
"rewards/RewardModelWrapper/std": 4.617817443959853,
"step": 200
},
{
"clip_ratio/high_max": 0.029861916538793595,
"clip_ratio/high_mean": 0.029861916538793595,
"clip_ratio/low_mean": 0.023766413825796917,
"clip_ratio/low_min": 0.023766413825796917,
"clip_ratio/region_mean": 0.05362833026330918,
"completions/clipped_ratio": 0.9172794117647058,
"completions/max_length": 128.0,
"completions/max_terminated_length": 116.0,
"completions/mean_length": 124.69761029411765,
"completions/mean_terminated_length": 89.32857289033778,
"completions/min_length": 53.23529411764706,
"completions/min_terminated_length": 53.23529411764706,
"epoch": 0.05564830272676683,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.056563377380371,
"kl": 0.14030909642577172,
"learning_rate": 2.9836102890962895e-06,
"loss": 0.0228,
"num_tokens": 813050.0,
"reward": -4.152413817013011,
"reward_std": 3.3082274689393887,
"rewards/RewardModelWrapper/mean": -4.152413817013011,
"rewards/RewardModelWrapper/std": 4.367143616956823,
"step": 250
},
{
"clip_ratio/high_max": 0.030323101801332086,
"clip_ratio/high_mean": 0.030323101801332086,
"clip_ratio/low_mean": 0.021581946768565105,
"clip_ratio/low_min": 0.021581946768565105,
"clip_ratio/region_mean": 0.051905048433691266,
"completions/clipped_ratio": 0.9248046875,
"completions/max_length": 128.0,
"completions/max_terminated_length": 116.1875,
"completions/mean_length": 125.275390625,
"completions/mean_terminated_length": 89.46597385406494,
"completions/min_length": 56.5625,
"completions/min_terminated_length": 56.5625,
"epoch": 0.0667779632721202,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.983767032623291,
"kl": 0.1569075232744217,
"learning_rate": 2.966537673571591e-06,
"loss": 0.0317,
"num_tokens": 969156.0,
"reward": -3.388260453939438,
"reward_std": 3.2063718885183334,
"rewards/RewardModelWrapper/mean": -3.388260453939438,
"rewards/RewardModelWrapper/std": 4.789341554045677,
"step": 300
},
{
"clip_ratio/high_max": 0.027688504084944724,
"clip_ratio/high_mean": 0.027688504084944724,
"clip_ratio/low_mean": 0.019530020136153327,
"clip_ratio/low_min": 0.019530020136153327,
"clip_ratio/region_mean": 0.04721852412912995,
"completions/clipped_ratio": 0.9191176470588235,
"completions/max_length": 128.0,
"completions/max_terminated_length": 120.05882352941177,
"completions/mean_length": 124.81709558823529,
"completions/mean_terminated_length": 90.29201911477482,
"completions/min_length": 52.64705882352941,
"completions/min_terminated_length": 52.64705882352941,
"epoch": 0.07790762381747357,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.5877625942230225,
"kl": 0.1643798241391778,
"learning_rate": 2.9494650580468926e-06,
"loss": 0.0293,
"num_tokens": 1134229.0,
"reward": -3.141713114345775,
"reward_std": 3.3975463895236744,
"rewards/RewardModelWrapper/mean": -3.141713114345775,
"rewards/RewardModelWrapper/std": 4.820348431082333,
"step": 350
},
{
"clip_ratio/high_max": 0.028169492546003313,
"clip_ratio/high_mean": 0.028169492546003313,
"clip_ratio/low_mean": 0.019790295051643626,
"clip_ratio/low_min": 0.019790295051643626,
"clip_ratio/region_mean": 0.04795978774316609,
"completions/clipped_ratio": 0.9310661764705882,
"completions/max_length": 128.0,
"completions/max_terminated_length": 108.11764705882354,
"completions/mean_length": 125.64889705882354,
"completions/mean_terminated_length": 87.01379753561581,
"completions/min_length": 62.1764705882353,
"completions/min_terminated_length": 54.64705882352941,
"epoch": 0.08903728436282693,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.695188999176025,
"kl": 0.30343326754868033,
"learning_rate": 2.933416799453676e-06,
"loss": 0.0748,
"num_tokens": 1300167.0,
"reward": -3.474738233229693,
"reward_std": 3.482299538219676,
"rewards/RewardModelWrapper/mean": -3.474738233229693,
"rewards/RewardModelWrapper/std": 4.745730189716115,
"step": 400
},
{
"clip_ratio/high_max": 0.029925933612976224,
"clip_ratio/high_mean": 0.029925933612976224,
"clip_ratio/low_mean": 0.019293442433699966,
"clip_ratio/low_min": 0.019293442433699966,
"clip_ratio/region_mean": 0.04921937589067966,
"completions/clipped_ratio": 0.943359375,
"completions/max_length": 128.0,
"completions/max_terminated_length": 110.9375,
"completions/mean_length": 126.4140625,
"completions/mean_terminated_length": 95.56250047683716,
"completions/min_length": 79.5,
"completions/min_terminated_length": 71.5,
"epoch": 0.1001669449081803,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.636467218399048,
"kl": 0.19514246992766857,
"learning_rate": 2.9163441839289777e-06,
"loss": 0.0415,
"num_tokens": 1457135.0,
"reward": -3.0616072714328766,
"reward_std": 3.3436961472034454,
"rewards/RewardModelWrapper/mean": -3.0616072714328766,
"rewards/RewardModelWrapper/std": 4.945626050233841,
"step": 450
},
{
"clip_ratio/high_max": 0.027343249125406147,
"clip_ratio/high_mean": 0.027343249125406147,
"clip_ratio/low_mean": 0.01768903057440184,
"clip_ratio/low_min": 0.01768903057440184,
"clip_ratio/region_mean": 0.04503227963577956,
"completions/clipped_ratio": 0.9393382352941176,
"completions/max_length": 128.0,
"completions/max_terminated_length": 109.47058823529412,
"completions/mean_length": 126.07444852941177,
"completions/mean_terminated_length": 92.27339037726907,
"completions/min_length": 73.88235294117646,
"completions/min_terminated_length": 66.3529411764706,
"epoch": 0.11129660545353366,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.624467134475708,
"kl": 0.19471221148967743,
"learning_rate": 2.8992715684042796e-06,
"loss": 0.0459,
"num_tokens": 1623608.0,
"reward": -3.0403577299679028,
"reward_std": 3.5023320422453037,
"rewards/RewardModelWrapper/mean": -3.0403577299679028,
"rewards/RewardModelWrapper/std": 4.758344790514777,
"step": 500
},
{
"clip_ratio/high_max": 0.026099461197154596,
"clip_ratio/high_mean": 0.026099461197154596,
"clip_ratio/low_mean": 0.01860616845311597,
"clip_ratio/low_min": 0.01860616845311597,
"clip_ratio/region_mean": 0.04470562972594053,
"completions/clipped_ratio": 0.9209558823529411,
"completions/max_length": 128.0,
"completions/max_terminated_length": 111.3529411764706,
"completions/mean_length": 125.32536764705883,
"completions/mean_terminated_length": 86.94334905287799,
"completions/min_length": 54.64705882352941,
"completions/min_terminated_length": 47.11764705882353,
"epoch": 0.12242626599888703,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.117679119110107,
"kl": 0.1926309671998024,
"learning_rate": 2.882198952879581e-06,
"loss": 0.0424,
"num_tokens": 1789042.0,
"reward": -3.364777831470265,
"reward_std": 3.6073132402756634,
"rewards/RewardModelWrapper/mean": -3.364777831470265,
"rewards/RewardModelWrapper/std": 4.984979461221134,
"step": 550
},
{
"clip_ratio/high_max": 0.027654693657532335,
"clip_ratio/high_mean": 0.027654693657532335,
"clip_ratio/low_mean": 0.01964853117824532,
"clip_ratio/low_min": 0.01964853117824532,
"clip_ratio/region_mean": 0.047303224778734144,
"completions/clipped_ratio": 0.8984375,
"completions/max_length": 128.0,
"completions/max_terminated_length": 124.5,
"completions/mean_length": 124.734375,
"completions/mean_terminated_length": 98.57239484786987,
"completions/min_length": 61.25,
"completions/min_terminated_length": 61.25,
"epoch": 0.1335559265442404,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.241142511367798,
"kl": 0.211693402081728,
"learning_rate": 2.865126337354883e-06,
"loss": 0.0498,
"num_tokens": 1944610.0,
"reward": -2.732655808329582,
"reward_std": 3.617541193962097,
"rewards/RewardModelWrapper/mean": -2.732655808329582,
"rewards/RewardModelWrapper/std": 4.809614151716232,
"step": 600
},
{
"clip_ratio/high_max": 0.027527469391934574,
"clip_ratio/high_mean": 0.027527469391934574,
"clip_ratio/low_mean": 0.019259323065634815,
"clip_ratio/low_min": 0.019259323065634815,
"clip_ratio/region_mean": 0.046786792553029956,
"completions/clipped_ratio": 0.8933823529411765,
"completions/max_length": 128.0,
"completions/max_terminated_length": 121.47058823529412,
"completions/mean_length": 124.16727941176471,
"completions/mean_terminated_length": 96.0017848295324,
"completions/min_length": 59.23529411764706,
"completions/min_terminated_length": 59.23529411764706,
"epoch": 0.14468558708959378,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.524644374847412,
"kl": 0.2376550894230604,
"learning_rate": 2.8480537218301847e-06,
"loss": 0.0528,
"num_tokens": 2109128.0,
"reward": -1.8917093557469986,
"reward_std": 3.8112815267899456,
"rewards/RewardModelWrapper/mean": -1.8917093557469986,
"rewards/RewardModelWrapper/std": 5.167453260982738,
"step": 650
},
{
"clip_ratio/high_max": 0.027425415357574822,
"clip_ratio/high_mean": 0.027425415357574822,
"clip_ratio/low_mean": 0.01982414353871718,
"clip_ratio/low_min": 0.01982414353871718,
"clip_ratio/region_mean": 0.04724955870769918,
"completions/clipped_ratio": 0.8602941176470589,
"completions/max_length": 128.0,
"completions/max_terminated_length": 121.05882352941177,
"completions/mean_length": 123.19117647058823,
"completions/mean_terminated_length": 94.26595889820771,
"completions/min_length": 52.0,
"completions/min_terminated_length": 52.0,
"epoch": 0.15581524763494714,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.250056743621826,
"kl": 0.22009442321956157,
"learning_rate": 2.830981106305486e-06,
"loss": 0.044,
"num_tokens": 2272320.0,
"reward": -2.427665850695442,
"reward_std": 3.78492192661061,
"rewards/RewardModelWrapper/mean": -2.427665850695442,
"rewards/RewardModelWrapper/std": 4.859750719631419,
"step": 700
},
{
"clip_ratio/high_max": 0.02454757507191971,
"clip_ratio/high_mean": 0.02454757507191971,
"clip_ratio/low_mean": 0.0160788345040055,
"clip_ratio/low_min": 0.0160788345040055,
"clip_ratio/region_mean": 0.04062640947755426,
"completions/clipped_ratio": 0.8837890625,
"completions/max_length": 128.0,
"completions/max_terminated_length": 117.25,
"completions/mean_length": 123.8193359375,
"completions/mean_terminated_length": 92.61992502212524,
"completions/min_length": 56.125,
"completions/min_terminated_length": 56.125,
"epoch": 0.1669449081803005,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.143310070037842,
"kl": 0.2187542901188135,
"learning_rate": 2.8139084907807877e-06,
"loss": 0.0458,
"num_tokens": 2426567.0,
"reward": -2.639084130525589,
"reward_std": 4.0981148183345795,
"rewards/RewardModelWrapper/mean": -2.639084130525589,
"rewards/RewardModelWrapper/std": 5.267414927482605,
"step": 750
},
{
"clip_ratio/high_max": 0.023827595426701008,
"clip_ratio/high_mean": 0.023827595426701008,
"clip_ratio/low_mean": 0.01665229408070445,
"clip_ratio/low_min": 0.01665229408070445,
"clip_ratio/region_mean": 0.04047988944686949,
"completions/clipped_ratio": 0.9172794117647058,
"completions/max_length": 128.0,
"completions/max_terminated_length": 117.41176470588235,
"completions/mean_length": 124.52849264705883,
"completions/mean_terminated_length": 86.05495004092946,
"completions/min_length": 53.294117647058826,
"completions/min_terminated_length": 53.294117647058826,
"epoch": 0.17807456872565386,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.82859206199646,
"kl": 0.2341267079859972,
"learning_rate": 2.7968358752560893e-06,
"loss": 0.0495,
"num_tokens": 2591422.0,
"reward": -1.696559471242568,
"reward_std": 4.100044530980727,
"rewards/RewardModelWrapper/mean": -1.696559471242568,
"rewards/RewardModelWrapper/std": 5.4215626155628875,
"step": 800
},
{
"clip_ratio/high_max": 0.025062179565429686,
"clip_ratio/high_mean": 0.025062179565429686,
"clip_ratio/low_mean": 0.018277215642156078,
"clip_ratio/low_min": 0.018277215642156078,
"clip_ratio/region_mean": 0.04333939506206661,
"completions/clipped_ratio": 0.9292279411764706,
"completions/max_length": 128.0,
"completions/max_terminated_length": 113.58823529411765,
"completions/mean_length": 125.45588235294117,
"completions/mean_terminated_length": 91.19166834214154,
"completions/min_length": 58.94117647058823,
"completions/min_terminated_length": 58.94117647058823,
"epoch": 0.18920422927100725,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.91913366317749,
"kl": 0.22518106378614902,
"learning_rate": 2.779763259731391e-06,
"loss": 0.0531,
"num_tokens": 2757254.0,
"reward": -0.3187214837354772,
"reward_std": 5.127424436457017,
"rewards/RewardModelWrapper/mean": -0.3187214837354772,
"rewards/RewardModelWrapper/std": 5.87655990263995,
"step": 850
},
{
"clip_ratio/high_max": 0.02293195443926379,
"clip_ratio/high_mean": 0.02293195443926379,
"clip_ratio/low_mean": 0.017691890239948407,
"clip_ratio/low_min": 0.017691890239948407,
"clip_ratio/region_mean": 0.040623844610527156,
"completions/clipped_ratio": 0.9091796875,
"completions/max_length": 128.0,
"completions/max_terminated_length": 119.875,
"completions/mean_length": 124.306640625,
"completions/mean_terminated_length": 88.5287561416626,
"completions/min_length": 47.5,
"completions/min_terminated_length": 47.5,
"epoch": 0.2003338898163606,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.8705227375030518,
"kl": 0.23920009069144726,
"learning_rate": 2.7626906442066923e-06,
"loss": 0.0608,
"num_tokens": 2912304.0,
"reward": -0.5163702219724655,
"reward_std": 5.298731863498688,
"rewards/RewardModelWrapper/mean": -0.5163702219724655,
"rewards/RewardModelWrapper/std": 5.84825000166893,
"step": 900
},
{
"clip_ratio/high_max": 0.02397001946810633,
"clip_ratio/high_mean": 0.02397001946810633,
"clip_ratio/low_mean": 0.016966249566758053,
"clip_ratio/low_min": 0.016966249566758053,
"clip_ratio/region_mean": 0.040936269152443854,
"completions/clipped_ratio": 0.9053308823529411,
"completions/max_length": 128.0,
"completions/max_terminated_length": 112.17647058823529,
"completions/mean_length": 124.65533088235294,
"completions/mean_terminated_length": 90.35452988568474,
"completions/min_length": 57.64705882352941,
"completions/min_terminated_length": 57.64705882352941,
"epoch": 0.21146355036171396,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.009652137756348,
"kl": 0.28723155200481415,
"learning_rate": 2.7456180286819943e-06,
"loss": 0.0623,
"num_tokens": 3077033.0,
"reward": 0.3360518918317907,
"reward_std": 5.125342537375057,
"rewards/RewardModelWrapper/mean": 0.3360518918317907,
"rewards/RewardModelWrapper/std": 5.78782990399529,
"step": 950
},
{
"clip_ratio/high_max": 0.025908510715235023,
"clip_ratio/high_mean": 0.025908510715235023,
"clip_ratio/low_mean": 0.017599179263343104,
"clip_ratio/low_min": 0.017599179263343104,
"clip_ratio/region_mean": 0.04350769010838121,
"completions/clipped_ratio": 0.9172794117647058,
"completions/max_length": 128.0,
"completions/max_terminated_length": 121.41176470588235,
"completions/mean_length": 125.03125,
"completions/mean_terminated_length": 94.79173772475299,
"completions/min_length": 61.88235294117647,
"completions/min_terminated_length": 61.88235294117647,
"epoch": 0.22259321090706732,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.159637928009033,
"kl": 0.35069699488580225,
"learning_rate": 2.728545413157296e-06,
"loss": 0.0847,
"num_tokens": 3242123.0,
"reward": 1.627946559120627,
"reward_std": 4.790118554059197,
"rewards/RewardModelWrapper/mean": 1.627946559120627,
"rewards/RewardModelWrapper/std": 5.393552022821763,
"step": 1000
},
{
"clip_ratio/high_max": 0.024083305108360945,
"clip_ratio/high_mean": 0.024083305108360945,
"clip_ratio/low_mean": 0.013416973181592766,
"clip_ratio/low_min": 0.013416973181592766,
"clip_ratio/region_mean": 0.0375002783536911,
"completions/clipped_ratio": 0.9267578125,
"completions/max_length": 128.0,
"completions/max_terminated_length": 112.8125,
"completions/mean_length": 125.0126953125,
"completions/mean_terminated_length": 87.55602884292603,
"completions/min_length": 57.0,
"completions/min_terminated_length": 57.0,
"epoch": 0.2337228714524207,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.147945880889893,
"kl": 0.3470122530311346,
"learning_rate": 2.7114727976325973e-06,
"loss": 0.089,
"num_tokens": 3397872.0,
"reward": 0.24295206367969513,
"reward_std": 5.033507749438286,
"rewards/RewardModelWrapper/mean": 0.24295206367969513,
"rewards/RewardModelWrapper/std": 5.808434098958969,
"step": 1050
},
{
"clip_ratio/high_max": 0.024642590049188583,
"clip_ratio/high_mean": 0.024642590049188583,
"clip_ratio/low_mean": 0.013819608901976609,
"clip_ratio/low_min": 0.013819608901976609,
"clip_ratio/region_mean": 0.0384621987817809,
"completions/clipped_ratio": 0.9264705882352942,
"completions/max_length": 128.0,
"completions/max_terminated_length": 112.70588235294117,
"completions/mean_length": 125.2408088235294,
"completions/mean_terminated_length": 90.793908960679,
"completions/min_length": 65.17647058823529,
"completions/min_terminated_length": 65.17647058823529,
"epoch": 0.24485253199777407,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.4217491149902344,
"kl": 0.377669473439455,
"learning_rate": 2.694400182107899e-06,
"loss": 0.0977,
"num_tokens": 3563110.0,
"reward": 0.4081239700317383,
"reward_std": 5.032071225783405,
"rewards/RewardModelWrapper/mean": 0.4081239700317383,
"rewards/RewardModelWrapper/std": 5.893623436198515,
"step": 1100
},
{
"clip_ratio/high_max": 0.022081555526237934,
"clip_ratio/high_mean": 0.022081555526237934,
"clip_ratio/low_mean": 0.015956819643906783,
"clip_ratio/low_min": 0.015956819643906783,
"clip_ratio/region_mean": 0.038038374953903255,
"completions/clipped_ratio": 0.9365808823529411,
"completions/max_length": 128.0,
"completions/max_terminated_length": 95.0,
"completions/mean_length": 125.22426470588235,
"completions/mean_terminated_length": 73.54575303021599,
"completions/min_length": 63.94117647058823,
"completions/min_terminated_length": 48.88235294117647,
"epoch": 0.25598219254312743,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.748858451843262,
"kl": 0.42920990511775015,
"learning_rate": 2.677327566583201e-06,
"loss": 0.1117,
"num_tokens": 3728050.0,
"reward": 1.730754810221055,
"reward_std": 4.819248423856847,
"rewards/RewardModelWrapper/mean": 1.730754810221055,
"rewards/RewardModelWrapper/std": 5.504547006943646,
"step": 1150
},
{
"clip_ratio/high_max": 0.02080470887827687,
"clip_ratio/high_mean": 0.02080470887827687,
"clip_ratio/low_mean": 0.01475024281651713,
"clip_ratio/low_min": 0.01475024281651713,
"clip_ratio/region_mean": 0.03555495172040537,
"completions/clipped_ratio": 0.962890625,
"completions/max_length": 128.0,
"completions/max_terminated_length": 98.625,
"completions/mean_length": 126.416015625,
"completions/mean_terminated_length": 81.55208349227905,
"completions/min_length": 72.375,
"completions/min_terminated_length": 64.375,
"epoch": 0.2671118530884808,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.4883859157562256,
"kl": 0.42720063477754594,
"learning_rate": 2.6602549510585024e-06,
"loss": 0.1195,
"num_tokens": 3884876.0,
"reward": 1.9971511512994766,
"reward_std": 4.786640420556068,
"rewards/RewardModelWrapper/mean": 1.9971511512994766,
"rewards/RewardModelWrapper/std": 5.766968697309494,
"step": 1200
},
{
"clip_ratio/high_max": 0.023233274864032864,
"clip_ratio/high_mean": 0.023233274864032864,
"clip_ratio/low_mean": 0.01158983559376793,
"clip_ratio/low_min": 0.01158983559376793,
"clip_ratio/region_mean": 0.03482311038998887,
"completions/clipped_ratio": 0.9347426470588235,
"completions/max_length": 128.0,
"completions/max_terminated_length": 114.17647058823529,
"completions/mean_length": 125.17738970588235,
"completions/mean_terminated_length": 89.91648954503677,
"completions/min_length": 64.29411764705883,
"completions/min_terminated_length": 64.29411764705883,
"epoch": 0.27824151363383415,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.5916192531585693,
"kl": 0.3887044958770275,
"learning_rate": 2.643182335533804e-06,
"loss": 0.1015,
"num_tokens": 4050013.0,
"reward": 0.8245974989498362,
"reward_std": 5.001701130586512,
"rewards/RewardModelWrapper/mean": 0.8245974989498362,
"rewards/RewardModelWrapper/std": 5.80830400130328,
"step": 1250
},
{
"clip_ratio/high_max": 0.020529154643882067,
"clip_ratio/high_mean": 0.020529154643882067,
"clip_ratio/low_mean": 0.015356352158414665,
"clip_ratio/low_min": 0.015356352158414665,
"clip_ratio/region_mean": 0.03588550680316985,
"completions/clipped_ratio": 0.9512867647058824,
"completions/max_length": 128.0,
"completions/max_terminated_length": 100.76470588235294,
"completions/mean_length": 125.86305147058823,
"completions/mean_terminated_length": 81.50539353314568,
"completions/min_length": 63.470588235294116,
"completions/min_terminated_length": 55.94117647058823,
"epoch": 0.28937117417918756,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.05077600479126,
"kl": 0.4404847612977028,
"learning_rate": 2.6261097200091054e-06,
"loss": 0.1208,
"num_tokens": 4215832.0,
"reward": 2.3118093013763428,
"reward_std": 4.841920866685755,
"rewards/RewardModelWrapper/mean": 2.3118093013763428,
"rewards/RewardModelWrapper/std": 5.525171279907227,
"step": 1300
},
{
"clip_ratio/high_max": 0.024388792894314976,
"clip_ratio/high_mean": 0.024388792894314976,
"clip_ratio/low_mean": 0.015166401157330256,
"clip_ratio/low_min": 0.015166401157330256,
"clip_ratio/region_mean": 0.039555194084532556,
"completions/clipped_ratio": 0.9345703125,
"completions/max_length": 128.0,
"completions/max_terminated_length": 111.5,
"completions/mean_length": 125.7822265625,
"completions/mean_terminated_length": 88.69479322433472,
"completions/min_length": 64.0625,
"completions/min_terminated_length": 56.0625,
"epoch": 0.3005008347245409,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.280036926269531,
"kl": 0.4519739609956741,
"learning_rate": 2.609037104484407e-06,
"loss": 0.1237,
"num_tokens": 4372361.0,
"reward": 2.7925052791833878,
"reward_std": 4.665284767746925,
"rewards/RewardModelWrapper/mean": 2.7925052791833878,
"rewards/RewardModelWrapper/std": 5.4118489027023315,
"step": 1350
},
{
"clip_ratio/high_max": 0.023544567436911166,
"clip_ratio/high_mean": 0.023544567436911166,
"clip_ratio/low_mean": 0.01299051069712732,
"clip_ratio/low_min": 0.01299051069712732,
"clip_ratio/region_mean": 0.03653507822658866,
"completions/clipped_ratio": 0.9292279411764706,
"completions/max_length": 128.0,
"completions/max_terminated_length": 111.82352941176471,
"completions/mean_length": 124.65165441176471,
"completions/mean_terminated_length": 82.68410469503964,
"completions/min_length": 46.8235294117647,
"completions/min_terminated_length": 46.8235294117647,
"epoch": 0.3116304952698943,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.7401034832000732,
"kl": 0.4922306627035141,
"learning_rate": 2.591964488959709e-06,
"loss": 0.1315,
"num_tokens": 4537846.0,
"reward": 2.862899471731747,
"reward_std": 4.948802695554845,
"rewards/RewardModelWrapper/mean": 2.862899471731747,
"rewards/RewardModelWrapper/std": 5.503605421851663,
"step": 1400
},
{
"clip_ratio/high_max": 0.025161673842230812,
"clip_ratio/high_mean": 0.025161673842230812,
"clip_ratio/low_mean": 0.012126781771657989,
"clip_ratio/low_min": 0.012126781771657989,
"clip_ratio/region_mean": 0.037288455746602264,
"completions/clipped_ratio": 0.9292279411764706,
"completions/max_length": 128.0,
"completions/max_terminated_length": 117.17647058823529,
"completions/mean_length": 125.05882352941177,
"completions/mean_terminated_length": 90.7926357493681,
"completions/min_length": 62.11764705882353,
"completions/min_terminated_length": 62.11764705882353,
"epoch": 0.32276015581524764,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.467922687530518,
"kl": 0.46787314653396606,
"learning_rate": 2.5748918734350105e-06,
"loss": 0.1207,
"num_tokens": 4702966.0,
"reward": 1.3220273045932545,
"reward_std": 4.946936158572926,
"rewards/RewardModelWrapper/mean": 1.3220273045932545,
"rewards/RewardModelWrapper/std": 5.816557715920841,
"step": 1450
},
{
"clip_ratio/high_max": 0.025372368972748516,
"clip_ratio/high_mean": 0.025372368972748516,
"clip_ratio/low_mean": 0.01208616121119121,
"clip_ratio/low_min": 0.01208616121119121,
"clip_ratio/region_mean": 0.03745853026397526,
"completions/clipped_ratio": 0.9599609375,
"completions/max_length": 128.0,
"completions/max_terminated_length": 88.5625,
"completions/mean_length": 126.3486328125,
"completions/mean_terminated_length": 74.44687557220459,
"completions/min_length": 73.875,
"completions/min_terminated_length": 57.875,
"epoch": 0.333889816360601,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.3412301540374756,
"kl": 0.5128468088805676,
"learning_rate": 2.557819257910312e-06,
"loss": 0.1397,
"num_tokens": 4860219.0,
"reward": 2.0575065165758133,
"reward_std": 5.155221775174141,
"rewards/RewardModelWrapper/mean": 2.0575065165758133,
"rewards/RewardModelWrapper/std": 5.655524164438248,
"step": 1500
},
{
"clip_ratio/high_max": 0.023876634621992708,
"clip_ratio/high_mean": 0.023876634621992708,
"clip_ratio/low_mean": 0.013062482952955179,
"clip_ratio/low_min": 0.013062482952955179,
"clip_ratio/region_mean": 0.03693911746609956,
"completions/clipped_ratio": 0.9604779411764706,
"completions/max_length": 128.0,
"completions/max_terminated_length": 84.47058823529412,
"completions/mean_length": 126.6001838235294,
"completions/mean_terminated_length": 73.35098131965188,
"completions/min_length": 92.58823529411765,
"completions/min_terminated_length": 62.470588235294116,
"epoch": 0.34501947690595436,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.4934329986572266,
"kl": 0.5125479310750961,
"learning_rate": 2.5407466423856135e-06,
"loss": 0.139,
"num_tokens": 5027488.0,
"reward": 2.7399597448461197,
"reward_std": 4.745976616354549,
"rewards/RewardModelWrapper/mean": 2.7399597448461197,
"rewards/RewardModelWrapper/std": 5.300730144276338,
"step": 1550
},
{
"clip_ratio/high_max": 0.02330939914332703,
"clip_ratio/high_mean": 0.02330939914332703,
"clip_ratio/low_mean": 0.009550860303861555,
"clip_ratio/low_min": 0.009550860303861555,
"clip_ratio/region_mean": 0.032860259409062564,
"completions/clipped_ratio": 0.9641544117647058,
"completions/max_length": 128.0,
"completions/max_terminated_length": 95.70588235294117,
"completions/mean_length": 126.61397058823529,
"completions/mean_terminated_length": 81.39460844152114,
"completions/min_length": 81.88235294117646,
"completions/min_terminated_length": 66.82352941176471,
"epoch": 0.3561491374513077,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.760233402252197,
"kl": 0.5104251652956009,
"learning_rate": 2.523674026860915e-06,
"loss": 0.1396,
"num_tokens": 5193996.0,
"reward": 1.9389969741596895,
"reward_std": 5.14070810991175,
"rewards/RewardModelWrapper/mean": 1.9389969741596895,
"rewards/RewardModelWrapper/std": 5.778058921589571,
"step": 1600
},
{
"clip_ratio/high_max": 0.023669966620218474,
"clip_ratio/high_mean": 0.023669966620218474,
"clip_ratio/low_mean": 0.012192065346171147,
"clip_ratio/low_min": 0.012192065346171147,
"clip_ratio/region_mean": 0.035862031998112796,
"completions/clipped_ratio": 0.9599609375,
"completions/max_length": 128.0,
"completions/max_terminated_length": 105.625,
"completions/mean_length": 126.4326171875,
"completions/mean_terminated_length": 91.0947916507721,
"completions/min_length": 77.5,
"completions/min_terminated_length": 77.5,
"epoch": 0.3672787979966611,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.496079206466675,
"kl": 0.5115452679991722,
"learning_rate": 2.5066014113362166e-06,
"loss": 0.1421,
"num_tokens": 5350823.0,
"reward": 2.4139109551906586,
"reward_std": 4.767535001039505,
"rewards/RewardModelWrapper/mean": 2.4139109551906586,
"rewards/RewardModelWrapper/std": 5.584080070257187,
"step": 1650
},
{
"clip_ratio/high_max": 0.026008948455564677,
"clip_ratio/high_mean": 0.026008948455564677,
"clip_ratio/low_mean": 0.008926556244841777,
"clip_ratio/low_min": 0.008926556244841777,
"clip_ratio/region_mean": 0.0349355046171695,
"completions/clipped_ratio": 0.9604779411764706,
"completions/max_length": 128.0,
"completions/max_terminated_length": 105.94117647058823,
"completions/mean_length": 126.34926470588235,
"completions/mean_terminated_length": 89.24902052037856,
"completions/min_length": 71.0,
"completions/min_terminated_length": 71.0,
"epoch": 0.3784084585420145,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.9608895778656006,
"kl": 0.5186072036623954,
"learning_rate": 2.489528795811518e-06,
"loss": 0.1427,
"num_tokens": 5517627.0,
"reward": 1.1372435163049137,
"reward_std": 5.190363294938031,
"rewards/RewardModelWrapper/mean": 1.1372435163049137,
"rewards/RewardModelWrapper/std": 5.777948155122645,
"step": 1700
},
{
"clip_ratio/high_max": 0.022060031631262973,
"clip_ratio/high_mean": 0.022060031631262973,
"clip_ratio/low_mean": 0.012312272182898596,
"clip_ratio/low_min": 0.012312272182898596,
"clip_ratio/region_mean": 0.03437230377923697,
"completions/clipped_ratio": 0.9632352941176471,
"completions/max_length": 128.0,
"completions/max_terminated_length": 102.82352941176471,
"completions/mean_length": 126.49540441176471,
"completions/mean_terminated_length": 88.83088302612305,
"completions/min_length": 76.3529411764706,
"completions/min_terminated_length": 68.82352941176471,
"epoch": 0.38953811908736785,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.0580315589904785,
"kl": 0.5839428542554379,
"learning_rate": 2.4724561802868197e-06,
"loss": 0.1614,
"num_tokens": 5684102.0,
"reward": 2.689769050654243,
"reward_std": 4.625988932216869,
"rewards/RewardModelWrapper/mean": 2.689769050654243,
"rewards/RewardModelWrapper/std": 5.245194827809053,
"step": 1750
},
{
"clip_ratio/high_max": 0.02739144684630446,
"clip_ratio/high_mean": 0.02739144684630446,
"clip_ratio/low_mean": 0.012341015862475616,
"clip_ratio/low_min": 0.012341015862475616,
"clip_ratio/region_mean": 0.03973246271605604,
"completions/clipped_ratio": 0.95703125,
"completions/max_length": 128.0,
"completions/max_terminated_length": 103.5625,
"completions/mean_length": 126.060546875,
"completions/mean_terminated_length": 83.71354246139526,
"completions/min_length": 61.3125,
"completions/min_terminated_length": 61.3125,
"epoch": 0.4006677796327212,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.872232437133789,
"kl": 0.5619081328809261,
"learning_rate": 2.4553835647621216e-06,
"loss": 0.1509,
"num_tokens": 5840596.0,
"reward": 2.3125159442424774,
"reward_std": 4.9210382997989655,
"rewards/RewardModelWrapper/mean": 2.3125159442424774,
"rewards/RewardModelWrapper/std": 5.377374470233917,
"step": 1800
},
{
"clip_ratio/high_max": 0.024034175912383944,
"clip_ratio/high_mean": 0.024034175912383944,
"clip_ratio/low_mean": 0.009776253007003107,
"clip_ratio/low_min": 0.009776253007003107,
"clip_ratio/region_mean": 0.03381042889552191,
"completions/clipped_ratio": 0.9549632352941176,
"completions/max_length": 128.0,
"completions/max_terminated_length": 101.41176470588235,
"completions/mean_length": 125.99724264705883,
"completions/mean_terminated_length": 81.53921688304229,
"completions/min_length": 65.23529411764706,
"completions/min_terminated_length": 57.705882352941174,
"epoch": 0.41179744017807457,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.8941831588745117,
"kl": 0.615523195117712,
"learning_rate": 2.4383109492374236e-06,
"loss": 0.1677,
"num_tokens": 6007113.0,
"reward": 2.1543740524965176,
"reward_std": 5.060079883126652,
"rewards/RewardModelWrapper/mean": 2.1543740524965176,
"rewards/RewardModelWrapper/std": 5.475296539418838,
"step": 1850
},
{
"clip_ratio/high_max": 0.022410094959195704,
"clip_ratio/high_mean": 0.022410094959195704,
"clip_ratio/low_mean": 0.012868442094186321,
"clip_ratio/low_min": 0.012868442094186321,
"clip_ratio/region_mean": 0.035278537014964965,
"completions/clipped_ratio": 0.9604779411764706,
"completions/max_length": 128.0,
"completions/max_terminated_length": 91.94117647058823,
"completions/mean_length": 126.30238970588235,
"completions/mean_terminated_length": 79.55490246941062,
"completions/min_length": 71.88235294117646,
"completions/min_terminated_length": 64.3529411764706,
"epoch": 0.42292710072342793,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.654233932495117,
"kl": 0.6534902662038803,
"learning_rate": 2.421238333712725e-06,
"loss": 0.1843,
"num_tokens": 6174378.0,
"reward": 2.3925238006255207,
"reward_std": 4.879058487275067,
"rewards/RewardModelWrapper/mean": 2.3925238006255207,
"rewards/RewardModelWrapper/std": 5.418860211091883,
"step": 1900
},
{
"clip_ratio/high_max": 0.021921868621138856,
"clip_ratio/high_mean": 0.021921868621138856,
"clip_ratio/low_mean": 0.011612088698893786,
"clip_ratio/low_min": 0.011612088698893786,
"clip_ratio/region_mean": 0.03353395750047639,
"completions/clipped_ratio": 0.94140625,
"completions/max_length": 128.0,
"completions/max_terminated_length": 107.3125,
"completions/mean_length": 125.32421875,
"completions/mean_terminated_length": 82.90129089355469,
"completions/min_length": 54.5,
"completions/min_terminated_length": 54.5,
"epoch": 0.4340567612687813,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.8473002910614014,
"kl": 0.6599007929861546,
"learning_rate": 2.4041657181880266e-06,
"loss": 0.1769,
"num_tokens": 6330166.0,
"reward": 2.618502587080002,
"reward_std": 4.749881863594055,
"rewards/RewardModelWrapper/mean": 2.618502587080002,
"rewards/RewardModelWrapper/std": 5.46898752450943,
"step": 1950
},
{
"clip_ratio/high_max": 0.02155641552293673,
"clip_ratio/high_mean": 0.02155641552293673,
"clip_ratio/low_mean": 0.009289601502241568,
"clip_ratio/low_min": 0.009289601502241568,
"clip_ratio/region_mean": 0.030846016986761243,
"completions/clipped_ratio": 0.9568014705882353,
"completions/max_length": 128.0,
"completions/max_terminated_length": 101.88235294117646,
"completions/mean_length": 126.37040441176471,
"completions/mean_terminated_length": 83.31176578297334,
"completions/min_length": 72.88235294117646,
"completions/min_terminated_length": 65.3529411764706,
"epoch": 0.44518642181413465,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.8109655380249023,
"kl": 0.7074493160843849,
"learning_rate": 2.387093102663328e-06,
"loss": 0.1974,
"num_tokens": 6496761.0,
"reward": 3.343541706309599,
"reward_std": 4.7992883710300225,
"rewards/RewardModelWrapper/mean": 3.343541706309599,
"rewards/RewardModelWrapper/std": 5.472757451674518,
"step": 2000
},
{
"clip_ratio/high_max": 0.027084801244782283,
"clip_ratio/high_mean": 0.027084801244782283,
"clip_ratio/low_mean": 0.006253871699154843,
"clip_ratio/low_min": 0.006253871699154843,
"clip_ratio/region_mean": 0.03333867286099121,
"completions/clipped_ratio": 0.9503676470588235,
"completions/max_length": 128.0,
"completions/max_terminated_length": 94.88235294117646,
"completions/mean_length": 125.38786764705883,
"completions/mean_terminated_length": 66.87544497321633,
"completions/min_length": 58.1764705882353,
"completions/min_terminated_length": 43.11764705882353,
"epoch": 0.45631608235948806,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.87650203704834,
"kl": 0.6578731602430343,
"learning_rate": 2.3700204871386297e-06,
"loss": 0.1804,
"num_tokens": 6662615.0,
"reward": 1.8048853032729204,
"reward_std": 5.33220240649055,
"rewards/RewardModelWrapper/mean": 1.8048853032729204,
"rewards/RewardModelWrapper/std": 5.8003731334910675,
"step": 2050
},
{
"clip_ratio/high_max": 0.022799394286703318,
"clip_ratio/high_mean": 0.022799394286703318,
"clip_ratio/low_mean": 0.008315351814671886,
"clip_ratio/low_min": 0.008315351814671886,
"clip_ratio/region_mean": 0.03111474617384374,
"completions/clipped_ratio": 0.9609375,
"completions/max_length": 128.0,
"completions/max_terminated_length": 87.0625,
"completions/mean_length": 125.9619140625,
"completions/mean_terminated_length": 69.97916746139526,
"completions/min_length": 70.25,
"completions/min_terminated_length": 54.25,
"epoch": 0.4674457429048414,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.432864189147949,
"kl": 0.7079492492973805,
"learning_rate": 2.3529478716139312e-06,
"loss": 0.1956,
"num_tokens": 6819392.0,
"reward": 2.50741083920002,
"reward_std": 5.116829484701157,
"rewards/RewardModelWrapper/mean": 2.50741083920002,
"rewards/RewardModelWrapper/std": 5.797209560871124,
"step": 2100
},
{
"clip_ratio/high_max": 0.022400263713207094,
"clip_ratio/high_mean": 0.022400263713207094,
"clip_ratio/low_mean": 0.008116541813942604,
"clip_ratio/low_min": 0.008116541813942604,
"clip_ratio/region_mean": 0.030516805413644762,
"completions/clipped_ratio": 0.9347426470588235,
"completions/max_length": 128.0,
"completions/max_terminated_length": 100.88235294117646,
"completions/mean_length": 125.04779411764706,
"completions/mean_terminated_length": 75.39313866110409,
"completions/min_length": 51.1764705882353,
"completions/min_terminated_length": 43.64705882352941,
"epoch": 0.4785754034501948,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.323696136474609,
"kl": 0.7352806448936462,
"learning_rate": 2.3358752560892328e-06,
"loss": 0.1983,
"num_tokens": 6984364.0,
"reward": 2.347130256540635,
"reward_std": 5.3196556708391975,
"rewards/RewardModelWrapper/mean": 2.347130256540635,
"rewards/RewardModelWrapper/std": 5.771211035111371,
"step": 2150
},
{
"clip_ratio/high_max": 0.021681377917993815,
"clip_ratio/high_mean": 0.021681377917993815,
"clip_ratio/low_mean": 0.01035779433674179,
"clip_ratio/low_min": 0.01035779433674179,
"clip_ratio/region_mean": 0.03203917214414105,
"completions/clipped_ratio": 0.9466911764705882,
"completions/max_length": 128.0,
"completions/max_terminated_length": 109.29411764705883,
"completions/mean_length": 125.82996323529412,
"completions/mean_terminated_length": 90.63718593821807,
"completions/min_length": 71.23529411764706,
"completions/min_terminated_length": 71.23529411764706,
"epoch": 0.48970506399554814,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.768919944763184,
"kl": 0.7503913494944573,
"learning_rate": 2.3188026405645343e-06,
"loss": 0.2126,
"num_tokens": 7150483.0,
"reward": 2.530949129777796,
"reward_std": 5.144188319935518,
"rewards/RewardModelWrapper/mean": 2.530949129777796,
"rewards/RewardModelWrapper/std": 5.636184664333568,
"step": 2200
},
{
"clip_ratio/high_max": 0.02238714267965406,
"clip_ratio/high_mean": 0.02238714267965406,
"clip_ratio/low_mean": 0.008641490781737957,
"clip_ratio/low_min": 0.008641490781737957,
"clip_ratio/region_mean": 0.031028633578680454,
"completions/clipped_ratio": 0.927734375,
"completions/max_length": 128.0,
"completions/max_terminated_length": 97.6875,
"completions/mean_length": 125.126953125,
"completions/mean_terminated_length": 79.61108827590942,
"completions/min_length": 60.875,
"completions/min_terminated_length": 52.875,
"epoch": 0.5008347245409015,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.544048309326172,
"kl": 0.8463270646333695,
"learning_rate": 2.3017300250398363e-06,
"loss": 0.234,
"num_tokens": 7305749.0,
"reward": 3.097047299146652,
"reward_std": 5.260514736175537,
"rewards/RewardModelWrapper/mean": 3.097047299146652,
"rewards/RewardModelWrapper/std": 5.711855351924896,
"step": 2250
},
{
"clip_ratio/high_max": 0.0228908458375372,
"clip_ratio/high_mean": 0.0228908458375372,
"clip_ratio/low_mean": 0.009188006882905029,
"clip_ratio/low_min": 0.009188006882905029,
"clip_ratio/region_mean": 0.03207885263953358,
"completions/clipped_ratio": 0.96875,
"completions/max_length": 128.0,
"completions/max_terminated_length": 95.58823529411765,
"completions/mean_length": 126.76011029411765,
"completions/mean_terminated_length": 83.82843219532685,
"completions/min_length": 76.82352941176471,
"completions/min_terminated_length": 69.29411764705883,
"epoch": 0.5119643850862549,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.981594562530518,
"kl": 0.8844366371631622,
"learning_rate": 2.284657409515138e-06,
"loss": 0.256,
"num_tokens": 7472592.0,
"reward": 3.071570908322054,
"reward_std": 5.142256512361414,
"rewards/RewardModelWrapper/mean": 3.071570908322054,
"rewards/RewardModelWrapper/std": 5.772335641524371,
"step": 2300
},
{
"clip_ratio/high_max": 0.0240246270573698,
"clip_ratio/high_mean": 0.0240246270573698,
"clip_ratio/low_mean": 0.0069138467891025355,
"clip_ratio/low_min": 0.0069138467891025355,
"clip_ratio/region_mean": 0.03093847391428426,
"completions/clipped_ratio": 0.9448529411764706,
"completions/max_length": 128.0,
"completions/max_terminated_length": 104.76470588235294,
"completions/mean_length": 125.68014705882354,
"completions/mean_terminated_length": 84.08382460650276,
"completions/min_length": 64.76470588235294,
"completions/min_terminated_length": 57.23529411764706,
"epoch": 0.5230940456316082,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.83440637588501,
"kl": 0.9639204081892967,
"learning_rate": 2.2675847939904393e-06,
"loss": 0.2686,
"num_tokens": 7637628.0,
"reward": 2.8617815410389618,
"reward_std": 5.505871576421401,
"rewards/RewardModelWrapper/mean": 2.8617815410389618,
"rewards/RewardModelWrapper/std": 5.944927496068618,
"step": 2350
},
{
"clip_ratio/high_max": 0.02327756991609931,
"clip_ratio/high_mean": 0.02327756991609931,
"clip_ratio/low_mean": 0.011412573783891275,
"clip_ratio/low_min": 0.011412573783891275,
"clip_ratio/region_mean": 0.03469014364061877,
"completions/clipped_ratio": 0.9521484375,
"completions/max_length": 128.0,
"completions/max_terminated_length": 112.875,
"completions/mean_length": 126.125,
"completions/mean_terminated_length": 91.77031326293945,
"completions/min_length": 70.3125,
"completions/min_terminated_length": 70.3125,
"epoch": 0.5342237061769616,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.203379154205322,
"kl": 1.0460551810264587,
"learning_rate": 2.250512178465741e-06,
"loss": 0.299,
"num_tokens": 7793988.0,
"reward": 3.6445817947387695,
"reward_std": 5.2445206344127655,
"rewards/RewardModelWrapper/mean": 3.6445817947387695,
"rewards/RewardModelWrapper/std": 5.754371851682663,
"step": 2400
},
{
"clip_ratio/high_max": 0.025683601254131647,
"clip_ratio/high_mean": 0.025683601254131647,
"clip_ratio/low_mean": 0.007094714913982898,
"clip_ratio/low_min": 0.007094714913982898,
"clip_ratio/region_mean": 0.032778316254261884,
"completions/clipped_ratio": 0.9310661764705882,
"completions/max_length": 128.0,
"completions/max_terminated_length": 113.58823529411765,
"completions/mean_length": 125.19117647058823,
"completions/mean_terminated_length": 87.4491610807531,
"completions/min_length": 52.588235294117645,
"completions/min_terminated_length": 52.588235294117645,
"epoch": 0.5453533667223149,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.791234970092773,
"kl": 1.0378785887360573,
"learning_rate": 2.233439562941043e-06,
"loss": 0.2908,
"num_tokens": 7958828.0,
"reward": 2.4743111414067886,
"reward_std": 5.666090853074017,
"rewards/RewardModelWrapper/mean": 2.4743111414067886,
"rewards/RewardModelWrapper/std": 6.052795522353229,
"step": 2450
},
{
"clip_ratio/high_max": 0.022869902374222876,
"clip_ratio/high_mean": 0.022869902374222876,
"clip_ratio/low_mean": 0.010338929877325426,
"clip_ratio/low_min": 0.010338929877325426,
"clip_ratio/region_mean": 0.03320883221458644,
"completions/clipped_ratio": 0.9448529411764706,
"completions/max_length": 128.0,
"completions/max_terminated_length": 106.6470588235294,
"completions/mean_length": 125.87040441176471,
"completions/mean_terminated_length": 87.59656883688534,
"completions/min_length": 75.76470588235294,
"completions/min_terminated_length": 68.23529411764706,
"epoch": 0.5564830272676683,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.271297931671143,
"kl": 1.135116419494152,
"learning_rate": 2.2163669474163444e-06,
"loss": 0.3229,
"num_tokens": 8125183.0,
"reward": 2.681288887472714,
"reward_std": 5.512399000280044,
"rewards/RewardModelWrapper/mean": 2.681288887472714,
"rewards/RewardModelWrapper/std": 6.263462291044347,
"step": 2500
},
{
"clip_ratio/high_max": 0.024373745566699655,
"clip_ratio/high_mean": 0.024373745566699655,
"clip_ratio/low_mean": 0.007875631948991213,
"clip_ratio/low_min": 0.007875631948991213,
"clip_ratio/region_mean": 0.032249377460684625,
"completions/clipped_ratio": 0.962890625,
"completions/max_length": 128.0,
"completions/max_terminated_length": 99.3125,
"completions/mean_length": 126.6416015625,
"completions/mean_terminated_length": 84.72916746139526,
"completions/min_length": 75.75,
"completions/min_terminated_length": 67.75,
"epoch": 0.5676126878130217,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.9515485763549805,
"kl": 1.1067718014121055,
"learning_rate": 2.199294331891646e-06,
"loss": 0.3174,
"num_tokens": 8282648.0,
"reward": 2.5410157814621925,
"reward_std": 5.60416579246521,
"rewards/RewardModelWrapper/mean": 2.5410157814621925,
"rewards/RewardModelWrapper/std": 6.249917358160019,
"step": 2550
},
{
"clip_ratio/high_max": 0.021070915756281464,
"clip_ratio/high_mean": 0.021070915756281464,
"clip_ratio/low_mean": 0.010609990251832641,
"clip_ratio/low_min": 0.010609990251832641,
"clip_ratio/region_mean": 0.03168090590508655,
"completions/clipped_ratio": 0.9613970588235294,
"completions/max_length": 128.0,
"completions/max_terminated_length": 89.23529411764706,
"completions/mean_length": 126.42463235294117,
"completions/mean_terminated_length": 78.3034320158117,
"completions/min_length": 81.52941176470588,
"completions/min_terminated_length": 66.47058823529412,
"epoch": 0.5787423483583751,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.162291049957275,
"kl": 1.2047522097826004,
"learning_rate": 2.1822217163669474e-06,
"loss": 0.3462,
"num_tokens": 8449478.0,
"reward": 3.0924135095932903,
"reward_std": 5.470459377064424,
"rewards/RewardModelWrapper/mean": 3.0924135095932903,
"rewards/RewardModelWrapper/std": 6.024646282196045,
"step": 2600
},
{
"clip_ratio/high_max": 0.02261253957170993,
"clip_ratio/high_mean": 0.02261253957170993,
"clip_ratio/low_mean": 0.008833104789373466,
"clip_ratio/low_min": 0.008833104789373466,
"clip_ratio/region_mean": 0.0314456443907693,
"completions/clipped_ratio": 0.9494485294117647,
"completions/max_length": 128.0,
"completions/max_terminated_length": 94.29411764705883,
"completions/mean_length": 125.73161764705883,
"completions/mean_terminated_length": 72.89117723352769,
"completions/min_length": 66.52941176470588,
"completions/min_terminated_length": 51.470588235294116,
"epoch": 0.5898720089037285,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.318300247192383,
"kl": 1.2395999401807785,
"learning_rate": 2.165149100842249e-06,
"loss": 0.3561,
"num_tokens": 8615194.0,
"reward": 2.5635701067307415,
"reward_std": 5.7780221490299,
"rewards/RewardModelWrapper/mean": 2.5635701067307415,
"rewards/RewardModelWrapper/std": 6.476823947008918,
"step": 2650
},
{
"clip_ratio/high_max": 0.02310706490650773,
"clip_ratio/high_mean": 0.02310706490650773,
"clip_ratio/low_mean": 0.008465991305129136,
"clip_ratio/low_min": 0.008465991305129136,
"clip_ratio/region_mean": 0.03157305620610714,
"completions/clipped_ratio": 0.9462890625,
"completions/max_length": 128.0,
"completions/max_terminated_length": 107.625,
"completions/mean_length": 125.8720703125,
"completions/mean_terminated_length": 88.43675756454468,
"completions/min_length": 65.875,
"completions/min_terminated_length": 65.875,
"epoch": 0.6010016694490818,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.8498454093933105,
"kl": 1.2743730303645133,
"learning_rate": 2.148076485317551e-06,
"loss": 0.3642,
"num_tokens": 8771759.0,
"reward": 3.0489635169506073,
"reward_std": 5.676127910614014,
"rewards/RewardModelWrapper/mean": 3.0489635169506073,
"rewards/RewardModelWrapper/std": 6.18413832783699,
"step": 2700
},
{
"clip_ratio/high_max": 0.017379222289891912,
"clip_ratio/high_mean": 0.017379222289891912,
"clip_ratio/low_mean": 0.012123786294832826,
"clip_ratio/low_min": 0.012123786294832826,
"clip_ratio/region_mean": 0.029503008612664416,
"completions/clipped_ratio": 0.9613970588235294,
"completions/max_length": 128.0,
"completions/max_terminated_length": 87.82352941176471,
"completions/mean_length": 126.33823529411765,
"completions/mean_terminated_length": 70.85490282844094,
"completions/min_length": 75.94117647058823,
"completions/min_terminated_length": 53.35294117647059,
"epoch": 0.6121313299944352,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.328779697418213,
"kl": 1.5358505266904832,
"learning_rate": 2.1320282267243345e-06,
"loss": 0.44,
"num_tokens": 8938623.0,
"reward": 4.035378414041856,
"reward_std": 5.26687082122354,
"rewards/RewardModelWrapper/mean": 4.035378414041856,
"rewards/RewardModelWrapper/std": 6.010923722211053,
"step": 2750
},
{
"clip_ratio/high_max": 0.02342768482863903,
"clip_ratio/high_mean": 0.02342768482863903,
"clip_ratio/low_mean": 0.007425281075702514,
"clip_ratio/low_min": 0.007425281075702514,
"clip_ratio/region_mean": 0.03085296612116508,
"completions/clipped_ratio": 0.9476102941176471,
"completions/max_length": 128.0,
"completions/max_terminated_length": 102.82352941176471,
"completions/mean_length": 125.83272058823529,
"completions/mean_terminated_length": 83.69166744456572,
"completions/min_length": 70.94117647058823,
"completions/min_terminated_length": 63.411764705882355,
"epoch": 0.6232609905397886,
"frac_reward_zero_std": 0.0,
"grad_norm": 13.94502067565918,
"kl": 1.3090015414357186,
"learning_rate": 2.114955611199636e-06,
"loss": 0.3749,
"num_tokens": 9104641.0,
"reward": 3.50168057049022,
"reward_std": 5.636927548576804,
"rewards/RewardModelWrapper/mean": 3.50168057049022,
"rewards/RewardModelWrapper/std": 6.223201779758229,
"step": 2800
},
{
"clip_ratio/high_max": 0.023096702507464217,
"clip_ratio/high_mean": 0.023096702507464217,
"clip_ratio/low_mean": 0.01079344226163812,
"clip_ratio/low_min": 0.01079344226163812,
"clip_ratio/region_mean": 0.033890144524630156,
"completions/clipped_ratio": 0.947265625,
"completions/max_length": 128.0,
"completions/max_terminated_length": 111.125,
"completions/mean_length": 126.0732421875,
"completions/mean_terminated_length": 91.28541803359985,
"completions/min_length": 68.25,
"completions/min_terminated_length": 68.25,
"epoch": 0.6343906510851419,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.740921974182129,
"kl": 1.2168986845016478,
"learning_rate": 2.0978829956749376e-06,
"loss": 0.3468,
"num_tokens": 9260988.0,
"reward": 2.903833270072937,
"reward_std": 5.634722024202347,
"rewards/RewardModelWrapper/mean": 2.903833270072937,
"rewards/RewardModelWrapper/std": 6.182769417762756,
"step": 2850
},
{
"clip_ratio/high_max": 0.021131394968833775,
"clip_ratio/high_mean": 0.021131394968833775,
"clip_ratio/low_mean": 0.00905259191960795,
"clip_ratio/low_min": 0.00905259191960795,
"clip_ratio/region_mean": 0.03018398679094389,
"completions/clipped_ratio": 0.9549632352941176,
"completions/max_length": 128.0,
"completions/max_terminated_length": 96.94117647058823,
"completions/mean_length": 125.89613970588235,
"completions/mean_terminated_length": 78.88333488913143,
"completions/min_length": 67.82352941176471,
"completions/min_terminated_length": 60.294117647058826,
"epoch": 0.6455203116304953,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.610988616943359,
"kl": 1.332914224267006,
"learning_rate": 2.080810380150239e-06,
"loss": 0.3851,
"num_tokens": 9427003.0,
"reward": 3.4099216741674088,
"reward_std": 5.599381278542912,
"rewards/RewardModelWrapper/mean": 3.4099216741674088,
"rewards/RewardModelWrapper/std": 6.283486815059886,
"step": 2900
},
{
"clip_ratio/high_max": 0.024977084384299814,
"clip_ratio/high_mean": 0.024977084384299814,
"clip_ratio/low_mean": 0.009850850635266396,
"clip_ratio/low_min": 0.009850850635266396,
"clip_ratio/region_mean": 0.034827935132198035,
"completions/clipped_ratio": 0.9430147058823529,
"completions/max_length": 128.0,
"completions/max_terminated_length": 105.11764705882354,
"completions/mean_length": 125.54503676470588,
"completions/mean_terminated_length": 85.04131810805377,
"completions/min_length": 64.41176470588235,
"completions/min_terminated_length": 64.41176470588235,
"epoch": 0.6566499721758486,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.673405170440674,
"kl": 1.34111887216568,
"learning_rate": 2.0637377646255406e-06,
"loss": 0.3787,
"num_tokens": 9592884.0,
"reward": 3.767064431134392,
"reward_std": 5.628603626700008,
"rewards/RewardModelWrapper/mean": 3.767064431134392,
"rewards/RewardModelWrapper/std": 6.238466964048498,
"step": 2950
},
{
"clip_ratio/high_max": 0.019235485673416406,
"clip_ratio/high_mean": 0.019235485673416406,
"clip_ratio/low_mean": 0.008951259328168816,
"clip_ratio/low_min": 0.008951259328168816,
"clip_ratio/region_mean": 0.02818674497772008,
"completions/clipped_ratio": 0.958984375,
"completions/max_length": 128.0,
"completions/max_terminated_length": 93.625,
"completions/mean_length": 126.48046875,
"completions/mean_terminated_length": 79.1166672706604,
"completions/min_length": 76.125,
"completions/min_terminated_length": 60.125,
"epoch": 0.667779632721202,
"frac_reward_zero_std": 0.0,
"grad_norm": null,
"kl": 1.6551162710785865,
"learning_rate": 2.0470066014113363e-06,
"loss": 0.4809,
"num_tokens": 9750288.0,
"reward": 3.3632944226264954,
"reward_std": 5.644728451967239,
"rewards/RewardModelWrapper/mean": 3.3632944226264954,
"rewards/RewardModelWrapper/std": 6.475361466407776,
"step": 3000
},
{
"clip_ratio/high_max": 0.021347561194561424,
"clip_ratio/high_mean": 0.021347561194561424,
"clip_ratio/low_mean": 0.012039180095889605,
"clip_ratio/low_min": 0.012039180095889605,
"clip_ratio/region_mean": 0.03338674116646871,
"completions/clipped_ratio": 0.9641544117647058,
"completions/max_length": 128.0,
"completions/max_terminated_length": 84.29411764705883,
"completions/mean_length": 126.47426470588235,
"completions/mean_terminated_length": 67.78921643425437,
"completions/min_length": 73.29411764705883,
"completions/min_terminated_length": 50.705882352941174,
"epoch": 0.6789092932665554,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.413740158081055,
"kl": 1.3862884595990181,
"learning_rate": 2.030275438197132e-06,
"loss": 0.4017,
"num_tokens": 9917180.0,
"reward": 3.722391970017377,
"reward_std": 5.822299059699564,
"rewards/RewardModelWrapper/mean": 3.722391970017377,
"rewards/RewardModelWrapper/std": 6.463091822231517,
"step": 3050
},
{
"clip_ratio/high_max": 0.018999405660433694,
"clip_ratio/high_mean": 0.018999405660433694,
"clip_ratio/low_mean": 0.010441597908793484,
"clip_ratio/low_min": 0.010441597908793484,
"clip_ratio/region_mean": 0.029441003524698316,
"completions/clipped_ratio": 0.9586397058823529,
"completions/max_length": 128.0,
"completions/max_terminated_length": 112.29411764705883,
"completions/mean_length": 126.4623161764706,
"completions/mean_terminated_length": 90.0686279745663,
"completions/min_length": 63.94117647058823,
"completions/min_terminated_length": 63.94117647058823,
"epoch": 0.6900389538119087,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.30611801147461,
"kl": 1.3920823442935943,
"learning_rate": 2.0132028226724335e-06,
"loss": 0.4035,
"num_tokens": 10083867.0,
"reward": 3.71955924875596,
"reward_std": 5.790389762205236,
"rewards/RewardModelWrapper/mean": 3.71955924875596,
"rewards/RewardModelWrapper/std": 6.5407993653241325,
"step": 3100
},
{
"clip_ratio/high_max": 0.02239516925183125,
"clip_ratio/high_mean": 0.02239516925183125,
"clip_ratio/low_mean": 0.010940310020523612,
"clip_ratio/low_min": 0.010940310020523612,
"clip_ratio/region_mean": 0.03333547928952612,
"completions/clipped_ratio": 0.955078125,
"completions/max_length": 128.0,
"completions/max_terminated_length": 99.875,
"completions/mean_length": 126.2353515625,
"completions/mean_terminated_length": 82.07812547683716,
"completions/min_length": 70.75,
"completions/min_terminated_length": 62.75,
"epoch": 0.7011686143572621,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.098145484924316,
"kl": 1.332285776436329,
"learning_rate": 1.996130207147735e-06,
"loss": 0.3818,
"num_tokens": 10240948.0,
"reward": 3.675293631851673,
"reward_std": 5.620851904153824,
"rewards/RewardModelWrapper/mean": 3.675293631851673,
"rewards/RewardModelWrapper/std": 6.339143455028534,
"step": 3150
},
{
"clip_ratio/high_max": 0.017545219952007755,
"clip_ratio/high_mean": 0.017545219952007755,
"clip_ratio/low_mean": 0.006160206313361414,
"clip_ratio/low_min": 0.006160206313361414,
"clip_ratio/region_mean": 0.023705426228698343,
"completions/clipped_ratio": 0.9540441176470589,
"completions/max_length": 128.0,
"completions/max_terminated_length": 105.05882352941177,
"completions/mean_length": 126.03216911764706,
"completions/mean_terminated_length": 81.38235316557042,
"completions/min_length": 66.52941176470588,
"completions/min_terminated_length": 59.0,
"epoch": 0.7122982749026154,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.446596622467041,
"kl": 1.3602485132217408,
"learning_rate": 1.9790575916230366e-06,
"loss": 0.3915,
"num_tokens": 10407047.0,
"reward": 3.461222396177404,
"reward_std": 5.5388546831467576,
"rewards/RewardModelWrapper/mean": 3.461222396177404,
"rewards/RewardModelWrapper/std": 6.420014409457936,
"step": 3200
},
{
"clip_ratio/high_max": 0.01795817382866517,
"clip_ratio/high_mean": 0.01795817382866517,
"clip_ratio/low_mean": 0.008432389081281143,
"clip_ratio/low_min": 0.008432389081281143,
"clip_ratio/region_mean": 0.026390562802553176,
"completions/clipped_ratio": 0.9669117647058824,
"completions/max_length": 128.0,
"completions/max_terminated_length": 78.94117647058823,
"completions/mean_length": 126.28033088235294,
"completions/mean_terminated_length": 61.84313740449793,
"completions/min_length": 68.05882352941177,
"completions/min_terminated_length": 45.470588235294116,
"epoch": 0.7234279354479688,
"frac_reward_zero_std": 0.0,
"grad_norm": 17.92909812927246,
"kl": 1.4322891801595687,
"learning_rate": 1.9619849760983386e-06,
"loss": 0.4131,
"num_tokens": 10573736.0,
"reward": 3.6829915467430565,
"reward_std": 5.790671881507425,
"rewards/RewardModelWrapper/mean": 3.6829915467430565,
"rewards/RewardModelWrapper/std": 6.5448582032147575,
"step": 3250
},
{
"clip_ratio/high_max": 0.01961003711214289,
"clip_ratio/high_mean": 0.01961003711214289,
"clip_ratio/low_mean": 0.010123618032957893,
"clip_ratio/low_min": 0.010123618032957893,
"clip_ratio/region_mean": 0.02973365513375029,
"completions/clipped_ratio": 0.9736328125,
"completions/max_length": 128.0,
"completions/max_terminated_length": 89.75,
"completions/mean_length": 127.12890625,
"completions/mean_terminated_length": 78.04687547683716,
"completions/min_length": 88.3125,
"completions/min_terminated_length": 64.3125,
"epoch": 0.7345575959933222,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.4893033504486084,
"kl": 1.3992696887254714,
"learning_rate": 1.94491236057364e-06,
"loss": 0.407,
"num_tokens": 10732252.0,
"reward": 4.159975051879883,
"reward_std": 5.596900701522827,
"rewards/RewardModelWrapper/mean": 4.159975051879883,
"rewards/RewardModelWrapper/std": 6.406121611595154,
"step": 3300
},
{
"clip_ratio/high_max": 0.01751380935544148,
"clip_ratio/high_mean": 0.01751380935544148,
"clip_ratio/low_mean": 0.006701366908382625,
"clip_ratio/low_min": 0.006701366908382625,
"clip_ratio/region_mean": 0.02421517624054104,
"completions/clipped_ratio": 0.9632352941176471,
"completions/max_length": 128.0,
"completions/max_terminated_length": 89.11764705882354,
"completions/mean_length": 126.52113970588235,
"completions/mean_terminated_length": 75.78823538387523,
"completions/min_length": 73.58823529411765,
"completions/min_terminated_length": 58.529411764705884,
"epoch": 0.7456872565386756,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.024867057800293,
"kl": 1.48705244243145,
"learning_rate": 1.9278397450489416e-06,
"loss": 0.4302,
"num_tokens": 10898899.0,
"reward": 4.022455299601836,
"reward_std": 6.009893417358398,
"rewards/RewardModelWrapper/mean": 4.022455299601836,
"rewards/RewardModelWrapper/std": 6.55277754278744,
"step": 3350
},
{
"clip_ratio/high_max": 0.020205343069974332,
"clip_ratio/high_mean": 0.020205343069974332,
"clip_ratio/low_mean": 0.008244332130707334,
"clip_ratio/low_min": 0.008244332130707334,
"clip_ratio/region_mean": 0.028449675207957624,
"completions/clipped_ratio": 0.953125,
"completions/max_length": 128.0,
"completions/max_terminated_length": 83.52941176470588,
"completions/mean_length": 125.67463235294117,
"completions/mean_terminated_length": 65.16414619894589,
"completions/min_length": 67.76470588235294,
"completions/min_terminated_length": 45.1764705882353,
"epoch": 0.756816917084029,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.548982620239258,
"kl": 1.4358280056715012,
"learning_rate": 1.910767129524243e-06,
"loss": 0.4127,
"num_tokens": 11065097.0,
"reward": 3.6614036700304817,
"reward_std": 5.941182669471292,
"rewards/RewardModelWrapper/mean": 3.6614036700304817,
"rewards/RewardModelWrapper/std": 6.68101375243243,
"step": 3400
},
{
"clip_ratio/high_max": 0.018692465843632818,
"clip_ratio/high_mean": 0.018692465843632818,
"clip_ratio/low_mean": 0.008573709986812901,
"clip_ratio/low_min": 0.008573709986812901,
"clip_ratio/region_mean": 0.02726617576321587,
"completions/clipped_ratio": 0.9541015625,
"completions/max_length": 128.0,
"completions/max_terminated_length": 98.8125,
"completions/mean_length": 126.1083984375,
"completions/mean_terminated_length": 77.7172622680664,
"completions/min_length": 63.125,
"completions/min_terminated_length": 55.125,
"epoch": 0.7679465776293823,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.6161601543426514,
"kl": 1.4051894819736481,
"learning_rate": 1.8936945139995447e-06,
"loss": 0.4055,
"num_tokens": 11221336.0,
"reward": 2.737824946641922,
"reward_std": 6.139679282903671,
"rewards/RewardModelWrapper/mean": 2.737824946641922,
"rewards/RewardModelWrapper/std": 6.881059348583221,
"step": 3450
},
{
"clip_ratio/high_max": 0.019909201117698103,
"clip_ratio/high_mean": 0.019909201117698103,
"clip_ratio/low_mean": 0.009944785697734914,
"clip_ratio/low_min": 0.009944785697734914,
"clip_ratio/region_mean": 0.029853986804373563,
"completions/clipped_ratio": 0.9733455882352942,
"completions/max_length": 128.0,
"completions/max_terminated_length": 91.88235294117646,
"completions/mean_length": 127.20036764705883,
"completions/mean_terminated_length": 86.0049025591682,
"completions/min_length": 94.52941176470588,
"completions/min_terminated_length": 79.47058823529412,
"epoch": 0.7790762381747357,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.78251314163208,
"kl": 1.4502297604084016,
"learning_rate": 1.8766218984748462e-06,
"loss": 0.4266,
"num_tokens": 11389018.0,
"reward": 4.499273047727697,
"reward_std": 5.489500326268813,
"rewards/RewardModelWrapper/mean": 4.499273047727697,
"rewards/RewardModelWrapper/std": 6.2598629839280076,
"step": 3500
},
{
"clip_ratio/high_max": 0.01706919132906478,
"clip_ratio/high_mean": 0.01706919132906478,
"clip_ratio/low_mean": 0.007432717043848243,
"clip_ratio/low_min": 0.007432717043848243,
"clip_ratio/region_mean": 0.024501908438978717,
"completions/clipped_ratio": 0.9568014705882353,
"completions/max_length": 128.0,
"completions/max_terminated_length": 91.47058823529412,
"completions/mean_length": 126.015625,
"completions/mean_terminated_length": 74.68823646096622,
"completions/min_length": 61.76470588235294,
"completions/min_terminated_length": 54.23529411764706,
"epoch": 0.7902058987200891,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.431344509124756,
"kl": 1.4172208327054978,
"learning_rate": 1.859549282950148e-06,
"loss": 0.4053,
"num_tokens": 11555371.0,
"reward": 3.9203204547657684,
"reward_std": 5.879987856921027,
"rewards/RewardModelWrapper/mean": 3.9203204547657684,
"rewards/RewardModelWrapper/std": 6.654794917387121,
"step": 3550
},
{
"clip_ratio/high_max": 0.017978638106724246,
"clip_ratio/high_mean": 0.017978638106724246,
"clip_ratio/low_mean": 0.008542120530910325,
"clip_ratio/low_min": 0.008542120530910325,
"clip_ratio/region_mean": 0.02652075860532932,
"completions/clipped_ratio": 0.95703125,
"completions/max_length": 128.0,
"completions/max_terminated_length": 105.0625,
"completions/mean_length": 126.1298828125,
"completions/mean_terminated_length": 88.37500047683716,
"completions/min_length": 68.9375,
"completions/min_terminated_length": 68.9375,
"epoch": 0.8013355592654424,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.334597110748291,
"kl": 1.357377045750618,
"learning_rate": 1.8424766674254495e-06,
"loss": 0.39,
"num_tokens": 11712544.0,
"reward": 3.334804505109787,
"reward_std": 6.004520118236542,
"rewards/RewardModelWrapper/mean": 3.334804505109787,
"rewards/RewardModelWrapper/std": 6.608620345592499,
"step": 3600
},
{
"clip_ratio/high_max": 0.01815531796310097,
"clip_ratio/high_mean": 0.01815531796310097,
"clip_ratio/low_mean": 0.00551853927434422,
"clip_ratio/low_min": 0.00551853927434422,
"clip_ratio/region_mean": 0.023673857206013053,
"completions/clipped_ratio": 0.9586397058823529,
"completions/max_length": 128.0,
"completions/max_terminated_length": 106.0,
"completions/mean_length": 126.57444852941177,
"completions/mean_terminated_length": 91.43627570657169,
"completions/min_length": 73.05882352941177,
"completions/min_terminated_length": 73.05882352941177,
"epoch": 0.8124652198107958,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.32098913192749,
"kl": 1.3787575218081474,
"learning_rate": 1.825404051900751e-06,
"loss": 0.406,
"num_tokens": 11879329.0,
"reward": 4.733595371246338,
"reward_std": 5.286141087027157,
"rewards/RewardModelWrapper/mean": 4.733595371246338,
"rewards/RewardModelWrapper/std": 6.089175813338336,
"step": 3650
},
{
"clip_ratio/high_max": 0.018292159989941867,
"clip_ratio/high_mean": 0.018292159989941867,
"clip_ratio/low_mean": 0.00964461057272274,
"clip_ratio/low_min": 0.00964461057272274,
"clip_ratio/region_mean": 0.027936770617961883,
"completions/clipped_ratio": 0.9347426470588235,
"completions/max_length": 128.0,
"completions/max_terminated_length": 100.94117647058823,
"completions/mean_length": 125.18014705882354,
"completions/mean_terminated_length": 79.63531673655791,
"completions/min_length": 62.8235294117647,
"completions/min_terminated_length": 55.294117647058826,
"epoch": 0.8235948803561491,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.81167984008789,
"kl": 1.3568527114391327,
"learning_rate": 1.8083314363760528e-06,
"loss": 0.3856,
"num_tokens": 12044285.0,
"reward": 3.853144645690918,
"reward_std": 5.8185105744530174,
"rewards/RewardModelWrapper/mean": 3.853144645690918,
"rewards/RewardModelWrapper/std": 6.648196416742661,
"step": 3700
},
{
"clip_ratio/high_max": 0.020421573969069868,
"clip_ratio/high_mean": 0.020421573969069868,
"clip_ratio/low_mean": 0.006358395353017841,
"clip_ratio/low_min": 0.006358395353017841,
"clip_ratio/region_mean": 0.02677996931830421,
"completions/clipped_ratio": 0.966796875,
"completions/max_length": 128.0,
"completions/max_terminated_length": 85.4375,
"completions/mean_length": 126.6142578125,
"completions/mean_terminated_length": 75.74791765213013,
"completions/min_length": 65.0,
"completions/min_terminated_length": 65.0,
"epoch": 0.8347245409015025,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.919222593307495,
"kl": 1.4361850446462632,
"learning_rate": 1.7912588208513545e-06,
"loss": 0.4195,
"num_tokens": 12201530.0,
"reward": 4.583240419626236,
"reward_std": 5.661596119403839,
"rewards/RewardModelWrapper/mean": 4.583240419626236,
"rewards/RewardModelWrapper/std": 6.355997741222382,
"step": 3750
},
{
"clip_ratio/high_max": 0.01899803020292893,
"clip_ratio/high_mean": 0.01899803020292893,
"clip_ratio/low_mean": 0.005853212493821047,
"clip_ratio/low_min": 0.005853212493821047,
"clip_ratio/region_mean": 0.02485124268569052,
"completions/clipped_ratio": 0.9485294117647058,
"completions/max_length": 128.0,
"completions/max_terminated_length": 106.41176470588235,
"completions/mean_length": 125.8529411764706,
"completions/mean_terminated_length": 86.47465066348805,
"completions/min_length": 63.11764705882353,
"completions/min_terminated_length": 63.11764705882353,
"epoch": 0.8458542014468559,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.776216983795166,
"kl": 1.4694108253717422,
"learning_rate": 1.7741862053266563e-06,
"loss": 0.4258,
"num_tokens": 12367226.0,
"reward": 4.691084188573501,
"reward_std": 5.392301559448242,
"rewards/RewardModelWrapper/mean": 4.691084188573501,
"rewards/RewardModelWrapper/std": 6.076854313121123,
"step": 3800
},
{
"clip_ratio/high_max": 0.020900118886493145,
"clip_ratio/high_mean": 0.020900118886493145,
"clip_ratio/low_mean": 0.008081750934943557,
"clip_ratio/low_min": 0.008081750934943557,
"clip_ratio/region_mean": 0.028981869909912347,
"completions/clipped_ratio": 0.9733455882352942,
"completions/max_length": 128.0,
"completions/max_terminated_length": 76.76470588235294,
"completions/mean_length": 126.63051470588235,
"completions/mean_terminated_length": 62.98235298605526,
"completions/min_length": 81.41176470588235,
"completions/min_terminated_length": 51.294117647058826,
"epoch": 0.8569838619922092,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.310102462768555,
"kl": 1.3774869224429132,
"learning_rate": 1.7571135898019578e-06,
"loss": 0.398,
"num_tokens": 12534040.0,
"reward": 3.871620360542746,
"reward_std": 5.696767147849588,
"rewards/RewardModelWrapper/mean": 3.871620360542746,
"rewards/RewardModelWrapper/std": 6.582426996792064,
"step": 3850
},
{
"clip_ratio/high_max": 0.021299479028675704,
"clip_ratio/high_mean": 0.021299479028675704,
"clip_ratio/low_mean": 0.0075305427008424885,
"clip_ratio/low_min": 0.0075305427008424885,
"clip_ratio/region_mean": 0.028830021731555463,
"completions/clipped_ratio": 0.9619140625,
"completions/max_length": 128.0,
"completions/max_terminated_length": 85.8125,
"completions/mean_length": 126.1943359375,
"completions/mean_terminated_length": 71.39270901679993,
"completions/min_length": 71.8125,
"completions/min_terminated_length": 55.8125,
"epoch": 0.8681135225375626,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.930108547210693,
"kl": 1.3843135032057763,
"learning_rate": 1.7400409742772593e-06,
"loss": 0.3964,
"num_tokens": 12690615.0,
"reward": 3.086591437458992,
"reward_std": 6.208359390497208,
"rewards/RewardModelWrapper/mean": 3.086591437458992,
"rewards/RewardModelWrapper/std": 6.8491051197052,
"step": 3900
},
{
"clip_ratio/high_max": 0.018020967768970875,
"clip_ratio/high_mean": 0.018020967768970875,
"clip_ratio/low_mean": 0.006037966601434163,
"clip_ratio/low_min": 0.006037966601434163,
"clip_ratio/region_mean": 0.024058934384956956,
"completions/clipped_ratio": 0.953125,
"completions/max_length": 128.0,
"completions/max_terminated_length": 103.76470588235294,
"completions/mean_length": 126.11305147058823,
"completions/mean_terminated_length": 84.38186331356273,
"completions/min_length": 66.11764705882354,
"completions/min_terminated_length": 58.588235294117645,
"epoch": 0.8792431830829159,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.74282169342041,
"kl": 1.432164865732193,
"learning_rate": 1.7229683587525609e-06,
"loss": 0.4134,
"num_tokens": 12857386.0,
"reward": 3.5356551899629483,
"reward_std": 5.877786804648006,
"rewards/RewardModelWrapper/mean": 3.5356551899629483,
"rewards/RewardModelWrapper/std": 6.742880484637092,
"step": 3950
},
{
"clip_ratio/high_max": 0.016392124033300207,
"clip_ratio/high_mean": 0.016392124033300207,
"clip_ratio/low_mean": 0.00735437709663529,
"clip_ratio/low_min": 0.00735437709663529,
"clip_ratio/region_mean": 0.02374650107929483,
"completions/clipped_ratio": 0.9669117647058824,
"completions/max_length": 128.0,
"completions/max_terminated_length": 95.0,
"completions/mean_length": 126.86764705882354,
"completions/mean_terminated_length": 82.9313735961914,
"completions/min_length": 84.29411764705883,
"completions/min_terminated_length": 69.23529411764706,
"epoch": 0.8903728436282693,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.2612786293029785,
"kl": 1.4968037492036819,
"learning_rate": 1.7058957432278626e-06,
"loss": 0.4371,
"num_tokens": 13025050.0,
"reward": 3.9833039676441864,
"reward_std": 5.820403575897217,
"rewards/RewardModelWrapper/mean": 3.9833039676441864,
"rewards/RewardModelWrapper/std": 6.59747979220222,
"step": 4000
},
{
"clip_ratio/high_max": 0.013542763022705913,
"clip_ratio/high_mean": 0.013542763022705913,
"clip_ratio/low_mean": 0.007844352710526437,
"clip_ratio/low_min": 0.007844352710526437,
"clip_ratio/region_mean": 0.021387115789111705,
"completions/clipped_ratio": 0.9755859375,
"completions/max_length": 128.0,
"completions/max_terminated_length": 82.0,
"completions/mean_length": 127.052734375,
"completions/mean_terminated_length": 69.609375,
"completions/min_length": 89.8125,
"completions/min_terminated_length": 57.8125,
"epoch": 0.9015025041736227,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.22656536102295,
"kl": 1.486895147562027,
"learning_rate": 1.6888231277031642e-06,
"loss": 0.4339,
"num_tokens": 13182896.0,
"reward": 3.8604883551597595,
"reward_std": 5.920006081461906,
"rewards/RewardModelWrapper/mean": 3.8604883551597595,
"rewards/RewardModelWrapper/std": 6.682152062654495,
"step": 4050
},
{
"clip_ratio/high_max": 0.017551230599638076,
"clip_ratio/high_mean": 0.017551230599638076,
"clip_ratio/low_mean": 0.006257881603378337,
"clip_ratio/low_min": 0.006257881603378337,
"clip_ratio/region_mean": 0.023809112217277287,
"completions/clipped_ratio": 0.9604779411764706,
"completions/max_length": 128.0,
"completions/max_terminated_length": 103.47058823529412,
"completions/mean_length": 126.76011029411765,
"completions/mean_terminated_length": 90.72815165800206,
"completions/min_length": 81.47058823529412,
"completions/min_terminated_length": 73.94117647058823,
"epoch": 0.9126321647189761,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.342564105987549,
"kl": 1.479159579873085,
"learning_rate": 1.6717505121784657e-06,
"loss": 0.434,
"num_tokens": 13349539.0,
"reward": 3.891232869204353,
"reward_std": 5.906516776365392,
"rewards/RewardModelWrapper/mean": 3.891232869204353,
"rewards/RewardModelWrapper/std": 6.87522164513083,
"step": 4100
},
{
"clip_ratio/high_max": 0.017731820455519482,
"clip_ratio/high_mean": 0.017731820455519482,
"clip_ratio/low_mean": 0.0037902081329957583,
"clip_ratio/low_min": 0.0037902081329957583,
"clip_ratio/region_mean": 0.021522028532344847,
"completions/clipped_ratio": 0.9632352941176471,
"completions/max_length": 128.0,
"completions/max_terminated_length": 102.11764705882354,
"completions/mean_length": 126.3373161764706,
"completions/mean_terminated_length": 80.36274584601907,
"completions/min_length": 61.588235294117645,
"completions/min_terminated_length": 54.05882352941177,
"epoch": 0.9237618252643295,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.095427513122559,
"kl": 1.5771301573514938,
"learning_rate": 1.6546778966537674e-06,
"loss": 0.4627,
"num_tokens": 13516058.0,
"reward": 4.4532030890969665,
"reward_std": 5.776824221891515,
"rewards/RewardModelWrapper/mean": 4.4532030890969665,
"rewards/RewardModelWrapper/std": 6.367258969475241,
"step": 4150
},
{
"clip_ratio/high_max": 0.018110398813150824,
"clip_ratio/high_mean": 0.018110398813150824,
"clip_ratio/low_mean": 0.006745649516233243,
"clip_ratio/low_min": 0.006745649516233243,
"clip_ratio/region_mean": 0.024856048391666264,
"completions/clipped_ratio": 0.9609375,
"completions/max_length": 128.0,
"completions/max_terminated_length": 93.9375,
"completions/mean_length": 126.3310546875,
"completions/mean_terminated_length": 81.33363127708435,
"completions/min_length": 74.75,
"completions/min_terminated_length": 66.75,
"epoch": 0.9348914858096828,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.7098264694213867,
"kl": 1.4337137299776077,
"learning_rate": 1.637605281129069e-06,
"loss": 0.4139,
"num_tokens": 13673549.0,
"reward": 3.717156395316124,
"reward_std": 5.887754291296005,
"rewards/RewardModelWrapper/mean": 3.717156395316124,
"rewards/RewardModelWrapper/std": 6.542896807193756,
"step": 4200
},
{
"clip_ratio/high_max": 0.01609679988003336,
"clip_ratio/high_mean": 0.01609679988003336,
"clip_ratio/low_mean": 0.006251108425203711,
"clip_ratio/low_min": 0.006251108425203711,
"clip_ratio/region_mean": 0.022347908235387876,
"completions/clipped_ratio": 0.9466911764705882,
"completions/max_length": 128.0,
"completions/max_terminated_length": 108.3529411764706,
"completions/mean_length": 126.234375,
"completions/mean_terminated_length": 89.88039308435776,
"completions/min_length": 68.6470588235294,
"completions/min_terminated_length": 61.11764705882353,
"epoch": 0.9460211463550362,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.461524963378906,
"kl": 1.4435530692338943,
"learning_rate": 1.6205326656043705e-06,
"loss": 0.4174,
"num_tokens": 13839820.0,
"reward": 3.5667920813840976,
"reward_std": 5.679576621336095,
"rewards/RewardModelWrapper/mean": 3.5667920813840976,
"rewards/RewardModelWrapper/std": 6.743907311383416,
"step": 4250
},
{
"clip_ratio/high_max": 0.01591621272964403,
"clip_ratio/high_mean": 0.01591621272964403,
"clip_ratio/low_mean": 0.005297647488187067,
"clip_ratio/low_min": 0.005297647488187067,
"clip_ratio/region_mean": 0.021213860225398094,
"completions/clipped_ratio": 0.9669117647058824,
"completions/max_length": 128.0,
"completions/max_terminated_length": 100.76470588235294,
"completions/mean_length": 126.63602941176471,
"completions/mean_terminated_length": 87.36666780359604,
"completions/min_length": 71.0,
"completions/min_terminated_length": 71.0,
"epoch": 0.9571508069003896,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.245085716247559,
"kl": 1.4910035210847854,
"learning_rate": 1.603460050079672e-06,
"loss": 0.4312,
"num_tokens": 14007144.0,
"reward": 4.034057981827679,
"reward_std": 5.743304505067713,
"rewards/RewardModelWrapper/mean": 4.034057981827679,
"rewards/RewardModelWrapper/std": 6.6319817094241875,
"step": 4300
},
{
"clip_ratio/high_max": 0.0162072420923505,
"clip_ratio/high_mean": 0.0162072420923505,
"clip_ratio/low_mean": 0.00646918074140558,
"clip_ratio/low_min": 0.00646918074140558,
"clip_ratio/region_mean": 0.022676422880031168,
"completions/clipped_ratio": 0.9560546875,
"completions/max_length": 128.0,
"completions/max_terminated_length": 97.0,
"completions/mean_length": 126.1572265625,
"completions/mean_terminated_length": 80.61093807220459,
"completions/min_length": 69.625,
"completions/min_terminated_length": 61.625,
"epoch": 0.9682804674457429,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.9271068572998047,
"kl": 1.5564635121822357,
"learning_rate": 1.5863874345549738e-06,
"loss": 0.4481,
"num_tokens": 14163497.0,
"reward": 4.418118596076965,
"reward_std": 5.663649529218674,
"rewards/RewardModelWrapper/mean": 4.418118596076965,
"rewards/RewardModelWrapper/std": 6.5488221347332,
"step": 4350
},
{
"clip_ratio/high_max": 0.015229720452334733,
"clip_ratio/high_mean": 0.015229720452334733,
"clip_ratio/low_mean": 0.005334880515874829,
"clip_ratio/low_min": 0.005334880515874829,
"clip_ratio/region_mean": 0.020564600981306285,
"completions/clipped_ratio": 0.9411764705882353,
"completions/max_length": 128.0,
"completions/max_terminated_length": 101.0,
"completions/mean_length": 124.7876838235294,
"completions/mean_terminated_length": 74.75882474113914,
"completions/min_length": 50.35294117647059,
"completions/min_terminated_length": 50.35294117647059,
"epoch": 0.9794101279910963,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.419699192047119,
"kl": 1.4550988680124284,
"learning_rate": 1.5693148190302755e-06,
"loss": 0.4187,
"num_tokens": 14328034.0,
"reward": 3.752244500552907,
"reward_std": 5.818949124392341,
"rewards/RewardModelWrapper/mean": 3.752244500552907,
"rewards/RewardModelWrapper/std": 6.797629524679745,
"step": 4400
},
{
"clip_ratio/high_max": 0.018021058345912024,
"clip_ratio/high_mean": 0.018021058345912024,
"clip_ratio/low_mean": 0.0030438171711284667,
"clip_ratio/low_min": 0.0030438171711284667,
"clip_ratio/region_mean": 0.021064875536831097,
"completions/clipped_ratio": 0.9549632352941176,
"completions/max_length": 128.0,
"completions/max_terminated_length": 96.23529411764706,
"completions/mean_length": 126.15900735294117,
"completions/mean_terminated_length": 79.46218647676356,
"completions/min_length": 68.05882352941177,
"completions/min_terminated_length": 60.529411764705884,
"epoch": 0.9905397885364496,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.946506977081299,
"kl": 1.4544246417284012,
"learning_rate": 1.5522422035055773e-06,
"loss": 0.4204,
"num_tokens": 14494631.0,
"reward": 3.687691057429594,
"reward_std": 5.869795238270479,
"rewards/RewardModelWrapper/mean": 3.687691057429594,
"rewards/RewardModelWrapper/std": 6.839460316826315,
"step": 4450
},
{
"epoch": 0.9996661101836394,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.9438291139240507,
"eval_completions/max_length": 128.0,
"eval_completions/max_terminated_length": 53.129746835443036,
"eval_completions/mean_length": 125.40446993670886,
"eval_completions/mean_terminated_length": 48.271835556513146,
"eval_completions/min_length": 96.05696202531645,
"eval_completions/min_terminated_length": 43.39873417721519,
"eval_frac_reward_zero_std": 0.0,
"eval_kl": 1.4363023352019395,
"eval_loss": 0.41118884086608887,
"eval_num_tokens": 14622004.0,
"eval_reward": 3.463206129738047,
"eval_reward_std": 6.040495253722124,
"eval_rewards/RewardModelWrapper/mean": 3.463206129738047,
"eval_rewards/RewardModelWrapper/std": 6.557550964476187,
"eval_runtime": 1430.6223,
"eval_samples_per_second": 0.441,
"eval_steps_per_second": 0.028,
"step": 4491
}
],
"logging_steps": 50,
"max_steps": 8986,
"num_input_tokens_seen": 14622004,
"num_train_epochs": 2,
"save_steps": 2696,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}