HidatoQwenModel / last-checkpoint /trainer_state.json
michlea's picture
Training in progress, step 150, checkpoint
d831f18 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.01,
"eval_steps": 500,
"global_step": 150,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 313.75,
"completions/clipped_ratio": 0.0,
"completions/max_length": 346.0,
"completions/max_terminated_length": 346.0,
"completions/mean_length": 313.75,
"completions/mean_terminated_length": 313.75,
"completions/min_length": 302.0,
"completions/min_terminated_length": 302.0,
"epoch": 6.666666666666667e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6239553689956665,
"kl": 0.28753748536109924,
"learning_rate": 0.0,
"loss": 0.0003,
"num_tokens": 4438.0,
"reward": 1.6418367624282837,
"reward_std": 0.02908739261329174,
"rewards/correctness_reward_func/mean": 1.1418367624282837,
"rewards/correctness_reward_func/std": 0.029087385162711143,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 1
},
{
"completion_length": 125.875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 199.0,
"completions/max_terminated_length": 199.0,
"completions/mean_length": 125.875,
"completions/mean_terminated_length": 125.875,
"completions/min_length": 107.0,
"completions/min_terminated_length": 107.0,
"epoch": 0.00013333333333333334,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0035445222165435553,
"kl": 0.6253855228424072,
"learning_rate": 3.3333333333333335e-07,
"loss": 0.0006,
"num_tokens": 6677.0,
"reward": 1.774999976158142,
"reward_std": 0.0,
"rewards/correctness_reward_func/mean": 1.274999976158142,
"rewards/correctness_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 2
},
{
"completion_length": 240.375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 257.0,
"completions/max_terminated_length": 257.0,
"completions/mean_length": 240.375,
"completions/mean_terminated_length": 240.375,
"completions/min_length": 232.0,
"completions/min_terminated_length": 232.0,
"epoch": 0.0002,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8441446423530579,
"kl": 0.42504680156707764,
"learning_rate": 6.666666666666667e-07,
"loss": 0.0004,
"num_tokens": 10184.0,
"reward": 1.5458333492279053,
"reward_std": 0.06408699601888657,
"rewards/correctness_reward_func/mean": 1.0458333492279053,
"rewards/correctness_reward_func/std": 0.06408701092004776,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 3
},
{
"completion_length": 111.875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 132.0,
"completions/max_terminated_length": 132.0,
"completions/mean_length": 111.875,
"completions/mean_terminated_length": 111.875,
"completions/min_length": 104.0,
"completions/min_terminated_length": 104.0,
"epoch": 0.0002666666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4565684795379639,
"kl": 0.5555254817008972,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0006,
"num_tokens": 12303.0,
"reward": 1.6624999046325684,
"reward_std": 0.0981980711221695,
"rewards/correctness_reward_func/mean": 1.1624999046325684,
"rewards/correctness_reward_func/std": 0.0981980711221695,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 4
},
{
"completion_length": 315.5,
"completions/clipped_ratio": 0.0,
"completions/max_length": 347.0,
"completions/max_terminated_length": 347.0,
"completions/mean_length": 315.5,
"completions/mean_terminated_length": 315.5,
"completions/min_length": 302.0,
"completions/min_terminated_length": 302.0,
"epoch": 0.0003333333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6569257974624634,
"kl": 0.38966104388237,
"learning_rate": 1.3333333333333334e-06,
"loss": 0.0004,
"num_tokens": 16731.0,
"reward": 1.586734652519226,
"reward_std": 0.027575301006436348,
"rewards/correctness_reward_func/mean": 1.086734652519226,
"rewards/correctness_reward_func/std": 0.02757529728114605,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 5
},
{
"completion_length": 387.5,
"completions/clipped_ratio": 0.0,
"completions/max_length": 396.0,
"completions/max_terminated_length": 396.0,
"completions/mean_length": 387.5,
"completions/mean_terminated_length": 387.5,
"completions/min_length": 379.0,
"completions/min_terminated_length": 379.0,
"epoch": 0.0004,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7828848958015442,
"kl": 0.33463814854621887,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0003,
"num_tokens": 21935.0,
"reward": 1.357812523841858,
"reward_std": 0.057355206459760666,
"rewards/correctness_reward_func/mean": 0.8578125238418579,
"rewards/correctness_reward_func/std": 0.05735520273447037,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 6
},
{
"completion_length": 332.75,
"completions/clipped_ratio": 0.0,
"completions/max_length": 377.0,
"completions/max_terminated_length": 377.0,
"completions/mean_length": 332.75,
"completions/mean_terminated_length": 332.75,
"completions/min_length": 300.0,
"completions/min_terminated_length": 300.0,
"epoch": 0.00046666666666666666,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7264121174812317,
"kl": 0.368626207113266,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0004,
"num_tokens": 26381.0,
"reward": 1.3227040767669678,
"reward_std": 0.09919165819883347,
"rewards/correctness_reward_func/mean": 0.8227040767669678,
"rewards/correctness_reward_func/std": 0.09919163584709167,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 7
},
{
"completion_length": 506.125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 537.0,
"completions/max_terminated_length": 537.0,
"completions/mean_length": 506.125,
"completions/mean_terminated_length": 506.125,
"completions/min_length": 496.0,
"completions/min_terminated_length": 496.0,
"epoch": 0.0005333333333333334,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5257914662361145,
"kl": 0.2594059407711029,
"learning_rate": 2.3333333333333336e-06,
"loss": 0.0003,
"num_tokens": 32886.0,
"reward": 1.4370369911193848,
"reward_std": 0.02962968312203884,
"rewards/correctness_reward_func/mean": 0.9370369911193848,
"rewards/correctness_reward_func/std": 0.029629632830619812,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 8
},
{
"completion_length": 515.25,
"completions/clipped_ratio": 0.0,
"completions/max_length": 523.0,
"completions/max_terminated_length": 523.0,
"completions/mean_length": 515.25,
"completions/mean_terminated_length": 515.25,
"completions/min_length": 507.0,
"completions/min_terminated_length": 507.0,
"epoch": 0.0006,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.42451590299606323,
"kl": 0.31523531675338745,
"learning_rate": 2.666666666666667e-06,
"loss": 0.0003,
"num_tokens": 39448.0,
"reward": 1.3203704357147217,
"reward_std": 0.04222872108221054,
"rewards/correctness_reward_func/mean": 0.8203703761100769,
"rewards/correctness_reward_func/std": 0.04222871735692024,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 9
},
{
"completion_length": 518.75,
"completions/clipped_ratio": 0.0,
"completions/max_length": 562.0,
"completions/max_terminated_length": 562.0,
"completions/mean_length": 518.75,
"completions/mean_terminated_length": 518.75,
"completions/min_length": 503.0,
"completions/min_terminated_length": 503.0,
"epoch": 0.0006666666666666666,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4859318137168884,
"kl": 0.3406108617782593,
"learning_rate": 3e-06,
"loss": 0.0003,
"num_tokens": 45990.0,
"reward": 1.3275463581085205,
"reward_std": 0.08047376573085785,
"rewards/correctness_reward_func/mean": 0.8275462985038757,
"rewards/correctness_reward_func/std": 0.08047378063201904,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 10
},
{
"completion_length": 115.25,
"completions/clipped_ratio": 0.0,
"completions/max_length": 121.0,
"completions/max_terminated_length": 121.0,
"completions/mean_length": 115.25,
"completions/mean_terminated_length": 115.25,
"completions/min_length": 109.0,
"completions/min_terminated_length": 109.0,
"epoch": 0.0007333333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4112221002578735,
"kl": 0.6370461583137512,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0006,
"num_tokens": 48208.0,
"reward": 1.7843749523162842,
"reward_std": 0.10933034867048264,
"rewards/correctness_reward_func/mean": 1.2843749523162842,
"rewards/correctness_reward_func/std": 0.10933034867048264,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 11
},
{
"completion_length": 229.375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 241.0,
"completions/max_terminated_length": 241.0,
"completions/mean_length": 229.375,
"completions/mean_terminated_length": 229.375,
"completions/min_length": 220.0,
"completions/min_terminated_length": 220.0,
"epoch": 0.0008,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.333493947982788,
"kl": 0.4309392273426056,
"learning_rate": 3.6666666666666666e-06,
"loss": 0.0004,
"num_tokens": 51659.0,
"reward": 1.5499999523162842,
"reward_std": 0.03563487157225609,
"rewards/correctness_reward_func/mean": 1.0499999523162842,
"rewards/correctness_reward_func/std": 0.035634856671094894,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 12
},
{
"completion_length": 113.5,
"completions/clipped_ratio": 0.0,
"completions/max_length": 120.0,
"completions/max_terminated_length": 120.0,
"completions/mean_length": 113.5,
"completions/mean_terminated_length": 113.5,
"completions/min_length": 106.0,
"completions/min_terminated_length": 106.0,
"epoch": 0.0008666666666666666,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.413602352142334,
"kl": 0.5921498537063599,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0006,
"num_tokens": 53775.0,
"reward": 1.868749976158142,
"reward_std": 0.10415471345186234,
"rewards/correctness_reward_func/mean": 1.368749976158142,
"rewards/correctness_reward_func/std": 0.10415472090244293,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 13
},
{
"completion_length": 402.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 419.0,
"completions/max_terminated_length": 419.0,
"completions/mean_length": 402.0,
"completions/mean_terminated_length": 402.0,
"completions/min_length": 392.0,
"completions/min_terminated_length": 392.0,
"epoch": 0.0009333333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.926260232925415,
"kl": 4.320853233337402,
"learning_rate": 4.333333333333334e-06,
"loss": 0.0043,
"num_tokens": 59111.0,
"reward": 1.37890625,
"reward_std": 0.02542884461581707,
"rewards/correctness_reward_func/mean": 0.87890625,
"rewards/correctness_reward_func/std": 0.02542879618704319,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 14
},
{
"completion_length": 312.5,
"completions/clipped_ratio": 0.0,
"completions/max_length": 335.0,
"completions/max_terminated_length": 335.0,
"completions/mean_length": 312.5,
"completions/mean_terminated_length": 312.5,
"completions/min_length": 299.0,
"completions/min_terminated_length": 299.0,
"epoch": 0.001,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7148852944374084,
"kl": 0.2926153540611267,
"learning_rate": 4.666666666666667e-06,
"loss": 0.0003,
"num_tokens": 63443.0,
"reward": 1.436734676361084,
"reward_std": 0.058541782200336456,
"rewards/correctness_reward_func/mean": 0.936734676361084,
"rewards/correctness_reward_func/std": 0.05854179337620735,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 15
},
{
"completion_length": 111.25,
"completions/clipped_ratio": 0.0,
"completions/max_length": 118.0,
"completions/max_terminated_length": 118.0,
"completions/mean_length": 111.25,
"completions/mean_terminated_length": 111.25,
"completions/min_length": 106.0,
"completions/min_terminated_length": 106.0,
"epoch": 0.0010666666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1900497674942017,
"kl": 0.6309098601341248,
"learning_rate": 5e-06,
"loss": 0.0006,
"num_tokens": 65517.0,
"reward": 3.331249952316284,
"reward_std": 0.4772970676422119,
"rewards/correctness_reward_func/mean": 2.831249952316284,
"rewards/correctness_reward_func/std": 0.4772970974445343,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 16
},
{
"completion_length": 517.875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 584.0,
"completions/max_terminated_length": 584.0,
"completions/mean_length": 517.875,
"completions/mean_terminated_length": 517.875,
"completions/min_length": 493.0,
"completions/min_terminated_length": 493.0,
"epoch": 0.0011333333333333334,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4412200450897217,
"kl": 0.35693415999412537,
"learning_rate": 4.999323102948655e-06,
"loss": 0.0004,
"num_tokens": 72052.0,
"reward": 1.3960647583007812,
"reward_std": 0.06674329191446304,
"rewards/correctness_reward_func/mean": 0.896064817905426,
"rewards/correctness_reward_func/std": 0.06674332916736603,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 17
},
{
"completion_length": 314.625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 325.0,
"completions/max_terminated_length": 325.0,
"completions/mean_length": 314.625,
"completions/mean_terminated_length": 314.625,
"completions/min_length": 306.0,
"completions/min_terminated_length": 306.0,
"epoch": 0.0012,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7217915654182434,
"kl": 0.3315311670303345,
"learning_rate": 4.997292778346312e-06,
"loss": 0.0003,
"num_tokens": 76481.0,
"reward": 1.540816307067871,
"reward_std": 0.062436942011117935,
"rewards/correctness_reward_func/mean": 1.040816307067871,
"rewards/correctness_reward_func/std": 0.062436968088150024,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 18
},
{
"completion_length": 77.875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 116.0,
"completions/max_terminated_length": 116.0,
"completions/mean_length": 77.875,
"completions/mean_terminated_length": 77.875,
"completions/min_length": 63.0,
"completions/min_terminated_length": 63.0,
"epoch": 0.0012666666666666666,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.021702434867620468,
"kl": 1.1031895875930786,
"learning_rate": 4.993910125649561e-06,
"loss": 0.0011,
"num_tokens": 78136.0,
"reward": 1.8666666746139526,
"reward_std": 0.0,
"rewards/correctness_reward_func/mean": 1.3666666746139526,
"rewards/correctness_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 19
},
{
"completion_length": 235.5,
"completions/clipped_ratio": 0.0,
"completions/max_length": 247.0,
"completions/max_terminated_length": 247.0,
"completions/mean_length": 235.5,
"completions/mean_terminated_length": 235.5,
"completions/min_length": 227.0,
"completions/min_terminated_length": 227.0,
"epoch": 0.0013333333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0724592208862305,
"kl": 0.3968948423862457,
"learning_rate": 4.989176976624511e-06,
"loss": 0.0004,
"num_tokens": 81660.0,
"reward": 1.5416666269302368,
"reward_std": 0.02357025258243084,
"rewards/correctness_reward_func/mean": 1.0416666269302368,
"rewards/correctness_reward_func/std": 0.023570258170366287,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 20
},
{
"completion_length": 327.375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 361.0,
"completions/max_terminated_length": 361.0,
"completions/mean_length": 327.375,
"completions/mean_terminated_length": 327.375,
"completions/min_length": 303.0,
"completions/min_terminated_length": 303.0,
"epoch": 0.0014,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.620561420917511,
"kl": 0.3201756179332733,
"learning_rate": 4.983095894354858e-06,
"loss": 0.0003,
"num_tokens": 86247.0,
"reward": 1.4596939086914062,
"reward_std": 0.07869534194469452,
"rewards/correctness_reward_func/mean": 0.9596939086914062,
"rewards/correctness_reward_func/std": 0.07869534194469452,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 21
},
{
"completion_length": 113.5,
"completions/clipped_ratio": 0.0,
"completions/max_length": 117.0,
"completions/max_terminated_length": 117.0,
"completions/mean_length": 113.5,
"completions/mean_terminated_length": 113.5,
"completions/min_length": 101.0,
"completions/min_terminated_length": 101.0,
"epoch": 0.0014666666666666667,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0007677710382267833,
"kl": 0.603554368019104,
"learning_rate": 4.975670171853926e-06,
"loss": 0.0006,
"num_tokens": 88531.0,
"reward": 3.5,
"reward_std": 0.0,
"rewards/correctness_reward_func/mean": 3.0,
"rewards/correctness_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 22
},
{
"completion_length": 515.375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 571.0,
"completions/max_terminated_length": 571.0,
"completions/mean_length": 515.375,
"completions/mean_terminated_length": 515.375,
"completions/min_length": 493.0,
"completions/min_terminated_length": 493.0,
"epoch": 0.0015333333333333334,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5324096083641052,
"kl": 0.2494540959596634,
"learning_rate": 4.966903830281449e-06,
"loss": 0.0002,
"num_tokens": 95110.0,
"reward": 1.3530092239379883,
"reward_std": 0.1169159933924675,
"rewards/correctness_reward_func/mean": 0.8530092239379883,
"rewards/correctness_reward_func/std": 0.1169159933924675,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 23
},
{
"completion_length": 71.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 88.0,
"completions/max_terminated_length": 88.0,
"completions/mean_length": 71.0,
"completions/mean_terminated_length": 71.0,
"completions/min_length": 58.0,
"completions/min_terminated_length": 58.0,
"epoch": 0.0016,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7659872770309448,
"kl": 1.0184522867202759,
"learning_rate": 4.956801616766033e-06,
"loss": 0.001,
"num_tokens": 96894.0,
"reward": 3.484375,
"reward_std": 0.04419417306780815,
"rewards/correctness_reward_func/mean": 3.0,
"rewards/correctness_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.484375,
"rewards/xmlcount_reward_func/std": 0.04419417306780815,
"step": 24
},
{
"completion_length": 516.375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 581.0,
"completions/max_terminated_length": 581.0,
"completions/mean_length": 516.375,
"completions/mean_terminated_length": 516.375,
"completions/min_length": 492.0,
"completions/min_terminated_length": 492.0,
"epoch": 0.0016666666666666668,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3833364248275757,
"kl": 0.24871811270713806,
"learning_rate": 4.9453690018345144e-06,
"loss": 0.0002,
"num_tokens": 103473.0,
"reward": 1.260879635810852,
"reward_std": 0.07416288554668427,
"rewards/correctness_reward_func/mean": 0.760879635810852,
"rewards/correctness_reward_func/std": 0.07416289299726486,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 25
},
{
"completion_length": 66.625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 73.0,
"completions/max_terminated_length": 73.0,
"completions/mean_length": 66.625,
"completions/mean_terminated_length": 66.625,
"completions/min_length": 60.0,
"completions/min_terminated_length": 60.0,
"epoch": 0.0017333333333333333,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.006470646243542433,
"kl": 1.0391145944595337,
"learning_rate": 4.93261217644956e-06,
"loss": 0.001,
"num_tokens": 105222.0,
"reward": 3.5,
"reward_std": 0.0,
"rewards/correctness_reward_func/mean": 3.0,
"rewards/correctness_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 26
},
{
"completion_length": 332.75,
"completions/clipped_ratio": 0.0,
"completions/max_length": 407.0,
"completions/max_terminated_length": 407.0,
"completions/mean_length": 332.75,
"completions/mean_terminated_length": 332.75,
"completions/min_length": 314.0,
"completions/min_terminated_length": 314.0,
"epoch": 0.0018,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9204102754592896,
"kl": 0.35308611392974854,
"learning_rate": 4.91853804865716e-06,
"loss": 0.0004,
"num_tokens": 109892.0,
"reward": 1.4581632614135742,
"reward_std": 0.051432717591524124,
"rewards/correctness_reward_func/mean": 0.9581632614135742,
"rewards/correctness_reward_func/std": 0.051432736217975616,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 27
},
{
"completion_length": 80.875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 112.0,
"completions/max_terminated_length": 112.0,
"completions/mean_length": 80.875,
"completions/mean_terminated_length": 80.875,
"completions/min_length": 61.0,
"completions/min_terminated_length": 61.0,
"epoch": 0.0018666666666666666,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.02564767189323902,
"kl": 0.903938889503479,
"learning_rate": 4.903154239845798e-06,
"loss": 0.0009,
"num_tokens": 111579.0,
"reward": 2.0333333015441895,
"reward_std": 0.0,
"rewards/correctness_reward_func/mean": 1.5333333015441895,
"rewards/correctness_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 28
},
{
"completion_length": 505.625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 553.0,
"completions/max_terminated_length": 553.0,
"completions/mean_length": 505.625,
"completions/mean_terminated_length": 505.625,
"completions/min_length": 488.0,
"completions/min_terminated_length": 488.0,
"epoch": 0.0019333333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5756263732910156,
"kl": 0.296729177236557,
"learning_rate": 4.88646908061933e-06,
"loss": 0.0003,
"num_tokens": 118104.0,
"reward": 1.3613426685333252,
"reward_std": 0.06403681635856628,
"rewards/correctness_reward_func/mean": 0.8925926089286804,
"rewards/correctness_reward_func/std": 0.027431709691882133,
"rewards/xmlcount_reward_func/mean": 0.46875,
"rewards/xmlcount_reward_func/std": 0.0578637570142746,
"step": 29
},
{
"completion_length": 118.875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 143.0,
"completions/max_terminated_length": 143.0,
"completions/mean_length": 118.875,
"completions/mean_terminated_length": 118.875,
"completions/min_length": 103.0,
"completions/min_terminated_length": 103.0,
"epoch": 0.002,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0035723582841455936,
"kl": 0.6597684025764465,
"learning_rate": 4.868491606285823e-06,
"loss": 0.0007,
"num_tokens": 120335.0,
"reward": 1.850000023841858,
"reward_std": 0.0,
"rewards/correctness_reward_func/mean": 1.350000023841858,
"rewards/correctness_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 30
},
{
"completion_length": 505.875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 528.0,
"completions/max_terminated_length": 528.0,
"completions/mean_length": 505.875,
"completions/mean_terminated_length": 505.875,
"completions/min_length": 487.0,
"completions/min_terminated_length": 487.0,
"epoch": 0.0020666666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.46144646406173706,
"kl": 0.20666523277759552,
"learning_rate": 4.849231551964771e-06,
"loss": 0.0002,
"num_tokens": 126766.0,
"reward": 1.342592477798462,
"reward_std": 0.02956344559788704,
"rewards/correctness_reward_func/mean": 0.8425925970077515,
"rewards/correctness_reward_func/std": 0.0295634176582098,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 31
},
{
"completion_length": 504.875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 559.0,
"completions/max_terminated_length": 559.0,
"completions/mean_length": 504.875,
"completions/mean_terminated_length": 504.875,
"completions/min_length": 455.0,
"completions/min_terminated_length": 455.0,
"epoch": 0.0021333333333333334,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5260511040687561,
"kl": 0.22362877428531647,
"learning_rate": 4.828699347315357e-06,
"loss": 0.0002,
"num_tokens": 133381.0,
"reward": 1.3238425254821777,
"reward_std": 0.1052764430642128,
"rewards/correctness_reward_func/mean": 0.8238425850868225,
"rewards/correctness_reward_func/std": 0.10527642071247101,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 32
},
{
"completion_length": 232.875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 239.0,
"completions/max_terminated_length": 239.0,
"completions/mean_length": 232.875,
"completions/mean_terminated_length": 232.875,
"completions/min_length": 220.0,
"completions/min_terminated_length": 220.0,
"epoch": 0.0022,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2515056133270264,
"kl": 0.3806723952293396,
"learning_rate": 4.806906110888606e-06,
"loss": 0.0004,
"num_tokens": 136836.0,
"reward": 1.5499999523162842,
"reward_std": 0.03563486412167549,
"rewards/correctness_reward_func/mean": 1.0499999523162842,
"rewards/correctness_reward_func/std": 0.035634856671094894,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 33
},
{
"completion_length": 231.125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 238.0,
"completions/max_terminated_length": 238.0,
"completions/mean_length": 231.125,
"completions/mean_terminated_length": 231.125,
"completions/min_length": 225.0,
"completions/min_terminated_length": 225.0,
"epoch": 0.002266666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8692136406898499,
"kl": 0.3986778259277344,
"learning_rate": 4.783863644106502e-06,
"loss": 0.0004,
"num_tokens": 140421.0,
"reward": 1.5833333730697632,
"reward_std": 0.05909371376037598,
"rewards/correctness_reward_func/mean": 1.0833333730697632,
"rewards/correctness_reward_func/std": 0.05909368395805359,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 34
},
{
"completion_length": 112.625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 126.0,
"completions/max_terminated_length": 126.0,
"completions/mean_length": 112.625,
"completions/mean_terminated_length": 112.625,
"completions/min_length": 106.0,
"completions/min_terminated_length": 106.0,
"epoch": 0.0023333333333333335,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6767317056655884,
"kl": 0.6677060723304749,
"learning_rate": 4.759584424871302e-06,
"loss": 0.0007,
"num_tokens": 142490.0,
"reward": 1.7562499046325684,
"reward_std": 0.034718237817287445,
"rewards/correctness_reward_func/mean": 1.2562499046325684,
"rewards/correctness_reward_func/std": 0.03471822291612625,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 35
},
{
"completion_length": 111.5,
"completions/clipped_ratio": 0.0,
"completions/max_length": 124.0,
"completions/max_terminated_length": 124.0,
"completions/mean_length": 111.5,
"completions/mean_terminated_length": 111.5,
"completions/min_length": 106.0,
"completions/min_terminated_length": 106.0,
"epoch": 0.0024,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.146012544631958,
"kl": 0.5600647926330566,
"learning_rate": 4.734081600808531e-06,
"loss": 0.0006,
"num_tokens": 144622.0,
"reward": 2.4781250953674316,
"reward_std": 0.8475213050842285,
"rewards/correctness_reward_func/mean": 1.978124976158142,
"rewards/correctness_reward_func/std": 0.8475213646888733,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 36
},
{
"completion_length": 402.25,
"completions/clipped_ratio": 0.0,
"completions/max_length": 413.0,
"completions/max_terminated_length": 413.0,
"completions/mean_length": 402.25,
"completions/mean_terminated_length": 402.25,
"completions/min_length": 391.0,
"completions/min_terminated_length": 391.0,
"epoch": 0.0024666666666666665,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6264684796333313,
"kl": 0.30946826934814453,
"learning_rate": 4.707368982147318e-06,
"loss": 0.0003,
"num_tokens": 150032.0,
"reward": 1.4140625,
"reward_std": 0.06318817287683487,
"rewards/correctness_reward_func/mean": 0.9140625,
"rewards/correctness_reward_func/std": 0.06318815797567368,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 37
},
{
"completion_length": 243.25,
"completions/clipped_ratio": 0.0,
"completions/max_length": 249.0,
"completions/max_terminated_length": 249.0,
"completions/mean_length": 243.25,
"completions/mean_terminated_length": 243.25,
"completions/min_length": 234.0,
"completions/min_terminated_length": 234.0,
"epoch": 0.002533333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.272317886352539,
"kl": 0.37085089087486267,
"learning_rate": 4.679461034241906e-06,
"loss": 0.0004,
"num_tokens": 153546.0,
"reward": 1.433333396911621,
"reward_std": 0.0471404492855072,
"rewards/correctness_reward_func/mean": 0.9333332777023315,
"rewards/correctness_reward_func/std": 0.0471404492855072,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 38
},
{
"completion_length": 524.125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 565.0,
"completions/max_terminated_length": 565.0,
"completions/mean_length": 524.125,
"completions/mean_terminated_length": 524.125,
"completions/min_length": 496.0,
"completions/min_terminated_length": 496.0,
"epoch": 0.0026,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5807402729988098,
"kl": 0.2668019235134125,
"learning_rate": 4.650372869738415e-06,
"loss": 0.0003,
"num_tokens": 160147.0,
"reward": 1.3439815044403076,
"reward_std": 0.07557178288698196,
"rewards/correctness_reward_func/mean": 0.8439815044403076,
"rewards/correctness_reward_func/std": 0.07557182013988495,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 39
},
{
"completion_length": 76.875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 118.0,
"completions/max_terminated_length": 118.0,
"completions/mean_length": 76.875,
"completions/mean_terminated_length": 76.875,
"completions/min_length": 60.0,
"completions/min_terminated_length": 60.0,
"epoch": 0.0026666666666666666,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3997167348861694,
"kl": 0.9243869185447693,
"learning_rate": 4.620120240391065e-06,
"loss": 0.0009,
"num_tokens": 161978.0,
"reward": 3.299999952316284,
"reward_std": 0.5656854510307312,
"rewards/correctness_reward_func/mean": 2.799999952316284,
"rewards/correctness_reward_func/std": 0.5656854510307312,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 40
},
{
"completion_length": 521.375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 626.0,
"completions/max_terminated_length": 626.0,
"completions/mean_length": 521.375,
"completions/mean_terminated_length": 521.375,
"completions/min_length": 472.0,
"completions/min_terminated_length": 472.0,
"epoch": 0.0027333333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6085057258605957,
"kl": 0.2621612548828125,
"learning_rate": 4.588719528532342e-06,
"loss": 0.0003,
"num_tokens": 168597.0,
"reward": 1.2180554866790771,
"reward_std": 0.09554418921470642,
"rewards/correctness_reward_func/mean": 0.7180555462837219,
"rewards/correctness_reward_func/std": 0.09554421156644821,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 41
},
{
"completion_length": 409.75,
"completions/clipped_ratio": 0.0,
"completions/max_length": 472.0,
"completions/max_terminated_length": 472.0,
"completions/mean_length": 409.75,
"completions/mean_terminated_length": 409.75,
"completions/min_length": 388.0,
"completions/min_terminated_length": 388.0,
"epoch": 0.0028,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6022652983665466,
"kl": 0.27105629444122314,
"learning_rate": 4.556187738201656e-06,
"loss": 0.0003,
"num_tokens": 174019.0,
"reward": 1.3343749046325684,
"reward_std": 0.026516515761613846,
"rewards/correctness_reward_func/mean": 0.8343750238418579,
"rewards/correctness_reward_func/std": 0.026516523212194443,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 42
},
{
"completion_length": 252.25,
"completions/clipped_ratio": 0.125,
"completions/max_length": 356.0,
"completions/max_terminated_length": 356.0,
"completions/mean_length": 267.125,
"completions/mean_terminated_length": 254.4285888671875,
"completions/min_length": 232.0,
"completions/min_terminated_length": 232.0,
"epoch": 0.0028666666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8693382740020752,
"kl": 0.3942442536354065,
"learning_rate": 4.522542485937369e-06,
"loss": 0.0004,
"num_tokens": 177812.0,
"reward": 1.4541666507720947,
"reward_std": 0.024800769984722137,
"rewards/correctness_reward_func/mean": 0.9541666507720947,
"rewards/correctness_reward_func/std": 0.02480078488588333,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 43
},
{
"completion_length": 231.25,
"completions/clipped_ratio": 0.0,
"completions/max_length": 242.0,
"completions/max_terminated_length": 242.0,
"completions/mean_length": 231.25,
"completions/mean_terminated_length": 231.25,
"completions/min_length": 223.0,
"completions/min_terminated_length": 223.0,
"epoch": 0.0029333333333333334,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0458123683929443,
"kl": 0.39546874165534973,
"learning_rate": 4.48780199123712e-06,
"loss": 0.0004,
"num_tokens": 181310.0,
"reward": 1.616666555404663,
"reward_std": 0.050395265221595764,
"rewards/correctness_reward_func/mean": 1.116666555404663,
"rewards/correctness_reward_func/std": 0.05039524286985397,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 44
},
{
"completion_length": 69.625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 89.0,
"completions/max_terminated_length": 89.0,
"completions/mean_length": 69.625,
"completions/mean_terminated_length": 69.625,
"completions/min_length": 62.0,
"completions/min_terminated_length": 62.0,
"epoch": 0.003,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.004571537487208843,
"kl": 0.9226992130279541,
"learning_rate": 4.451985066691649e-06,
"loss": 0.0009,
"num_tokens": 183003.0,
"reward": 2.0333333015441895,
"reward_std": 0.0,
"rewards/correctness_reward_func/mean": 1.5333333015441895,
"rewards/correctness_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 45
},
{
"completion_length": 238.125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 269.0,
"completions/max_terminated_length": 269.0,
"completions/mean_length": 238.125,
"completions/mean_terminated_length": 238.125,
"completions/min_length": 226.0,
"completions/min_terminated_length": 226.0,
"epoch": 0.0030666666666666668,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8987017273902893,
"kl": 0.4024360477924347,
"learning_rate": 4.415111107797445e-06,
"loss": 0.0004,
"num_tokens": 186500.0,
"reward": 1.5750000476837158,
"reward_std": 0.0388321727514267,
"rewards/correctness_reward_func/mean": 1.0750000476837158,
"rewards/correctness_reward_func/std": 0.038832176476716995,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 46
},
{
"completion_length": 242.375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 321.0,
"completions/max_terminated_length": 321.0,
"completions/mean_length": 242.375,
"completions/mean_terminated_length": 242.375,
"completions/min_length": 224.0,
"completions/min_terminated_length": 224.0,
"epoch": 0.0031333333333333335,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7395060658454895,
"kl": 0.3209053874015808,
"learning_rate": 4.377200082453748e-06,
"loss": 0.0003,
"num_tokens": 190023.0,
"reward": 1.4749999046325684,
"reward_std": 0.07506612688302994,
"rewards/correctness_reward_func/mean": 0.9750000238418579,
"rewards/correctness_reward_func/std": 0.07506611198186874,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 47
},
{
"completion_length": 510.25,
"completions/clipped_ratio": 0.0,
"completions/max_length": 557.0,
"completions/max_terminated_length": 557.0,
"completions/mean_length": 510.25,
"completions/mean_terminated_length": 510.25,
"completions/min_length": 494.0,
"completions/min_terminated_length": 494.0,
"epoch": 0.0032,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.46141135692596436,
"kl": 0.288358598947525,
"learning_rate": 4.338272520149572e-06,
"loss": 0.0003,
"num_tokens": 196513.0,
"reward": 1.4010416269302368,
"reward_std": 0.053073856979608536,
"rewards/correctness_reward_func/mean": 0.9166666269302368,
"rewards/correctness_reward_func/std": 0.03259018436074257,
"rewards/xmlcount_reward_func/mean": 0.484375,
"rewards/xmlcount_reward_func/std": 0.04419417306780815,
"step": 48
},
{
"completion_length": 126.625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 200.0,
"completions/max_terminated_length": 200.0,
"completions/mean_length": 126.625,
"completions/mean_terminated_length": 126.625,
"completions/min_length": 104.0,
"completions/min_terminated_length": 104.0,
"epoch": 0.003266666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.2877793312072754,
"kl": 0.701366126537323,
"learning_rate": 4.2983495008466285e-06,
"loss": 0.0007,
"num_tokens": 198702.0,
"reward": 2.2718749046325684,
"reward_std": 0.7668858170509338,
"rewards/correctness_reward_func/mean": 1.771875023841858,
"rewards/correctness_reward_func/std": 0.7668858766555786,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 49
},
{
"completion_length": 77.125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 139.0,
"completions/max_terminated_length": 139.0,
"completions/mean_length": 77.125,
"completions/mean_terminated_length": 77.125,
"completions/min_length": 63.0,
"completions/min_terminated_length": 63.0,
"epoch": 0.0033333333333333335,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0013717930996790528,
"kl": 0.7978397011756897,
"learning_rate": 4.257452643564155e-06,
"loss": 0.0008,
"num_tokens": 200543.0,
"reward": 3.5,
"reward_std": 0.0,
"rewards/correctness_reward_func/mean": 3.0,
"rewards/correctness_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 50
},
{
"completion_length": 300.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 310.0,
"completions/max_terminated_length": 310.0,
"completions/mean_length": 300.0,
"completions/mean_terminated_length": 300.0,
"completions/min_length": 292.0,
"completions/min_terminated_length": 292.0,
"epoch": 0.0034,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9057684540748596,
"kl": 0.36814409494400024,
"learning_rate": 4.215604094671835e-06,
"loss": 0.0004,
"num_tokens": 204775.0,
"reward": 1.4673469066619873,
"reward_std": 0.03400970622897148,
"rewards/correctness_reward_func/mean": 0.9673469066619873,
"rewards/correctness_reward_func/std": 0.03400970995426178,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 51
},
{
"completion_length": 264.25,
"completions/clipped_ratio": 0.0,
"completions/max_length": 278.0,
"completions/max_terminated_length": 278.0,
"completions/mean_length": 264.25,
"completions/mean_terminated_length": 264.25,
"completions/min_length": 257.0,
"completions/min_terminated_length": 257.0,
"epoch": 0.0034666666666666665,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1917438507080078,
"kl": 0.4059743583202362,
"learning_rate": 4.172826515897146e-06,
"loss": 0.0004,
"num_tokens": 208865.0,
"reward": 1.1244897842407227,
"reward_std": 0.05225903540849686,
"rewards/correctness_reward_func/mean": 0.6244897842407227,
"rewards/correctness_reward_func/std": 0.05225902795791626,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 52
},
{
"completion_length": 82.25,
"completions/clipped_ratio": 0.0,
"completions/max_length": 128.0,
"completions/max_terminated_length": 128.0,
"completions/mean_length": 82.25,
"completions/mean_terminated_length": 82.25,
"completions/min_length": 61.0,
"completions/min_terminated_length": 61.0,
"epoch": 0.003533333333333333,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.004285034723579884,
"kl": 0.8721116781234741,
"learning_rate": 4.129143072053639e-06,
"loss": 0.0009,
"num_tokens": 210611.0,
"reward": 3.5,
"reward_std": 0.0,
"rewards/correctness_reward_func/mean": 3.0,
"rewards/correctness_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 53
},
{
"completion_length": 394.5,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 394.5,
"completions/mean_terminated_length": 394.5,
"completions/min_length": 389.0,
"completions/min_terminated_length": 389.0,
"epoch": 0.0036,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7819033861160278,
"kl": 0.23851144313812256,
"learning_rate": 4.084577418496775e-06,
"loss": 0.0002,
"num_tokens": 215887.0,
"reward": 1.44921875,
"reward_std": 0.04475096985697746,
"rewards/correctness_reward_func/mean": 0.94921875,
"rewards/correctness_reward_func/std": 0.044750988483428955,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 54
},
{
"completion_length": 309.375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 321.0,
"completions/max_terminated_length": 321.0,
"completions/mean_length": 309.375,
"completions/mean_terminated_length": 309.375,
"completions/min_length": 302.0,
"completions/min_terminated_length": 302.0,
"epoch": 0.0036666666666666666,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6779981255531311,
"kl": 0.37480688095092773,
"learning_rate": 4.039153688314146e-06,
"loss": 0.0004,
"num_tokens": 220202.0,
"reward": 1.5561224222183228,
"reward_std": 0.05771271511912346,
"rewards/correctness_reward_func/mean": 1.0561224222183228,
"rewards/correctness_reward_func/std": 0.05771271139383316,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 55
},
{
"completion_length": 124.625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 162.0,
"completions/max_terminated_length": 162.0,
"completions/mean_length": 124.625,
"completions/mean_terminated_length": 124.625,
"completions/min_length": 110.0,
"completions/min_terminated_length": 110.0,
"epoch": 0.0037333333333333333,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.004615799989551306,
"kl": 0.7520985007286072,
"learning_rate": 3.992896479256966e-06,
"loss": 0.0008,
"num_tokens": 222471.0,
"reward": 1.774999976158142,
"reward_std": 0.0,
"rewards/correctness_reward_func/mean": 1.274999976158142,
"rewards/correctness_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 56
},
{
"completion_length": 392.5,
"completions/clipped_ratio": 0.0,
"completions/max_length": 400.0,
"completions/max_terminated_length": 400.0,
"completions/mean_length": 392.5,
"completions/mean_terminated_length": 392.5,
"completions/min_length": 385.0,
"completions/min_terminated_length": 385.0,
"epoch": 0.0038,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.64808189868927,
"kl": 0.4047853946685791,
"learning_rate": 3.945830840419966e-06,
"loss": 0.0004,
"num_tokens": 227763.0,
"reward": 1.271093726158142,
"reward_std": 0.09154777228832245,
"rewards/correctness_reward_func/mean": 0.7710937261581421,
"rewards/correctness_reward_func/std": 0.09154780954122543,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 57
},
{
"completion_length": 236.375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 248.0,
"completions/max_terminated_length": 248.0,
"completions/mean_length": 236.375,
"completions/mean_terminated_length": 236.375,
"completions/min_length": 227.0,
"completions/min_terminated_length": 227.0,
"epoch": 0.0038666666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7382640242576599,
"kl": 0.40867918729782104,
"learning_rate": 3.897982258676867e-06,
"loss": 0.0004,
"num_tokens": 231302.0,
"reward": 1.649999976158142,
"reward_std": 0.05039524659514427,
"rewards/correctness_reward_func/mean": 1.149999976158142,
"rewards/correctness_reward_func/std": 0.05039524286985397,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 58
},
{
"completion_length": 490.75,
"completions/clipped_ratio": 0.0,
"completions/max_length": 495.0,
"completions/max_terminated_length": 495.0,
"completions/mean_length": 490.75,
"completions/mean_terminated_length": 490.75,
"completions/min_length": 486.0,
"completions/min_terminated_length": 486.0,
"epoch": 0.003933333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.47874385118484497,
"kl": 0.26013773679733276,
"learning_rate": 3.849376644878783e-06,
"loss": 0.0003,
"num_tokens": 237692.0,
"reward": 1.3481481075286865,
"reward_std": 0.036288779228925705,
"rewards/correctness_reward_func/mean": 0.8481481075286865,
"rewards/correctness_reward_func/std": 0.03628873825073242,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 59
},
{
"completion_length": 220.25,
"completions/clipped_ratio": 0.0,
"completions/max_length": 235.0,
"completions/max_terminated_length": 235.0,
"completions/mean_length": 220.25,
"completions/mean_terminated_length": 220.25,
"completions/min_length": 211.0,
"completions/min_terminated_length": 211.0,
"epoch": 0.004,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9544376730918884,
"kl": 0.49662235379219055,
"learning_rate": 3.8000403198230385e-06,
"loss": 0.0005,
"num_tokens": 241198.0,
"reward": 1.5499999523162842,
"reward_std": 0.039840973913669586,
"rewards/correctness_reward_func/mean": 1.0499999523162842,
"rewards/correctness_reward_func/std": 0.039840973913669586,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 60
},
{
"completion_length": 393.875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 399.0,
"completions/max_terminated_length": 399.0,
"completions/mean_length": 393.875,
"completions/mean_terminated_length": 393.875,
"completions/min_length": 388.0,
"completions/min_terminated_length": 388.0,
"epoch": 0.004066666666666666,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5445326566696167,
"kl": 0.28289133310317993,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0003,
"num_tokens": 246621.0,
"reward": 1.498437523841858,
"reward_std": 0.024032628163695335,
"rewards/correctness_reward_func/mean": 0.9984374642372131,
"rewards/correctness_reward_func/std": 0.024032630026340485,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 61
},
{
"completion_length": 525.75,
"completions/clipped_ratio": 0.0,
"completions/max_length": 931.0,
"completions/max_terminated_length": 931.0,
"completions/mean_length": 525.75,
"completions/mean_terminated_length": 525.75,
"completions/min_length": 449.0,
"completions/min_terminated_length": 449.0,
"epoch": 0.0041333333333333335,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5638792514801025,
"kl": 0.28673431277275085,
"learning_rate": 3.699282783125616e-06,
"loss": 0.0003,
"num_tokens": 253339.0,
"reward": 1.1914352178573608,
"reward_std": 0.10863616317510605,
"rewards/correctness_reward_func/mean": 0.6914352178573608,
"rewards/correctness_reward_func/std": 0.10863618552684784,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 62
},
{
"completion_length": 114.5,
"completions/clipped_ratio": 0.0,
"completions/max_length": 157.0,
"completions/max_terminated_length": 157.0,
"completions/mean_length": 114.5,
"completions/mean_terminated_length": 114.5,
"completions/min_length": 102.0,
"completions/min_terminated_length": 102.0,
"epoch": 0.0042,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2769147157669067,
"kl": 0.7663796544075012,
"learning_rate": 3.6479161334675294e-06,
"loss": 0.0008,
"num_tokens": 255431.0,
"reward": 1.7281250953674316,
"reward_std": 0.05580177903175354,
"rewards/correctness_reward_func/mean": 1.2281250953674316,
"rewards/correctness_reward_func/std": 0.05580177158117294,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 63
},
{
"completion_length": 232.375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 246.0,
"completions/max_terminated_length": 246.0,
"completions/mean_length": 232.375,
"completions/mean_terminated_length": 232.375,
"completions/min_length": 218.0,
"completions/min_terminated_length": 218.0,
"epoch": 0.004266666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7423648834228516,
"kl": 0.4253622889518738,
"learning_rate": 3.595927866972694e-06,
"loss": 0.0004,
"num_tokens": 259026.0,
"reward": 1.5833332538604736,
"reward_std": 0.03984096646308899,
"rewards/correctness_reward_func/mean": 1.0833332538604736,
"rewards/correctness_reward_func/std": 0.039840951561927795,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 64
},
{
"completion_length": 390.875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 396.0,
"completions/max_terminated_length": 396.0,
"completions/mean_length": 390.875,
"completions/mean_terminated_length": 390.875,
"completions/min_length": 380.0,
"completions/min_terminated_length": 380.0,
"epoch": 0.004333333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6674738526344299,
"kl": 0.24484872817993164,
"learning_rate": 3.543346136204545e-06,
"loss": 0.0002,
"num_tokens": 264257.0,
"reward": 1.482031226158142,
"reward_std": 0.03741618990898132,
"rewards/correctness_reward_func/mean": 0.9820312261581421,
"rewards/correctness_reward_func/std": 0.037416212260723114,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 65
},
{
"completion_length": 457.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 502.0,
"completions/max_terminated_length": 502.0,
"completions/mean_length": 457.0,
"completions/mean_terminated_length": 457.0,
"completions/min_length": 434.0,
"completions/min_terminated_length": 434.0,
"epoch": 0.0044,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6465625166893005,
"kl": 0.7725451588630676,
"learning_rate": 3.4901994150978926e-06,
"loss": 0.0008,
"num_tokens": 270425.0,
"reward": 1.0481481552124023,
"reward_std": 0.20149599015712738,
"rewards/correctness_reward_func/mean": 0.5481481552124023,
"rewards/correctness_reward_func/std": 0.20149599015712738,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 66
},
{
"completion_length": 130.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 254.0,
"completions/max_terminated_length": 254.0,
"completions/mean_length": 130.0,
"completions/mean_terminated_length": 130.0,
"completions/min_length": 102.0,
"completions/min_terminated_length": 102.0,
"epoch": 0.0044666666666666665,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4314974546432495,
"kl": 0.6890926361083984,
"learning_rate": 3.436516483539781e-06,
"loss": 0.0007,
"num_tokens": 272641.0,
"reward": 2.3375000953674316,
"reward_std": 0.7304597496986389,
"rewards/correctness_reward_func/mean": 1.837499976158142,
"rewards/correctness_reward_func/std": 0.7304597496986389,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 67
},
{
"completion_length": 98.375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 163.0,
"completions/max_terminated_length": 163.0,
"completions/mean_length": 98.375,
"completions/mean_terminated_length": 98.375,
"completions/min_length": 63.0,
"completions/min_terminated_length": 63.0,
"epoch": 0.004533333333333334,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.40536630153656,
"kl": 0.7224768400192261,
"learning_rate": 3.3823264117846722e-06,
"loss": 0.0007,
"num_tokens": 274532.0,
"reward": 3.299999952316284,
"reward_std": 0.5656854510307312,
"rewards/correctness_reward_func/mean": 2.799999952316284,
"rewards/correctness_reward_func/std": 0.5656854510307312,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 68
},
{
"completion_length": 109.875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 121.0,
"completions/max_terminated_length": 121.0,
"completions/mean_length": 109.875,
"completions/mean_terminated_length": 109.875,
"completions/min_length": 103.0,
"completions/min_terminated_length": 103.0,
"epoch": 0.0046,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5540027618408203,
"kl": 0.5734180808067322,
"learning_rate": 3.3276585447123957e-06,
"loss": 0.0006,
"num_tokens": 276595.0,
"reward": 1.859375,
"reward_std": 0.02651650831103325,
"rewards/correctness_reward_func/mean": 1.359375,
"rewards/correctness_reward_func/std": 0.02651648037135601,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 69
},
{
"completion_length": 303.625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 312.0,
"completions/max_terminated_length": 312.0,
"completions/mean_length": 303.625,
"completions/mean_terminated_length": 303.625,
"completions/min_length": 299.0,
"completions/min_terminated_length": 299.0,
"epoch": 0.004666666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7526683211326599,
"kl": 0.30397114157676697,
"learning_rate": 3.272542485937369e-06,
"loss": 0.0003,
"num_tokens": 280896.0,
"reward": 1.4918367862701416,
"reward_std": 0.06512364000082016,
"rewards/correctness_reward_func/mean": 0.9918367266654968,
"rewards/correctness_reward_func/std": 0.06512364000082016,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 70
},
{
"completion_length": 443.375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 761.0,
"completions/max_terminated_length": 761.0,
"completions/mean_length": 443.375,
"completions/mean_terminated_length": 443.375,
"completions/min_length": 391.0,
"completions/min_terminated_length": 391.0,
"epoch": 0.004733333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3900875747203827,
"kl": 0.45952847599983215,
"learning_rate": 3.217008081777726e-06,
"loss": 0.0005,
"num_tokens": 286619.0,
"reward": 1.3507812023162842,
"reward_std": 0.13555601239204407,
"rewards/correctness_reward_func/mean": 0.897656261920929,
"rewards/correctness_reward_func/std": 0.0323791466653347,
"rewards/xmlcount_reward_func/mean": 0.453125,
"rewards/xmlcount_reward_func/std": 0.13258251547813416,
"step": 71
},
{
"completion_length": 493.75,
"completions/clipped_ratio": 0.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 493.75,
"completions/mean_terminated_length": 493.75,
"completions/min_length": 444.0,
"completions/min_terminated_length": 444.0,
"epoch": 0.0048,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4038362205028534,
"kl": 0.24884635210037231,
"learning_rate": 3.1610854050930063e-06,
"loss": 0.0002,
"num_tokens": 293065.0,
"reward": 1.3905092477798462,
"reward_std": 0.09639514237642288,
"rewards/correctness_reward_func/mean": 0.8905092477798462,
"rewards/correctness_reward_func/std": 0.09639513492584229,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 72
},
{
"completion_length": 329.875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 468.0,
"completions/max_terminated_length": 468.0,
"completions/mean_length": 329.875,
"completions/mean_terminated_length": 329.875,
"completions/min_length": 299.0,
"completions/min_terminated_length": 299.0,
"epoch": 0.004866666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7928107976913452,
"kl": 0.33076590299606323,
"learning_rate": 3.1048047389991693e-06,
"loss": 0.0003,
"num_tokens": 297712.0,
"reward": 1.445918321609497,
"reward_std": 0.04708431288599968,
"rewards/correctness_reward_func/mean": 0.9459183216094971,
"rewards/correctness_reward_func/std": 0.04708430916070938,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 73
},
{
"completion_length": 497.375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 533.0,
"completions/max_terminated_length": 533.0,
"completions/mean_length": 497.375,
"completions/mean_terminated_length": 497.375,
"completions/min_length": 447.0,
"completions/min_terminated_length": 447.0,
"epoch": 0.004933333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5035240650177002,
"kl": 0.27565228939056396,
"learning_rate": 3.0481965604697582e-06,
"loss": 0.0003,
"num_tokens": 304155.0,
"reward": 1.2884259223937988,
"reward_std": 0.14154860377311707,
"rewards/correctness_reward_func/mean": 0.7884259223937988,
"rewards/correctness_reward_func/std": 0.14154860377311707,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 74
},
{
"completion_length": 309.5,
"completions/clipped_ratio": 0.0,
"completions/max_length": 325.0,
"completions/max_terminated_length": 325.0,
"completions/mean_length": 309.5,
"completions/mean_terminated_length": 309.5,
"completions/min_length": 301.0,
"completions/min_terminated_length": 301.0,
"epoch": 0.005,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7498885989189148,
"kl": 0.40710383653640747,
"learning_rate": 2.9912915238320755e-06,
"loss": 0.0004,
"num_tokens": 308503.0,
"reward": 1.4122449159622192,
"reward_std": 0.029270855709910393,
"rewards/correctness_reward_func/mean": 0.9122449159622192,
"rewards/correctness_reward_func/std": 0.029270906001329422,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 75
},
{
"completion_length": 106.125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 113.0,
"completions/max_terminated_length": 113.0,
"completions/mean_length": 106.125,
"completions/mean_terminated_length": 106.125,
"completions/min_length": 101.0,
"completions/min_terminated_length": 101.0,
"epoch": 0.005066666666666666,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.3539204597473145,
"kl": 0.5555666089057922,
"learning_rate": 2.9341204441673267e-06,
"loss": 0.0006,
"num_tokens": 310560.0,
"reward": 1.803125023841858,
"reward_std": 0.03881620615720749,
"rewards/correctness_reward_func/mean": 1.303125023841858,
"rewards/correctness_reward_func/std": 0.038816213607788086,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 76
},
{
"completion_length": 62.625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 70.0,
"completions/max_terminated_length": 70.0,
"completions/mean_length": 62.625,
"completions/mean_terminated_length": 62.625,
"completions/min_length": 59.0,
"completions/min_terminated_length": 59.0,
"epoch": 0.0051333333333333335,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.8112568855285645,
"kl": 1.0676077604293823,
"learning_rate": 2.876714280623708e-06,
"loss": 0.0011,
"num_tokens": 312141.0,
"reward": 1.691666603088379,
"reward_std": 0.17343400418758392,
"rewards/correctness_reward_func/mean": 1.191666603088379,
"rewards/correctness_reward_func/std": 0.17343401908874512,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 77
},
{
"completion_length": 307.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 324.0,
"completions/max_terminated_length": 324.0,
"completions/mean_length": 307.0,
"completions/mean_terminated_length": 307.0,
"completions/min_length": 301.0,
"completions/min_terminated_length": 301.0,
"epoch": 0.0052,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8042576909065247,
"kl": 0.4135895371437073,
"learning_rate": 2.8191041196514874e-06,
"loss": 0.0004,
"num_tokens": 316581.0,
"reward": 1.5102040767669678,
"reward_std": 0.026180749759078026,
"rewards/correctness_reward_func/mean": 1.0102040767669678,
"rewards/correctness_reward_func/std": 0.026180710643529892,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 78
},
{
"completion_length": 445.125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 454.0,
"completions/max_terminated_length": 454.0,
"completions/mean_length": 445.125,
"completions/mean_terminated_length": 445.125,
"completions/min_length": 437.0,
"completions/min_terminated_length": 437.0,
"epoch": 0.005266666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3808617889881134,
"kl": 0.3184884190559387,
"learning_rate": 2.761321158169134e-06,
"loss": 0.0003,
"num_tokens": 322718.0,
"reward": 1.229629635810852,
"reward_std": 0.020091881975531578,
"rewards/correctness_reward_func/mean": 0.729629635810852,
"rewards/correctness_reward_func/std": 0.020091887563467026,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 79
},
{
"completion_length": 117.875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 150.0,
"completions/max_terminated_length": 150.0,
"completions/mean_length": 117.875,
"completions/mean_terminated_length": 117.875,
"completions/min_length": 103.0,
"completions/min_terminated_length": 103.0,
"epoch": 0.005333333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.203380584716797,
"kl": 0.9543490409851074,
"learning_rate": 2.703396686669646e-06,
"loss": 0.001,
"num_tokens": 325029.0,
"reward": 1.9249999523162842,
"reward_std": 0.13887302577495575,
"rewards/correctness_reward_func/mean": 1.4249999523162842,
"rewards/correctness_reward_func/std": 0.13887299597263336,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 80
},
{
"completion_length": 401.875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 466.0,
"completions/max_terminated_length": 466.0,
"completions/mean_length": 401.875,
"completions/mean_terminated_length": 401.875,
"completions/min_length": 387.0,
"completions/min_terminated_length": 387.0,
"epoch": 0.0054,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6207239031791687,
"kl": 0.3206217586994171,
"learning_rate": 2.6453620722761897e-06,
"loss": 0.0003,
"num_tokens": 330380.0,
"reward": 1.4375,
"reward_std": 0.034718237817287445,
"rewards/correctness_reward_func/mean": 0.9375,
"rewards/correctness_reward_func/std": 0.03471824526786804,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 81
},
{
"completion_length": 510.75,
"completions/clipped_ratio": 0.0,
"completions/max_length": 521.0,
"completions/max_terminated_length": 521.0,
"completions/mean_length": 510.75,
"completions/mean_terminated_length": 510.75,
"completions/min_length": 502.0,
"completions/min_terminated_length": 502.0,
"epoch": 0.0054666666666666665,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4737114906311035,
"kl": 0.22106648981571198,
"learning_rate": 2.587248741756253e-06,
"loss": 0.0002,
"num_tokens": 336954.0,
"reward": 1.440740704536438,
"reward_std": 0.06372092664241791,
"rewards/correctness_reward_func/mean": 0.940740704536438,
"rewards/correctness_reward_func/std": 0.06372092664241791,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 82
},
{
"completion_length": 226.125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 230.0,
"completions/max_terminated_length": 230.0,
"completions/mean_length": 226.125,
"completions/mean_terminated_length": 226.125,
"completions/min_length": 222.0,
"completions/min_terminated_length": 222.0,
"epoch": 0.005533333333333334,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6092992424964905,
"kl": 0.3960443437099457,
"learning_rate": 2.5290881645034932e-06,
"loss": 0.0004,
"num_tokens": 340379.0,
"reward": 1.5791666507720947,
"reward_std": 0.024800803512334824,
"rewards/correctness_reward_func/mean": 1.0791666507720947,
"rewards/correctness_reward_func/std": 0.024800801649689674,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 83
},
{
"completion_length": 393.375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 410.0,
"completions/max_terminated_length": 410.0,
"completions/mean_length": 393.375,
"completions/mean_terminated_length": 393.375,
"completions/min_length": 385.0,
"completions/min_terminated_length": 385.0,
"epoch": 0.0056,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7555404901504517,
"kl": 0.3491273820400238,
"learning_rate": 2.470911835496508e-06,
"loss": 0.0003,
"num_tokens": 345710.0,
"reward": 1.3624999523162842,
"reward_std": 0.036135926842689514,
"rewards/correctness_reward_func/mean": 0.8624999523162842,
"rewards/correctness_reward_func/std": 0.03613590821623802,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 84
},
{
"completion_length": 231.625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 243.0,
"completions/max_terminated_length": 243.0,
"completions/mean_length": 231.625,
"completions/mean_terminated_length": 231.625,
"completions/min_length": 221.0,
"completions/min_terminated_length": 221.0,
"epoch": 0.005666666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1376268863677979,
"kl": 0.4538520574569702,
"learning_rate": 2.4127512582437486e-06,
"loss": 0.0005,
"num_tokens": 349107.0,
"reward": 1.6541666984558105,
"reward_std": 0.0889756828546524,
"rewards/correctness_reward_func/mean": 1.1541666984558105,
"rewards/correctness_reward_func/std": 0.08897567540407181,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 85
},
{
"completion_length": 308.875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 321.0,
"completions/max_terminated_length": 321.0,
"completions/mean_length": 308.875,
"completions/mean_terminated_length": 308.875,
"completions/min_length": 304.0,
"completions/min_terminated_length": 304.0,
"epoch": 0.005733333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5613489747047424,
"kl": 0.33147352933883667,
"learning_rate": 2.3546379277238107e-06,
"loss": 0.0003,
"num_tokens": 353554.0,
"reward": 1.5193877220153809,
"reward_std": 0.018220985308289528,
"rewards/correctness_reward_func/mean": 1.0193877220153809,
"rewards/correctness_reward_func/std": 0.01822100207209587,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 86
},
{
"completion_length": 482.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 508.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 482.0,
"completions/mean_terminated_length": 482.0,
"completions/min_length": 390.0,
"completions/min_terminated_length": 390.0,
"epoch": 0.0058,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7360633611679077,
"kl": 0.4801836907863617,
"learning_rate": 2.296603313330355e-06,
"loss": 0.0005,
"num_tokens": 359986.0,
"reward": 1.3534722328186035,
"reward_std": 0.1422896832227707,
"rewards/correctness_reward_func/mean": 0.8534722328186035,
"rewards/correctness_reward_func/std": 0.14228971302509308,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 87
},
{
"completion_length": 392.5,
"completions/clipped_ratio": 0.0,
"completions/max_length": 399.0,
"completions/max_terminated_length": 399.0,
"completions/mean_length": 392.5,
"completions/mean_terminated_length": 392.5,
"completions/min_length": 388.0,
"completions/min_terminated_length": 388.0,
"epoch": 0.005866666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.529614269733429,
"kl": 0.22710314393043518,
"learning_rate": 2.238678841830867e-06,
"loss": 0.0002,
"num_tokens": 365254.0,
"reward": 1.470312476158142,
"reward_std": 0.04101800173521042,
"rewards/correctness_reward_func/mean": 0.9703124761581421,
"rewards/correctness_reward_func/std": 0.04101802781224251,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 88
},
{
"completion_length": 410.5,
"completions/clipped_ratio": 0.0,
"completions/max_length": 488.0,
"completions/max_terminated_length": 488.0,
"completions/mean_length": 410.5,
"completions/mean_terminated_length": 410.5,
"completions/min_length": 388.0,
"completions/min_terminated_length": 388.0,
"epoch": 0.005933333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5982156991958618,
"kl": 0.3257940411567688,
"learning_rate": 2.1808958803485134e-06,
"loss": 0.0003,
"num_tokens": 370786.0,
"reward": 1.4328124523162842,
"reward_std": 0.026038672775030136,
"rewards/correctness_reward_func/mean": 0.932812511920929,
"rewards/correctness_reward_func/std": 0.02603868953883648,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 89
},
{
"completion_length": 110.375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 117.0,
"completions/max_terminated_length": 117.0,
"completions/mean_length": 110.375,
"completions/mean_terminated_length": 110.375,
"completions/min_length": 105.0,
"completions/min_terminated_length": 105.0,
"epoch": 0.006,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7572274208068848,
"kl": 0.7833219766616821,
"learning_rate": 2.1232857193762923e-06,
"loss": 0.0008,
"num_tokens": 372957.0,
"reward": 1.774999976158142,
"reward_std": 0.0400891974568367,
"rewards/correctness_reward_func/mean": 1.274999976158142,
"rewards/correctness_reward_func/std": 0.04008918255567551,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 90
},
{
"completion_length": 229.5,
"completions/clipped_ratio": 0.0,
"completions/max_length": 237.0,
"completions/max_terminated_length": 237.0,
"completions/mean_length": 229.5,
"completions/mean_terminated_length": 229.5,
"completions/min_length": 217.0,
"completions/min_terminated_length": 217.0,
"epoch": 0.006066666666666666,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.748127281665802,
"kl": 0.7552896738052368,
"learning_rate": 2.0658795558326745e-06,
"loss": 0.0008,
"num_tokens": 376433.0,
"reward": 1.3583333492279053,
"reward_std": 0.04272470250725746,
"rewards/correctness_reward_func/mean": 0.8583333492279053,
"rewards/correctness_reward_func/std": 0.04272466525435448,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 91
},
{
"completion_length": 306.25,
"completions/clipped_ratio": 0.0,
"completions/max_length": 317.0,
"completions/max_terminated_length": 317.0,
"completions/mean_length": 306.25,
"completions/mean_terminated_length": 306.25,
"completions/min_length": 299.0,
"completions/min_terminated_length": 299.0,
"epoch": 0.0061333333333333335,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6332816481590271,
"kl": 0.35773783922195435,
"learning_rate": 2.0087084761679245e-06,
"loss": 0.0004,
"num_tokens": 380867.0,
"reward": 1.4734693765640259,
"reward_std": 0.050698716193437576,
"rewards/correctness_reward_func/mean": 0.9734693765640259,
"rewards/correctness_reward_func/std": 0.050698697566986084,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 92
},
{
"completion_length": 226.75,
"completions/clipped_ratio": 0.0,
"completions/max_length": 250.0,
"completions/max_terminated_length": 250.0,
"completions/mean_length": 226.75,
"completions/mean_terminated_length": 226.75,
"completions/min_length": 217.0,
"completions/min_terminated_length": 217.0,
"epoch": 0.0062,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8360308408737183,
"kl": 0.3975153863430023,
"learning_rate": 1.9518034395302413e-06,
"loss": 0.0004,
"num_tokens": 384409.0,
"reward": 1.566666603088379,
"reward_std": 0.04364359378814697,
"rewards/correctness_reward_func/mean": 1.066666603088379,
"rewards/correctness_reward_func/std": 0.043643590062856674,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 93
},
{
"completion_length": 705.25,
"completions/clipped_ratio": 0.125,
"completions/max_length": 1745.0,
"completions/max_terminated_length": 788.0,
"completions/mean_length": 705.25,
"completions/mean_terminated_length": 556.7142944335938,
"completions/min_length": 501.0,
"completions/min_terminated_length": 501.0,
"epoch": 0.006266666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3474324941635132,
"kl": 0.18562744557857513,
"learning_rate": 1.895195261000831e-06,
"loss": 0.0002,
"num_tokens": 392475.0,
"reward": 1.246759295463562,
"reward_std": 0.16773828864097595,
"rewards/correctness_reward_func/mean": 0.8092592358589172,
"rewards/correctness_reward_func/std": 0.04941357672214508,
"rewards/xmlcount_reward_func/mean": 0.4375,
"rewards/xmlcount_reward_func/std": 0.1767766922712326,
"step": 94
},
{
"completion_length": 66.875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 81.0,
"completions/max_terminated_length": 81.0,
"completions/mean_length": 66.875,
"completions/mean_terminated_length": 66.875,
"completions/min_length": 60.0,
"completions/min_terminated_length": 60.0,
"epoch": 0.006333333333333333,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.022137416526675224,
"kl": 1.1477253437042236,
"learning_rate": 1.8389145949069953e-06,
"loss": 0.0011,
"num_tokens": 394226.0,
"reward": 3.5,
"reward_std": 0.0,
"rewards/correctness_reward_func/mean": 3.0,
"rewards/correctness_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 95
},
{
"completion_length": 403.75,
"completions/clipped_ratio": 0.0,
"completions/max_length": 411.0,
"completions/max_terminated_length": 411.0,
"completions/mean_length": 403.75,
"completions/mean_terminated_length": 403.75,
"completions/min_length": 396.0,
"completions/min_terminated_length": 396.0,
"epoch": 0.0064,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.46710848808288574,
"kl": 0.2613021433353424,
"learning_rate": 1.7829919182222752e-06,
"loss": 0.0003,
"num_tokens": 399752.0,
"reward": 1.3953125476837158,
"reward_std": 0.03578673303127289,
"rewards/correctness_reward_func/mean": 0.8953125476837158,
"rewards/correctness_reward_func/std": 0.03578675538301468,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 96
},
{
"completion_length": 264.25,
"completions/clipped_ratio": 0.0,
"completions/max_length": 499.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 264.25,
"completions/mean_terminated_length": 264.25,
"completions/min_length": 222.0,
"completions/min_terminated_length": 222.0,
"epoch": 0.006466666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8940762281417847,
"kl": 0.3712972402572632,
"learning_rate": 1.7274575140626318e-06,
"loss": 0.0004,
"num_tokens": 403538.0,
"reward": 1.5916666984558105,
"reward_std": 0.042724691331386566,
"rewards/correctness_reward_func/mean": 1.0916666984558105,
"rewards/correctness_reward_func/std": 0.042724668979644775,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 97
},
{
"completion_length": 304.375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 312.0,
"completions/max_terminated_length": 312.0,
"completions/mean_length": 304.375,
"completions/mean_terminated_length": 304.375,
"completions/min_length": 298.0,
"completions/min_terminated_length": 298.0,
"epoch": 0.006533333333333334,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7366277575492859,
"kl": 0.27435365319252014,
"learning_rate": 1.6723414552876052e-06,
"loss": 0.0003,
"num_tokens": 407853.0,
"reward": 1.4581632614135742,
"reward_std": 0.027575310319662094,
"rewards/correctness_reward_func/mean": 0.9581632614135742,
"rewards/correctness_reward_func/std": 0.02757529355585575,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 98
},
{
"completion_length": 385.875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 416.0,
"completions/max_terminated_length": 416.0,
"completions/mean_length": 385.875,
"completions/mean_terminated_length": 385.875,
"completions/min_length": 359.0,
"completions/min_terminated_length": 359.0,
"epoch": 0.0066,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7135072350502014,
"kl": 0.2686983048915863,
"learning_rate": 1.6176735882153284e-06,
"loss": 0.0003,
"num_tokens": 413020.0,
"reward": 1.3273437023162842,
"reward_std": 0.17168211936950684,
"rewards/correctness_reward_func/mean": 0.827343761920929,
"rewards/correctness_reward_func/std": 0.17168213427066803,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 99
},
{
"completion_length": 232.75,
"completions/clipped_ratio": 0.0,
"completions/max_length": 253.0,
"completions/max_terminated_length": 253.0,
"completions/mean_length": 232.75,
"completions/mean_terminated_length": 232.75,
"completions/min_length": 223.0,
"completions/min_terminated_length": 223.0,
"epoch": 0.006666666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6561140418052673,
"kl": 0.36341872811317444,
"learning_rate": 1.56348351646022e-06,
"loss": 0.0004,
"num_tokens": 416618.0,
"reward": 1.6083333492279053,
"reward_std": 0.046291012316942215,
"rewards/correctness_reward_func/mean": 1.1083333492279053,
"rewards/correctness_reward_func/std": 0.04629099741578102,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 100
},
{
"completion_length": 69.625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 79.0,
"completions/max_terminated_length": 79.0,
"completions/mean_length": 69.625,
"completions/mean_terminated_length": 69.625,
"completions/min_length": 64.0,
"completions/min_terminated_length": 64.0,
"epoch": 0.006733333333333333,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0018204136285930872,
"kl": 1.0169363021850586,
"learning_rate": 1.509800584902108e-06,
"loss": 0.001,
"num_tokens": 418319.0,
"reward": 3.5,
"reward_std": 0.0,
"rewards/correctness_reward_func/mean": 3.0,
"rewards/correctness_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 101
},
{
"completion_length": 305.5,
"completions/clipped_ratio": 0.0,
"completions/max_length": 319.0,
"completions/max_terminated_length": 319.0,
"completions/mean_length": 305.5,
"completions/mean_terminated_length": 305.5,
"completions/min_length": 294.0,
"completions/min_terminated_length": 294.0,
"epoch": 0.0068,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5131522417068481,
"kl": 0.3132336437702179,
"learning_rate": 1.4566538637954556e-06,
"loss": 0.0003,
"num_tokens": 422547.0,
"reward": 1.4489796161651611,
"reward_std": 0.03463377058506012,
"rewards/correctness_reward_func/mean": 0.9489796161651611,
"rewards/correctness_reward_func/std": 0.03463379293680191,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 102
},
{
"completion_length": 502.625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 530.0,
"completions/max_terminated_length": 530.0,
"completions/mean_length": 502.625,
"completions/mean_terminated_length": 502.625,
"completions/min_length": 488.0,
"completions/min_terminated_length": 488.0,
"epoch": 0.006866666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.40155088901519775,
"kl": 0.25858941674232483,
"learning_rate": 1.4040721330273063e-06,
"loss": 0.0003,
"num_tokens": 429144.0,
"reward": 1.3925926685333252,
"reward_std": 0.017707079648971558,
"rewards/correctness_reward_func/mean": 0.8925925493240356,
"rewards/correctness_reward_func/std": 0.017707087099552155,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 103
},
{
"completion_length": 304.75,
"completions/clipped_ratio": 0.0,
"completions/max_length": 312.0,
"completions/max_terminated_length": 312.0,
"completions/mean_length": 304.75,
"completions/mean_terminated_length": 304.75,
"completions/min_length": 296.0,
"completions/min_terminated_length": 296.0,
"epoch": 0.006933333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8429935574531555,
"kl": 0.3152122497558594,
"learning_rate": 1.3520838665324704e-06,
"loss": 0.0003,
"num_tokens": 433502.0,
"reward": 1.620408296585083,
"reward_std": 0.04719792306423187,
"rewards/correctness_reward_func/mean": 1.120408296585083,
"rewards/correctness_reward_func/std": 0.04719793051481247,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 104
},
{
"completion_length": 227.25,
"completions/clipped_ratio": 0.0,
"completions/max_length": 234.0,
"completions/max_terminated_length": 234.0,
"completions/mean_length": 227.25,
"completions/mean_terminated_length": 227.25,
"completions/min_length": 223.0,
"completions/min_terminated_length": 223.0,
"epoch": 0.007,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8614357113838196,
"kl": 0.37386342883110046,
"learning_rate": 1.3007172168743854e-06,
"loss": 0.0004,
"num_tokens": 436944.0,
"reward": 1.537500023841858,
"reward_std": 0.051754895597696304,
"rewards/correctness_reward_func/mean": 1.037500023841858,
"rewards/correctness_reward_func/std": 0.05175492912530899,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 105
},
{
"completion_length": 104.375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 114.0,
"completions/max_terminated_length": 114.0,
"completions/mean_length": 104.375,
"completions/mean_terminated_length": 104.375,
"completions/min_length": 100.0,
"completions/min_terminated_length": 100.0,
"epoch": 0.007066666666666666,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7808598279953003,
"kl": 0.581811249256134,
"learning_rate": 1.2500000000000007e-06,
"loss": 0.0006,
"num_tokens": 439019.0,
"reward": 3.293750047683716,
"reward_std": 0.5833630561828613,
"rewards/correctness_reward_func/mean": 2.793750047683716,
"rewards/correctness_reward_func/std": 0.5833631157875061,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 106
},
{
"completion_length": 228.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 236.0,
"completions/max_terminated_length": 236.0,
"completions/mean_length": 228.0,
"completions/mean_terminated_length": 228.0,
"completions/min_length": 223.0,
"completions/min_terminated_length": 223.0,
"epoch": 0.0071333333333333335,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6257885694503784,
"kl": 0.40867361426353455,
"learning_rate": 1.1999596801769617e-06,
"loss": 0.0004,
"num_tokens": 442611.0,
"reward": 1.5999999046325684,
"reward_std": 0.04364356771111488,
"rewards/correctness_reward_func/mean": 1.0999999046325684,
"rewards/correctness_reward_func/std": 0.043643563985824585,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 107
},
{
"completion_length": 102.875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 108.0,
"completions/max_terminated_length": 108.0,
"completions/mean_length": 102.875,
"completions/mean_terminated_length": 102.875,
"completions/min_length": 98.0,
"completions/min_terminated_length": 98.0,
"epoch": 0.0072,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.187460422515869,
"kl": 0.6943673491477966,
"learning_rate": 1.1506233551212186e-06,
"loss": 0.0007,
"num_tokens": 444634.0,
"reward": 1.6156249046325684,
"reward_std": 0.13557912409305573,
"rewards/correctness_reward_func/mean": 1.1156249046325684,
"rewards/correctness_reward_func/std": 0.13557912409305573,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 108
},
{
"completion_length": 225.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 229.0,
"completions/max_terminated_length": 229.0,
"completions/mean_length": 225.0,
"completions/mean_terminated_length": 225.0,
"completions/min_length": 222.0,
"completions/min_terminated_length": 222.0,
"epoch": 0.007266666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7688984870910645,
"kl": 0.2997400760650635,
"learning_rate": 1.1020177413231334e-06,
"loss": 0.0003,
"num_tokens": 448178.0,
"reward": 1.9041666984558105,
"reward_std": 0.12010247260332108,
"rewards/correctness_reward_func/mean": 1.4041666984558105,
"rewards/correctness_reward_func/std": 0.12010248750448227,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 109
},
{
"completion_length": 511.375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 543.0,
"completions/max_terminated_length": 543.0,
"completions/mean_length": 511.375,
"completions/mean_terminated_length": 511.375,
"completions/min_length": 499.0,
"completions/min_terminated_length": 499.0,
"epoch": 0.007333333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5489156246185303,
"kl": 0.3106227219104767,
"learning_rate": 1.0541691595800338e-06,
"loss": 0.0003,
"num_tokens": 454669.0,
"reward": 1.2565972805023193,
"reward_std": 0.12646111845970154,
"rewards/correctness_reward_func/mean": 0.7722221612930298,
"rewards/correctness_reward_func/std": 0.09936108440160751,
"rewards/xmlcount_reward_func/mean": 0.484375,
"rewards/xmlcount_reward_func/std": 0.04419417306780815,
"step": 110
},
{
"completion_length": 105.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 109.0,
"completions/max_terminated_length": 109.0,
"completions/mean_length": 105.0,
"completions/mean_terminated_length": 105.0,
"completions/min_length": 100.0,
"completions/min_terminated_length": 100.0,
"epoch": 0.0074,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0009156708256341517,
"kl": 0.6540140509605408,
"learning_rate": 1.0071035207430352e-06,
"loss": 0.0007,
"num_tokens": 456733.0,
"reward": 1.850000023841858,
"reward_std": 0.0,
"rewards/correctness_reward_func/mean": 1.350000023841858,
"rewards/correctness_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 111
},
{
"completion_length": 305.625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 319.0,
"completions/max_terminated_length": 319.0,
"completions/mean_length": 305.625,
"completions/mean_terminated_length": 305.625,
"completions/min_length": 295.0,
"completions/min_terminated_length": 295.0,
"epoch": 0.007466666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6442992687225342,
"kl": 0.36618179082870483,
"learning_rate": 9.608463116858544e-07,
"loss": 0.0004,
"num_tokens": 461034.0,
"reward": 1.4826531410217285,
"reward_std": 0.04229113087058067,
"rewards/correctness_reward_func/mean": 0.982653021812439,
"rewards/correctness_reward_func/std": 0.04229113087058067,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 112
},
{
"completion_length": 144.5,
"completions/clipped_ratio": 0.0,
"completions/max_length": 232.0,
"completions/max_terminated_length": 232.0,
"completions/mean_length": 144.5,
"completions/mean_terminated_length": 144.5,
"completions/min_length": 102.0,
"completions/min_terminated_length": 102.0,
"epoch": 0.007533333333333334,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6790902614593506,
"kl": 0.6071375012397766,
"learning_rate": 9.154225815032242e-07,
"loss": 0.0006,
"num_tokens": 463350.0,
"reward": 1.90625,
"reward_std": 0.03471820428967476,
"rewards/correctness_reward_func/mean": 1.40625,
"rewards/correctness_reward_func/std": 0.03471821919083595,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 113
},
{
"completion_length": 227.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 234.0,
"completions/max_terminated_length": 234.0,
"completions/mean_length": 227.0,
"completions/mean_terminated_length": 227.0,
"completions/min_length": 223.0,
"completions/min_terminated_length": 223.0,
"epoch": 0.0076,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0019490718841553,
"kl": 0.36533433198928833,
"learning_rate": 8.708569279463622e-07,
"loss": 0.0004,
"num_tokens": 466782.0,
"reward": 1.625,
"reward_std": 0.06842906773090363,
"rewards/correctness_reward_func/mean": 1.125,
"rewards/correctness_reward_func/std": 0.06842907518148422,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 114
},
{
"completion_length": 111.125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 120.0,
"completions/max_terminated_length": 120.0,
"completions/mean_length": 111.125,
"completions/mean_terminated_length": 111.125,
"completions/min_length": 105.0,
"completions/min_terminated_length": 105.0,
"epoch": 0.007666666666666666,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.1805026531219482,
"kl": 0.7322518825531006,
"learning_rate": 8.271734841028553e-07,
"loss": 0.0007,
"num_tokens": 468943.0,
"reward": 1.803125023841858,
"reward_std": 0.08908011764287949,
"rewards/correctness_reward_func/mean": 1.303125023841858,
"rewards/correctness_reward_func/std": 0.08908012509346008,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 115
},
{
"completion_length": 134.5,
"completions/clipped_ratio": 0.0,
"completions/max_length": 223.0,
"completions/max_terminated_length": 223.0,
"completions/mean_length": 134.5,
"completions/mean_terminated_length": 134.5,
"completions/min_length": 105.0,
"completions/min_terminated_length": 105.0,
"epoch": 0.007733333333333333,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.005333705805242062,
"kl": 0.5382705330848694,
"learning_rate": 7.843959053281663e-07,
"loss": 0.0005,
"num_tokens": 471387.0,
"reward": 3.5,
"reward_std": 0.0,
"rewards/correctness_reward_func/mean": 3.0,
"rewards/correctness_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 116
},
{
"completion_length": 388.5,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 388.5,
"completions/mean_terminated_length": 388.5,
"completions/min_length": 381.0,
"completions/min_terminated_length": 381.0,
"epoch": 0.0078,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6017757654190063,
"kl": 0.38339361548423767,
"learning_rate": 7.425473564358457e-07,
"loss": 0.0004,
"num_tokens": 476759.0,
"reward": 1.30859375,
"reward_std": 0.03673892840743065,
"rewards/correctness_reward_func/mean": 0.80859375,
"rewards/correctness_reward_func/std": 0.03673893213272095,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 117
},
{
"completion_length": 72.25,
"completions/clipped_ratio": 0.0,
"completions/max_length": 108.0,
"completions/max_terminated_length": 108.0,
"completions/mean_length": 72.25,
"completions/mean_terminated_length": 72.25,
"completions/min_length": 56.0,
"completions/min_terminated_length": 56.0,
"epoch": 0.007866666666666666,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.01756666600704193,
"kl": 1.2723251581192017,
"learning_rate": 7.016504991533727e-07,
"loss": 0.0013,
"num_tokens": 478377.0,
"reward": 3.5,
"reward_std": 0.0,
"rewards/correctness_reward_func/mean": 3.0,
"rewards/correctness_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 118
},
{
"completion_length": 226.625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 237.0,
"completions/max_terminated_length": 237.0,
"completions/mean_length": 226.625,
"completions/mean_terminated_length": 226.625,
"completions/min_length": 219.0,
"completions/min_terminated_length": 219.0,
"epoch": 0.007933333333333334,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8822813034057617,
"kl": 0.5029420256614685,
"learning_rate": 6.617274798504286e-07,
"loss": 0.0005,
"num_tokens": 481918.0,
"reward": 1.566666603088379,
"reward_std": 0.025197675451636314,
"rewards/correctness_reward_func/mean": 1.066666603088379,
"rewards/correctness_reward_func/std": 0.025197653099894524,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 119
},
{
"completion_length": 401.5,
"completions/clipped_ratio": 0.0,
"completions/max_length": 411.0,
"completions/max_terminated_length": 411.0,
"completions/mean_length": 401.5,
"completions/mean_terminated_length": 401.5,
"completions/min_length": 393.0,
"completions/min_terminated_length": 393.0,
"epoch": 0.008,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8926092386245728,
"kl": 0.3140659034252167,
"learning_rate": 6.227999175462521e-07,
"loss": 0.0003,
"num_tokens": 487298.0,
"reward": 1.5382813215255737,
"reward_std": 0.049035049974918365,
"rewards/correctness_reward_func/mean": 1.0382813215255737,
"rewards/correctness_reward_func/std": 0.049035049974918365,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 120
},
{
"completion_length": 296.5,
"completions/clipped_ratio": 0.0,
"completions/max_length": 309.0,
"completions/max_terminated_length": 309.0,
"completions/mean_length": 296.5,
"completions/mean_terminated_length": 296.5,
"completions/min_length": 288.0,
"completions/min_terminated_length": 288.0,
"epoch": 0.008066666666666666,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1491118669509888,
"kl": 0.42884907126426697,
"learning_rate": 5.848888922025553e-07,
"loss": 0.0004,
"num_tokens": 491462.0,
"reward": 1.4948980808258057,
"reward_std": 0.04329225793480873,
"rewards/correctness_reward_func/mean": 0.9948980212211609,
"rewards/correctness_reward_func/std": 0.04329225420951843,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 121
},
{
"completion_length": 235.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 250.0,
"completions/max_terminated_length": 250.0,
"completions/mean_length": 235.0,
"completions/mean_terminated_length": 235.0,
"completions/min_length": 227.0,
"completions/min_terminated_length": 227.0,
"epoch": 0.008133333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8497012257575989,
"kl": 0.38695231080055237,
"learning_rate": 5.48014933308352e-07,
"loss": 0.0004,
"num_tokens": 494910.0,
"reward": 1.5625,
"reward_std": 0.02136235125362873,
"rewards/correctness_reward_func/mean": 1.0625,
"rewards/correctness_reward_func/std": 0.021362358704209328,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 122
},
{
"completion_length": 493.625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 515.0,
"completions/max_terminated_length": 515.0,
"completions/mean_length": 493.625,
"completions/mean_terminated_length": 493.625,
"completions/min_length": 458.0,
"completions/min_terminated_length": 458.0,
"epoch": 0.0082,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5217005610466003,
"kl": 0.29427477717399597,
"learning_rate": 5.121980087628802e-07,
"loss": 0.0003,
"num_tokens": 501307.0,
"reward": 1.3701388835906982,
"reward_std": 0.08480729162693024,
"rewards/correctness_reward_func/mean": 0.8701388835906982,
"rewards/correctness_reward_func/std": 0.08480728417634964,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 123
},
{
"completion_length": 300.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 305.0,
"completions/max_terminated_length": 305.0,
"completions/mean_length": 300.0,
"completions/mean_terminated_length": 300.0,
"completions/min_length": 290.0,
"completions/min_terminated_length": 290.0,
"epoch": 0.008266666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6252700090408325,
"kl": 0.3322438895702362,
"learning_rate": 4.774575140626317e-07,
"loss": 0.0003,
"num_tokens": 505563.0,
"reward": 1.5040816068649292,
"reward_std": 0.04487145319581032,
"rewards/correctness_reward_func/mean": 1.0040816068649292,
"rewards/correctness_reward_func/std": 0.04487145319581032,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 124
},
{
"completion_length": 398.375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 409.0,
"completions/max_terminated_length": 409.0,
"completions/mean_length": 398.375,
"completions/mean_terminated_length": 398.375,
"completions/min_length": 393.0,
"completions/min_terminated_length": 393.0,
"epoch": 0.008333333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5247986316680908,
"kl": 0.2880602180957794,
"learning_rate": 4.438122617983442e-07,
"loss": 0.0003,
"num_tokens": 510910.0,
"reward": 1.47265625,
"reward_std": 0.03534547612071037,
"rewards/correctness_reward_func/mean": 0.97265625,
"rewards/correctness_reward_func/std": 0.035345472395420074,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 125
},
{
"completion_length": 109.25,
"completions/clipped_ratio": 0.0,
"completions/max_length": 122.0,
"completions/max_terminated_length": 122.0,
"completions/mean_length": 109.25,
"completions/mean_terminated_length": 109.25,
"completions/min_length": 103.0,
"completions/min_terminated_length": 103.0,
"epoch": 0.0084,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.40777325630188,
"kl": 0.7852993607521057,
"learning_rate": 4.1128047146765936e-07,
"loss": 0.0008,
"num_tokens": 512952.0,
"reward": 1.8125,
"reward_std": 0.08017835021018982,
"rewards/correctness_reward_func/mean": 1.3125,
"rewards/correctness_reward_func/std": 0.08017835021018982,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 126
},
{
"completion_length": 113.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 140.0,
"completions/max_terminated_length": 140.0,
"completions/mean_length": 113.0,
"completions/mean_terminated_length": 113.0,
"completions/min_length": 107.0,
"completions/min_terminated_length": 107.0,
"epoch": 0.008466666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6808403730392456,
"kl": 0.7311285734176636,
"learning_rate": 3.798797596089351e-07,
"loss": 0.0007,
"num_tokens": 515128.0,
"reward": 1.615625023841858,
"reward_std": 0.06258923560380936,
"rewards/correctness_reward_func/mean": 1.115625023841858,
"rewards/correctness_reward_func/std": 0.06258922815322876,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 127
},
{
"completion_length": 487.625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 530.0,
"completions/max_terminated_length": 530.0,
"completions/mean_length": 487.625,
"completions/mean_terminated_length": 487.625,
"completions/min_length": 437.0,
"completions/min_terminated_length": 437.0,
"epoch": 0.008533333333333334,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8878220319747925,
"kl": 0.2915445566177368,
"learning_rate": 3.4962713026158697e-07,
"loss": 0.0003,
"num_tokens": 521421.0,
"reward": 1.3439815044403076,
"reward_std": 0.11603187024593353,
"rewards/correctness_reward_func/mean": 0.8439815044403076,
"rewards/correctness_reward_func/std": 0.11603190749883652,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 128
},
{
"completion_length": 428.5,
"completions/clipped_ratio": 0.0,
"completions/max_length": 528.0,
"completions/max_terminated_length": 528.0,
"completions/mean_length": 428.5,
"completions/mean_terminated_length": 428.5,
"completions/min_length": 388.0,
"completions/min_terminated_length": 388.0,
"epoch": 0.0086,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4781865179538727,
"kl": 0.2633122205734253,
"learning_rate": 3.2053896575809426e-07,
"loss": 0.0003,
"num_tokens": 526913.0,
"reward": 1.40234375,
"reward_std": 0.038081441074609756,
"rewards/correctness_reward_func/mean": 0.90234375,
"rewards/correctness_reward_func/std": 0.03808142989873886,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 129
},
{
"completion_length": 119.5,
"completions/clipped_ratio": 0.0,
"completions/max_length": 173.0,
"completions/max_terminated_length": 173.0,
"completions/mean_length": 119.5,
"completions/mean_terminated_length": 119.5,
"completions/min_length": 105.0,
"completions/min_terminated_length": 105.0,
"epoch": 0.008666666666666666,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0010288404300808907,
"kl": 0.6225458979606628,
"learning_rate": 2.9263101785268253e-07,
"loss": 0.0006,
"num_tokens": 529037.0,
"reward": 1.774999976158142,
"reward_std": 0.0,
"rewards/correctness_reward_func/mean": 1.274999976158142,
"rewards/correctness_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 130
},
{
"completion_length": 474.5,
"completions/clipped_ratio": 0.0,
"completions/max_length": 499.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 474.5,
"completions/mean_terminated_length": 474.5,
"completions/min_length": 431.0,
"completions/min_terminated_length": 431.0,
"epoch": 0.008733333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6302768588066101,
"kl": 0.34326815605163574,
"learning_rate": 2.6591839919146963e-07,
"loss": 0.0003,
"num_tokens": 535233.0,
"reward": 1.3050925731658936,
"reward_std": 0.12362809479236603,
"rewards/correctness_reward_func/mean": 0.8050925731658936,
"rewards/correctness_reward_func/std": 0.12362809479236603,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 131
},
{
"completion_length": 493.875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 499.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 493.875,
"completions/mean_terminated_length": 493.875,
"completions/min_length": 484.0,
"completions/min_terminated_length": 484.0,
"epoch": 0.0088,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7365954518318176,
"kl": 0.9036705493927002,
"learning_rate": 2.404155751286988e-07,
"loss": 0.0009,
"num_tokens": 541680.0,
"reward": 1.3611111640930176,
"reward_std": 0.05035635083913803,
"rewards/correctness_reward_func/mean": 0.8611111044883728,
"rewards/correctness_reward_func/std": 0.050356365740299225,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 132
},
{
"completion_length": 403.25,
"completions/clipped_ratio": 0.0,
"completions/max_length": 432.0,
"completions/max_terminated_length": 432.0,
"completions/mean_length": 403.25,
"completions/mean_terminated_length": 403.25,
"completions/min_length": 391.0,
"completions/min_terminated_length": 391.0,
"epoch": 0.008866666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5254420638084412,
"kl": 0.32496964931488037,
"learning_rate": 2.1613635589349756e-07,
"loss": 0.0003,
"num_tokens": 547090.0,
"reward": 1.4890625476837158,
"reward_std": 0.027900906279683113,
"rewards/correctness_reward_func/mean": 0.989062488079071,
"rewards/correctness_reward_func/std": 0.027900898829102516,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 133
},
{
"completion_length": 308.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 315.0,
"completions/max_terminated_length": 315.0,
"completions/mean_length": 308.0,
"completions/mean_terminated_length": 308.0,
"completions/min_length": 301.0,
"completions/min_terminated_length": 301.0,
"epoch": 0.008933333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.06843101978302,
"kl": 0.3652152419090271,
"learning_rate": 1.9309388911139427e-07,
"loss": 0.0004,
"num_tokens": 551474.0,
"reward": 1.485714316368103,
"reward_std": 0.04719790816307068,
"rewards/correctness_reward_func/mean": 0.985714316368103,
"rewards/correctness_reward_func/std": 0.04719791188836098,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 134
},
{
"completion_length": 511.875,
"completions/clipped_ratio": 0.125,
"completions/max_length": 560.0,
"completions/max_terminated_length": 515.0,
"completions/mean_length": 511.875,
"completions/mean_terminated_length": 505.0000305175781,
"completions/min_length": 489.0,
"completions/min_terminated_length": 489.0,
"epoch": 0.009,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.595820426940918,
"kl": 0.25694188475608826,
"learning_rate": 1.713006526846439e-07,
"loss": 0.0003,
"num_tokens": 558169.0,
"reward": 1.4703704118728638,
"reward_std": 0.046680908650159836,
"rewards/correctness_reward_func/mean": 0.970370352268219,
"rewards/correctness_reward_func/std": 0.04668092727661133,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 135
},
{
"completion_length": 311.25,
"completions/clipped_ratio": 0.0,
"completions/max_length": 325.0,
"completions/max_terminated_length": 325.0,
"completions/mean_length": 311.25,
"completions/mean_terminated_length": 311.25,
"completions/min_length": 305.0,
"completions/min_terminated_length": 305.0,
"epoch": 0.009066666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8594949841499329,
"kl": 0.34354594349861145,
"learning_rate": 1.507684480352292e-07,
"loss": 0.0003,
"num_tokens": 562531.0,
"reward": 1.4979591369628906,
"reward_std": 0.0392710380256176,
"rewards/correctness_reward_func/mean": 0.9979591369628906,
"rewards/correctness_reward_func/std": 0.03927105292677879,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 136
},
{
"completion_length": 312.25,
"completions/clipped_ratio": 0.0,
"completions/max_length": 327.0,
"completions/max_terminated_length": 327.0,
"completions/mean_length": 312.25,
"completions/mean_terminated_length": 312.25,
"completions/min_length": 300.0,
"completions/min_terminated_length": 300.0,
"epoch": 0.009133333333333334,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.516212821006775,
"kl": 0.3029043972492218,
"learning_rate": 1.31508393714177e-07,
"loss": 0.0003,
"num_tokens": 566933.0,
"reward": 1.5897959470748901,
"reward_std": 0.031389568001031876,
"rewards/correctness_reward_func/mean": 1.0897959470748901,
"rewards/correctness_reward_func/std": 0.03138954937458038,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 137
},
{
"completion_length": 64.125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 70.0,
"completions/max_terminated_length": 70.0,
"completions/mean_length": 64.125,
"completions/mean_terminated_length": 64.125,
"completions/min_length": 58.0,
"completions/min_terminated_length": 58.0,
"epoch": 0.0092,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.05483343079686165,
"kl": 1.4965819120407104,
"learning_rate": 1.1353091938067024e-07,
"loss": 0.0015,
"num_tokens": 568670.0,
"reward": 3.5,
"reward_std": 0.0,
"rewards/correctness_reward_func/mean": 3.0,
"rewards/correctness_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 138
},
{
"completion_length": 303.25,
"completions/clipped_ratio": 0.0,
"completions/max_length": 310.0,
"completions/max_terminated_length": 310.0,
"completions/mean_length": 303.25,
"completions/mean_terminated_length": 303.25,
"completions/min_length": 298.0,
"completions/min_terminated_length": 298.0,
"epoch": 0.009266666666666666,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6266666650772095,
"kl": 0.35942456126213074,
"learning_rate": 9.684576015420277e-08,
"loss": 0.0004,
"num_tokens": 572984.0,
"reward": 1.430612325668335,
"reward_std": 0.03644194081425667,
"rewards/correctness_reward_func/mean": 0.9306122064590454,
"rewards/correctness_reward_func/std": 0.03644197806715965,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 139
},
{
"completion_length": 494.375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 517.0,
"completions/max_terminated_length": 517.0,
"completions/mean_length": 494.375,
"completions/mean_terminated_length": 494.375,
"completions/min_length": 444.0,
"completions/min_terminated_length": 444.0,
"epoch": 0.009333333333333334,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5252565145492554,
"kl": 0.28363481163978577,
"learning_rate": 8.146195134284052e-08,
"loss": 0.0003,
"num_tokens": 579523.0,
"reward": 1.3534722328186035,
"reward_std": 0.10252274572849274,
"rewards/correctness_reward_func/mean": 0.8534722328186035,
"rewards/correctness_reward_func/std": 0.10252274572849274,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 140
},
{
"completion_length": 229.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 239.0,
"completions/max_terminated_length": 239.0,
"completions/mean_length": 229.0,
"completions/mean_terminated_length": 229.0,
"completions/min_length": 221.0,
"completions/min_terminated_length": 221.0,
"epoch": 0.0094,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1325074434280396,
"kl": 0.38521093130111694,
"learning_rate": 6.738782355044048e-08,
"loss": 0.0004,
"num_tokens": 583011.0,
"reward": 1.5833333730697632,
"reward_std": 0.043643586337566376,
"rewards/correctness_reward_func/mean": 1.0833333730697632,
"rewards/correctness_reward_func/std": 0.04364357516169548,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 141
},
{
"completion_length": 312.625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 376.0,
"completions/max_terminated_length": 376.0,
"completions/mean_length": 312.625,
"completions/mean_terminated_length": 312.625,
"completions/min_length": 300.0,
"completions/min_terminated_length": 300.0,
"epoch": 0.009466666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6133989691734314,
"kl": 0.3078048825263977,
"learning_rate": 5.463099816548578e-08,
"loss": 0.0003,
"num_tokens": 587368.0,
"reward": 1.4489796161651611,
"reward_std": 0.03463380038738251,
"rewards/correctness_reward_func/mean": 0.9489796161651611,
"rewards/correctness_reward_func/std": 0.03463379293680191,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 142
},
{
"completion_length": 400.375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 416.0,
"completions/max_terminated_length": 416.0,
"completions/mean_length": 400.375,
"completions/mean_terminated_length": 400.375,
"completions/min_length": 394.0,
"completions/min_terminated_length": 394.0,
"epoch": 0.009533333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5225932002067566,
"kl": 0.2902478873729706,
"learning_rate": 4.319838323396691e-08,
"loss": 0.0003,
"num_tokens": 592715.0,
"reward": 1.4421875476837158,
"reward_std": 0.0357866995036602,
"rewards/correctness_reward_func/mean": 0.942187488079071,
"rewards/correctness_reward_func/std": 0.03578675165772438,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 143
},
{
"completion_length": 401.375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 411.0,
"completions/max_terminated_length": 411.0,
"completions/mean_length": 401.375,
"completions/mean_terminated_length": 401.375,
"completions/min_length": 391.0,
"completions/min_terminated_length": 391.0,
"epoch": 0.0096,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5049280524253845,
"kl": 0.27406245470046997,
"learning_rate": 3.309616971855195e-08,
"loss": 0.0003,
"num_tokens": 598118.0,
"reward": 1.5125000476837158,
"reward_std": 0.056694649159908295,
"rewards/correctness_reward_func/mean": 1.0125000476837158,
"rewards/correctness_reward_func/std": 0.05669466406106949,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 144
},
{
"completion_length": 68.5,
"completions/clipped_ratio": 0.0,
"completions/max_length": 95.0,
"completions/max_terminated_length": 95.0,
"completions/mean_length": 68.5,
"completions/mean_terminated_length": 68.5,
"completions/min_length": 51.0,
"completions/min_terminated_length": 51.0,
"epoch": 0.009666666666666667,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.006095044314861298,
"kl": 1.1216628551483154,
"learning_rate": 2.4329828146074096e-08,
"loss": 0.0011,
"num_tokens": 599706.0,
"reward": 2.0333333015441895,
"reward_std": 0.0,
"rewards/correctness_reward_func/mean": 1.5333333015441895,
"rewards/correctness_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 145
},
{
"completion_length": 60.625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 70.0,
"completions/max_terminated_length": 70.0,
"completions/mean_length": 60.625,
"completions/mean_terminated_length": 60.625,
"completions/min_length": 57.0,
"completions/min_terminated_length": 57.0,
"epoch": 0.009733333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.0240890979766846,
"kl": 1.1488873958587646,
"learning_rate": 1.6904105645142443e-08,
"loss": 0.0011,
"num_tokens": 601295.0,
"reward": 2.308333396911621,
"reward_std": 0.7535629868507385,
"rewards/correctness_reward_func/mean": 1.808333396911621,
"rewards/correctness_reward_func/std": 0.7535629868507385,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 146
},
{
"completion_length": 228.5,
"completions/clipped_ratio": 0.0,
"completions/max_length": 236.0,
"completions/max_terminated_length": 236.0,
"completions/mean_length": 228.5,
"completions/mean_terminated_length": 228.5,
"completions/min_length": 222.0,
"completions/min_terminated_length": 222.0,
"epoch": 0.0098,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8666861653327942,
"kl": 0.8305746912956238,
"learning_rate": 1.0823023375489128e-08,
"loss": 0.0008,
"num_tokens": 604659.0,
"reward": 1.379166603088379,
"reward_std": 0.017251623794436455,
"rewards/correctness_reward_func/mean": 0.8791666626930237,
"rewards/correctness_reward_func/std": 0.017251623794436455,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 147
},
{
"completion_length": 226.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 237.0,
"completions/max_terminated_length": 237.0,
"completions/mean_length": 226.0,
"completions/mean_terminated_length": 226.0,
"completions/min_length": 221.0,
"completions/min_terminated_length": 221.0,
"epoch": 0.009866666666666666,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6299178600311279,
"kl": 0.3639201819896698,
"learning_rate": 6.089874350439507e-09,
"loss": 0.0004,
"num_tokens": 608107.0,
"reward": 1.629166603088379,
"reward_std": 0.05473604425787926,
"rewards/correctness_reward_func/mean": 1.129166603088379,
"rewards/correctness_reward_func/std": 0.054736021906137466,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 148
},
{
"completion_length": 103.375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 107.0,
"completions/max_terminated_length": 107.0,
"completions/mean_length": 103.375,
"completions/mean_terminated_length": 103.375,
"completions/min_length": 99.0,
"completions/min_terminated_length": 99.0,
"epoch": 0.009933333333333334,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.348041534423828,
"kl": 0.7296304702758789,
"learning_rate": 2.7072216536885855e-09,
"loss": 0.0007,
"num_tokens": 610142.0,
"reward": 1.896875023841858,
"reward_std": 0.03881613910198212,
"rewards/correctness_reward_func/mean": 1.396875023841858,
"rewards/correctness_reward_func/std": 0.03881615400314331,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 149
},
{
"completion_length": 74.25,
"completions/clipped_ratio": 0.0,
"completions/max_length": 95.0,
"completions/max_terminated_length": 95.0,
"completions/mean_length": 74.25,
"completions/mean_terminated_length": 74.25,
"completions/min_length": 65.0,
"completions/min_terminated_length": 65.0,
"epoch": 0.01,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.000831127166748,
"kl": 1.3544789552688599,
"learning_rate": 6.768970513457151e-10,
"loss": 0.0014,
"num_tokens": 611768.0,
"reward": 1.7000000476837158,
"reward_std": 0.0617213174700737,
"rewards/correctness_reward_func/mean": 1.2000000476837158,
"rewards/correctness_reward_func/std": 0.06172133609652519,
"rewards/xmlcount_reward_func/mean": 0.5,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 150
}
],
"logging_steps": 1,
"max_steps": 150,
"num_input_tokens_seen": 611768,
"num_train_epochs": 1,
"save_steps": 150,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}