AgPerry's picture
Training in progress, step 200, checkpoint
b5c3325 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.6993006993006993,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"all_correct": 0.0,
"all_wrong": 0.7142857142857143,
"completion_length": 310.1607360839844,
"epoch": 0.0034965034965034965,
"grad_norm": 2.380299597679801,
"kl": 0.0,
"learning_rate": 9.999698350006064e-07,
"loss": -0.0,
"reward": 0.7892857193946838,
"reward_std": 0.2723116874694824,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.4642857313156128,
"step": 1,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.0,
"all_wrong": 0.14285714285714285,
"completion_length": 327.83929443359375,
"epoch": 0.006993006993006993,
"grad_norm": 1.9019752792945082,
"kl": 0.000507354736328125,
"learning_rate": 9.99879343642134e-07,
"loss": 0.0,
"reward": 0.8178572058677673,
"reward_std": 0.5231723785400391,
"rewards/accuracy_reward": 0.267857164144516,
"rewards/format_reward": 0.5178571939468384,
"step": 2,
"temporal_rewards": 0.5
},
{
"all_correct": 0.0,
"all_wrong": 0.0,
"completion_length": 202.2678680419922,
"epoch": 0.01048951048951049,
"grad_norm": 4.450412510480093,
"kl": 0.00102996826171875,
"learning_rate": 9.997285368432701e-07,
"loss": 0.0,
"reward": 1.4160715341567993,
"reward_std": 0.5284246206283569,
"rewards/accuracy_reward": 0.6964285969734192,
"rewards/format_reward": 0.5892857313156128,
"step": 3,
"temporal_rewards": 0.5
},
{
"all_correct": 0.0,
"all_wrong": 0.14285714285714285,
"completion_length": 270.08929443359375,
"epoch": 0.013986013986013986,
"grad_norm": 2.315112630577657,
"kl": 0.00173187255859375,
"learning_rate": 9.99517432800363e-07,
"loss": 0.0001,
"reward": 0.8560110330581665,
"reward_std": 0.5568417906761169,
"rewards/accuracy_reward": 0.300653874874115,
"rewards/format_reward": 0.4910714626312256,
"step": 4,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.0,
"all_wrong": 0.0,
"completion_length": 188.57144165039062,
"epoch": 0.017482517482517484,
"grad_norm": 2.724486071626872,
"kl": 0.0050048828125,
"learning_rate": 9.992460569852254e-07,
"loss": 0.0002,
"reward": 1.4839285612106323,
"reward_std": 0.5056161880493164,
"rewards/accuracy_reward": 0.5535714626312256,
"rewards/format_reward": 0.7767857313156128,
"step": 5,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.0,
"all_wrong": 0.0,
"completion_length": 219.7678680419922,
"epoch": 0.02097902097902098,
"grad_norm": 2.8351172787613836,
"kl": 0.00860595703125,
"learning_rate": 9.989144421420628e-07,
"loss": 0.0003,
"reward": 1.4946428537368774,
"reward_std": 0.729103147983551,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.8660714626312256,
"step": 6,
"temporal_rewards": 0.8571428656578064
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.42857142857142855,
"completion_length": 293.9107360839844,
"epoch": 0.024475524475524476,
"grad_norm": 2.0246275476434126,
"kl": 0.00592041015625,
"learning_rate": 9.985226282835216e-07,
"loss": 0.0002,
"reward": 1.1946429014205933,
"reward_std": 0.29120072722435,
"rewards/accuracy_reward": 0.267857164144516,
"rewards/format_reward": 0.8660714626312256,
"step": 7,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 220.85714721679688,
"epoch": 0.027972027972027972,
"grad_norm": 7.642786587891259,
"kl": 0.00732421875,
"learning_rate": 9.980706626858607e-07,
"loss": 0.0003,
"reward": 1.4571430683135986,
"reward_std": 0.3087652921676636,
"rewards/accuracy_reward": 0.3750000298023224,
"rewards/format_reward": 0.9910714626312256,
"step": 8,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.2857142857142857,
"completion_length": 319.51788330078125,
"epoch": 0.03146853146853147,
"grad_norm": 0.9767130753502494,
"kl": 0.00543212890625,
"learning_rate": 9.975585998832479e-07,
"loss": 0.0002,
"reward": 1.462499976158142,
"reward_std": 0.24715806543827057,
"rewards/accuracy_reward": 0.4464285969734192,
"rewards/format_reward": 0.9196429252624512,
"step": 9,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.0,
"all_wrong": 0.2857142857142857,
"completion_length": 212.07144165039062,
"epoch": 0.03496503496503497,
"grad_norm": 4.547473584939765,
"kl": 0.01116943359375,
"learning_rate": 9.9698650166118e-07,
"loss": 0.0004,
"reward": 1.4553571939468384,
"reward_std": 0.4356800317764282,
"rewards/accuracy_reward": 0.4285714626312256,
"rewards/format_reward": 0.9464285969734192,
"step": 10,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.42857142857142855,
"completion_length": 240.62501525878906,
"epoch": 0.038461538461538464,
"grad_norm": 0.9309831321101983,
"kl": 0.0140380859375,
"learning_rate": 9.963544370490268e-07,
"loss": 0.0006,
"reward": 1.567857265472412,
"reward_std": 0.1529129594564438,
"rewards/accuracy_reward": 0.535714328289032,
"rewards/format_reward": 0.9464285969734192,
"step": 11,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 183.42857360839844,
"epoch": 0.04195804195804196,
"grad_norm": 2.703233395202908,
"kl": 0.01611328125,
"learning_rate": 9.956624823117034e-07,
"loss": 0.0006,
"reward": 1.5553573369979858,
"reward_std": 0.48100805282592773,
"rewards/accuracy_reward": 0.535714328289032,
"rewards/format_reward": 0.9196429252624512,
"step": 12,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 282.0,
"epoch": 0.045454545454545456,
"grad_norm": 4.458836776433339,
"kl": 0.0230712890625,
"learning_rate": 9.949107209404663e-07,
"loss": 0.0009,
"reward": 1.3767857551574707,
"reward_std": 0.49624696373939514,
"rewards/accuracy_reward": 0.3571428656578064,
"rewards/format_reward": 0.9285714626312256,
"step": 13,
"temporal_rewards": 0.8571428656578064
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.14285714285714285,
"completion_length": 194.57144165039062,
"epoch": 0.04895104895104895,
"grad_norm": 3.5177838483972175,
"kl": 0.037353515625,
"learning_rate": 9.940992436428409e-07,
"loss": 0.0015,
"reward": 1.6607143878936768,
"reward_std": 0.33305349946022034,
"rewards/accuracy_reward": 0.535714328289032,
"rewards/format_reward": 0.9910714626312256,
"step": 14,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.0,
"all_wrong": 0.2857142857142857,
"completion_length": 211.55357360839844,
"epoch": 0.05244755244755245,
"grad_norm": 1.9694919360921588,
"kl": 0.034912109375,
"learning_rate": 9.932281483316758e-07,
"loss": 0.0014,
"reward": 1.350000023841858,
"reward_std": 0.3625797629356384,
"rewards/accuracy_reward": 0.3035714328289032,
"rewards/format_reward": 0.9821429252624512,
"step": 15,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.0,
"completion_length": 160.9107208251953,
"epoch": 0.055944055944055944,
"grad_norm": 8.09731279933887,
"kl": 0.032958984375,
"learning_rate": 9.922975401133292e-07,
"loss": 0.0013,
"reward": 1.7142858505249023,
"reward_std": 0.534223735332489,
"rewards/accuracy_reward": 0.5714285969734192,
"rewards/format_reward": 1.0,
"step": 16,
"temporal_rewards": 0.9285714030265808
},
{
"all_correct": 0.0,
"all_wrong": 0.5714285714285714,
"completion_length": 157.75,
"epoch": 0.05944055944055944,
"grad_norm": 2.860877842453824,
"kl": 0.0262451171875,
"learning_rate": 9.913075312749865e-07,
"loss": 0.001,
"reward": 1.1875,
"reward_std": 0.24128873646259308,
"rewards/accuracy_reward": 0.1607142984867096,
"rewards/format_reward": 1.0,
"step": 17,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.0,
"all_wrong": 0.2857142857142857,
"completion_length": 329.9285888671875,
"epoch": 0.06293706293706294,
"grad_norm": 1.8234744714725273,
"kl": 0.020751953125,
"learning_rate": 9.902582412711118e-07,
"loss": 0.0008,
"reward": 1.321428656578064,
"reward_std": 0.36161118745803833,
"rewards/accuracy_reward": 0.3214285969734192,
"rewards/format_reward": 0.9821429252624512,
"step": 18,
"temporal_rewards": 0.4285714328289032
},
{
"all_correct": 0.0,
"all_wrong": 0.42857142857142855,
"completion_length": 236.12501525878906,
"epoch": 0.06643356643356643,
"grad_norm": 4.288921643679896,
"kl": 0.0203857421875,
"learning_rate": 9.891497967090343e-07,
"loss": 0.0008,
"reward": 1.383928656578064,
"reward_std": 0.4247966706752777,
"rewards/accuracy_reward": 0.3571428656578064,
"rewards/format_reward": 0.9196429252624512,
"step": 19,
"temporal_rewards": 0.8571428656578064
},
{
"all_correct": 0.0,
"all_wrong": 0.5714285714285714,
"completion_length": 294.375,
"epoch": 0.06993006993006994,
"grad_norm": 4.6505006273887135,
"kl": 0.0223388671875,
"learning_rate": 9.879823313336722e-07,
"loss": 0.0009,
"reward": 1.196428656578064,
"reward_std": 0.39018020033836365,
"rewards/accuracy_reward": 0.267857164144516,
"rewards/format_reward": 0.848214328289032,
"step": 20,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.0,
"all_wrong": 0.2857142857142857,
"completion_length": 167.2857208251953,
"epoch": 0.07342657342657342,
"grad_norm": 4.184221344881508,
"kl": 0.03173828125,
"learning_rate": 9.86755986011395e-07,
"loss": 0.0013,
"reward": 1.5053571462631226,
"reward_std": 0.3639127314090729,
"rewards/accuracy_reward": 0.392857164144516,
"rewards/format_reward": 1.0,
"step": 21,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.0,
"completion_length": 201.19644165039062,
"epoch": 0.07692307692307693,
"grad_norm": 2.690611018179203,
"kl": 0.0279541015625,
"learning_rate": 9.85470908713026e-07,
"loss": 0.0011,
"reward": 1.7660715579986572,
"reward_std": 0.40759509801864624,
"rewards/accuracy_reward": 0.6428571939468384,
"rewards/format_reward": 1.0,
"step": 22,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.0,
"all_wrong": 0.14285714285714285,
"completion_length": 335.1964416503906,
"epoch": 0.08041958041958042,
"grad_norm": 2.7267639716406697,
"kl": 0.01708984375,
"learning_rate": 9.84127254495989e-07,
"loss": 0.0007,
"reward": 1.1553571224212646,
"reward_std": 0.48766034841537476,
"rewards/accuracy_reward": 0.196428582072258,
"rewards/format_reward": 0.9285714626312256,
"step": 23,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 213.2857208251953,
"epoch": 0.08391608391608392,
"grad_norm": 4.208996844683101,
"kl": 0.0230712890625,
"learning_rate": 9.82725185485599e-07,
"loss": 0.0009,
"reward": 1.5232144594192505,
"reward_std": 0.3175846040248871,
"rewards/accuracy_reward": 0.4285714626312256,
"rewards/format_reward": 0.9910714626312256,
"step": 24,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 213.85714721679688,
"epoch": 0.08741258741258741,
"grad_norm": 3.3203724401356522,
"kl": 0.026611328125,
"learning_rate": 9.81264870855499e-07,
"loss": 0.0011,
"reward": 1.5660713911056519,
"reward_std": 0.3996223509311676,
"rewards/accuracy_reward": 0.4642857313156128,
"rewards/format_reward": 1.0,
"step": 25,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 143.83929443359375,
"epoch": 0.09090909090909091,
"grad_norm": 5.297436012938182,
"kl": 0.035888671875,
"learning_rate": 9.797464868072486e-07,
"loss": 0.0014,
"reward": 1.5696429014205933,
"reward_std": 0.3823689818382263,
"rewards/accuracy_reward": 0.4464285969734192,
"rewards/format_reward": 1.0,
"step": 26,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.14285714285714285,
"completion_length": 169.0357208251953,
"epoch": 0.0944055944055944,
"grad_norm": 2.83355376716199,
"kl": 0.034423828125,
"learning_rate": 9.781702165490637e-07,
"loss": 0.0014,
"reward": 1.783928632736206,
"reward_std": 0.2857024669647217,
"rewards/accuracy_reward": 0.6071428656578064,
"rewards/format_reward": 1.0,
"step": 27,
"temporal_rewards": 0.8571428656578064
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.5714285714285714,
"completion_length": 208.35714721679688,
"epoch": 0.0979020979020979,
"grad_norm": 1.024100633083295,
"kl": 0.0272216796875,
"learning_rate": 9.765362502737097e-07,
"loss": 0.0011,
"reward": 1.399999976158142,
"reward_std": 0.16669097542762756,
"rewards/accuracy_reward": 0.3214285969734192,
"rewards/format_reward": 1.0,
"step": 28,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.5714285714285714,
"completion_length": 325.6071472167969,
"epoch": 0.10139860139860139,
"grad_norm": 1.5716694001154234,
"kl": 0.016845703125,
"learning_rate": 9.748447851355533e-07,
"loss": 0.0007,
"reward": 1.4339287281036377,
"reward_std": 0.26826655864715576,
"rewards/accuracy_reward": 0.3750000298023224,
"rewards/format_reward": 0.9464285969734192,
"step": 29,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.0,
"completion_length": 269.5357360839844,
"epoch": 0.1048951048951049,
"grad_norm": 2.5384743751593186,
"kl": 0.02294921875,
"learning_rate": 9.730960252267742e-07,
"loss": 0.0009,
"reward": 1.7035715579986572,
"reward_std": 0.45422056317329407,
"rewards/accuracy_reward": 0.6071428656578064,
"rewards/format_reward": 0.9910714626312256,
"step": 30,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.2857142857142857,
"completion_length": 262.2321472167969,
"epoch": 0.10839160839160839,
"grad_norm": 2.0666478679768003,
"kl": 0.02294921875,
"learning_rate": 9.712901815527385e-07,
"loss": 0.0009,
"reward": 1.6035715341567993,
"reward_std": 0.2705501914024353,
"rewards/accuracy_reward": 0.4821428656578064,
"rewards/format_reward": 0.9821429252624512,
"step": 31,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.0,
"completion_length": 134.4107208251953,
"epoch": 0.11188811188811189,
"grad_norm": 2.5344699018238,
"kl": 0.03662109375,
"learning_rate": 9.694274720065398e-07,
"loss": 0.0015,
"reward": 1.892857313156128,
"reward_std": 0.3375154435634613,
"rewards/accuracy_reward": 0.7321428656578064,
"rewards/format_reward": 1.0,
"step": 32,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.0,
"completion_length": 191.83929443359375,
"epoch": 0.11538461538461539,
"grad_norm": 4.114325907280972,
"kl": 0.0245361328125,
"learning_rate": 9.675081213427074e-07,
"loss": 0.001,
"reward": 1.8464285135269165,
"reward_std": 0.3899296522140503,
"rewards/accuracy_reward": 0.6785714626312256,
"rewards/format_reward": 1.0,
"step": 33,
"temporal_rewards": 0.8571428656578064
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 283.9285888671875,
"epoch": 0.11888111888111888,
"grad_norm": 2.057519800361947,
"kl": 0.0166015625,
"learning_rate": 9.655323611500873e-07,
"loss": 0.0007,
"reward": 1.2660715579986572,
"reward_std": 0.2641655504703522,
"rewards/accuracy_reward": 0.2321428656578064,
"rewards/format_reward": 0.9821429252624512,
"step": 34,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.0,
"all_wrong": 0.2857142857142857,
"completion_length": 263.08929443359375,
"epoch": 0.12237762237762238,
"grad_norm": 1.6784752125291083,
"kl": 0.01470947265625,
"learning_rate": 9.635004298239002e-07,
"loss": 0.0006,
"reward": 1.3589285612106323,
"reward_std": 0.31579551100730896,
"rewards/accuracy_reward": 0.3035714328289032,
"rewards/format_reward": 1.0,
"step": 35,
"temporal_rewards": 0.4285714328289032
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 209.21429443359375,
"epoch": 0.1258741258741259,
"grad_norm": 3.2262252650312333,
"kl": 0.0191650390625,
"learning_rate": 9.614125725369745e-07,
"loss": 0.0008,
"reward": 1.4517858028411865,
"reward_std": 0.3842121958732605,
"rewards/accuracy_reward": 0.392857164144516,
"rewards/format_reward": 0.9910714626312256,
"step": 36,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 254.5357208251953,
"epoch": 0.12937062937062938,
"grad_norm": 2.461213562184743,
"kl": 0.0172119140625,
"learning_rate": 9.592690412101657e-07,
"loss": 0.0007,
"reward": 1.7017858028411865,
"reward_std": 0.44310158491134644,
"rewards/accuracy_reward": 0.5892857313156128,
"rewards/format_reward": 0.9821429252624512,
"step": 37,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.42857142857142855,
"completion_length": 226.35714721679688,
"epoch": 0.13286713286713286,
"grad_norm": 3.674434851466659,
"kl": 0.0172119140625,
"learning_rate": 9.570700944819582e-07,
"loss": 0.0007,
"reward": 1.3678572177886963,
"reward_std": 0.14704714715480804,
"rewards/accuracy_reward": 0.3571428656578064,
"rewards/format_reward": 1.0,
"step": 38,
"temporal_rewards": 0.5
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.5714285714285714,
"completion_length": 317.2321472167969,
"epoch": 0.13636363636363635,
"grad_norm": 1.9427116571962744,
"kl": 0.01251220703125,
"learning_rate": 9.548159976772592e-07,
"loss": 0.0005,
"reward": 1.2660715579986572,
"reward_std": 0.3043068051338196,
"rewards/accuracy_reward": 0.267857164144516,
"rewards/format_reward": 0.9285714626312256,
"step": 39,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.5714285714285714,
"all_wrong": 0.0,
"completion_length": 178.9107208251953,
"epoch": 0.13986013986013987,
"grad_norm": 1.4904776222689382,
"kl": 0.01708984375,
"learning_rate": 9.525070227753833e-07,
"loss": 0.0007,
"reward": 1.9839286804199219,
"reward_std": 0.20042385160923004,
"rewards/accuracy_reward": 0.7678571939468384,
"rewards/format_reward": 1.0,
"step": 40,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 232.73214721679688,
"epoch": 0.14335664335664336,
"grad_norm": 3.261090873434451,
"kl": 0.0159912109375,
"learning_rate": 9.50143448377237e-07,
"loss": 0.0006,
"reward": 1.692857265472412,
"reward_std": 0.45394080877304077,
"rewards/accuracy_reward": 0.5892857313156128,
"rewards/format_reward": 0.9642857313156128,
"step": 41,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.0,
"all_wrong": 0.42857142857142855,
"completion_length": 261.64288330078125,
"epoch": 0.14685314685314685,
"grad_norm": 3.8796318426121963,
"kl": 0.0174560546875,
"learning_rate": 9.477255596717011e-07,
"loss": 0.0007,
"reward": 1.3589285612106323,
"reward_std": 0.31432804465293884,
"rewards/accuracy_reward": 0.3035714328289032,
"rewards/format_reward": 0.973214328289032,
"step": 42,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.0,
"all_wrong": 0.42857142857142855,
"completion_length": 314.01788330078125,
"epoch": 0.15034965034965034,
"grad_norm": 1.9128498151636428,
"kl": 0.0133056640625,
"learning_rate": 9.452536484012212e-07,
"loss": 0.0005,
"reward": 1.4642857313156128,
"reward_std": 0.3462127447128296,
"rewards/accuracy_reward": 0.4107142984867096,
"rewards/format_reward": 0.9642857313156128,
"step": 43,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 265.0714416503906,
"epoch": 0.15384615384615385,
"grad_norm": 2.8462472228365914,
"kl": 0.0174560546875,
"learning_rate": 9.427280128266049e-07,
"loss": 0.0007,
"reward": 1.485714316368103,
"reward_std": 0.41046157479286194,
"rewards/accuracy_reward": 0.4642857313156128,
"rewards/format_reward": 0.8928571939468384,
"step": 44,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.0,
"all_wrong": 0.2857142857142857,
"completion_length": 185.5178680419922,
"epoch": 0.15734265734265734,
"grad_norm": 3.3769919224351135,
"kl": 0.0216064453125,
"learning_rate": 9.401489576910348e-07,
"loss": 0.0009,
"reward": 1.255357265472412,
"reward_std": 0.3892122805118561,
"rewards/accuracy_reward": 0.196428582072258,
"rewards/format_reward": 1.0,
"step": 45,
"temporal_rewards": 0.9285714030265808
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 310.3571472167969,
"epoch": 0.16083916083916083,
"grad_norm": 3.5940256384708946,
"kl": 0.0137939453125,
"learning_rate": 9.375167941832973e-07,
"loss": 0.0006,
"reward": 1.5750000476837158,
"reward_std": 0.41830208897590637,
"rewards/accuracy_reward": 0.4821428656578064,
"rewards/format_reward": 0.9821429252624512,
"step": 46,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.0,
"all_wrong": 0.2857142857142857,
"completion_length": 222.12501525878906,
"epoch": 0.16433566433566432,
"grad_norm": 3.3517712947005456,
"kl": 0.0159912109375,
"learning_rate": 9.348318399002345e-07,
"loss": 0.0006,
"reward": 1.3285715579986572,
"reward_std": 0.34704774618148804,
"rewards/accuracy_reward": 0.267857164144516,
"rewards/format_reward": 1.0,
"step": 47,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.0,
"completion_length": 239.0357208251953,
"epoch": 0.16783216783216784,
"grad_norm": 1.5367703370703922,
"kl": 0.0230712890625,
"learning_rate": 9.320944188084241e-07,
"loss": 0.0009,
"reward": 1.6750000715255737,
"reward_std": 0.33668023347854614,
"rewards/accuracy_reward": 0.535714328289032,
"rewards/format_reward": 1.0,
"step": 48,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 174.7678680419922,
"epoch": 0.17132867132867133,
"grad_norm": 2.4741938852675536,
"kl": 0.020263671875,
"learning_rate": 9.293048612050883e-07,
"loss": 0.0008,
"reward": 1.4267858266830444,
"reward_std": 0.3323812484741211,
"rewards/accuracy_reward": 0.3571428656578064,
"rewards/format_reward": 1.0,
"step": 49,
"temporal_rewards": 0.5
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.42857142857142855,
"completion_length": 246.62501525878906,
"epoch": 0.17482517482517482,
"grad_norm": 1.6031937748680414,
"kl": 0.0189208984375,
"learning_rate": 9.264635036782405e-07,
"loss": 0.0008,
"reward": 1.3767857551574707,
"reward_std": 0.23027247190475464,
"rewards/accuracy_reward": 0.3392857313156128,
"rewards/format_reward": 1.0,
"step": 50,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.42857142857142855,
"all_wrong": 0.0,
"completion_length": 172.92857360839844,
"epoch": 0.17832167832167833,
"grad_norm": 2.1468520628879197,
"kl": 0.0201416015625,
"learning_rate": 9.235706890660732e-07,
"loss": 0.0008,
"reward": 2.0625,
"reward_std": 0.2902068495750427,
"rewards/accuracy_reward": 0.8571429252624512,
"rewards/format_reward": 1.0,
"step": 51,
"temporal_rewards": 0.8571428656578064
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.2857142857142857,
"completion_length": 225.3928680419922,
"epoch": 0.18181818181818182,
"grad_norm": 2.041359786136277,
"kl": 0.0166015625,
"learning_rate": 9.206267664155906e-07,
"loss": 0.0007,
"reward": 1.5571428537368774,
"reward_std": 0.16162440180778503,
"rewards/accuracy_reward": 0.4464285969734192,
"rewards/format_reward": 1.0,
"step": 52,
"temporal_rewards": 0.5
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 219.5178680419922,
"epoch": 0.1853146853146853,
"grad_norm": 2.1860526381838405,
"kl": 0.01953125,
"learning_rate": 9.176320909404923e-07,
"loss": 0.0008,
"reward": 1.4892857074737549,
"reward_std": 0.34200409054756165,
"rewards/accuracy_reward": 0.4285714626312256,
"rewards/format_reward": 0.9821429252624512,
"step": 53,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.14285714285714285,
"completion_length": 146.5,
"epoch": 0.1888111888111888,
"grad_norm": 7.135493678756918,
"kl": 0.0250244140625,
"learning_rate": 9.145870239783141e-07,
"loss": 0.001,
"reward": 1.889285683631897,
"reward_std": 0.3193110227584839,
"rewards/accuracy_reward": 0.7142857313156128,
"rewards/format_reward": 1.0,
"step": 54,
"temporal_rewards": 0.9285714030265808
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.14285714285714285,
"completion_length": 199.71429443359375,
"epoch": 0.19230769230769232,
"grad_norm": 1.6651581992503819,
"kl": 0.018798828125,
"learning_rate": 9.114919329468282e-07,
"loss": 0.0008,
"reward": 1.8017858266830444,
"reward_std": 0.29444825649261475,
"rewards/accuracy_reward": 0.6428571939468384,
"rewards/format_reward": 1.0,
"step": 55,
"temporal_rewards": 0.8571428656578064
},
{
"all_correct": 0.0,
"all_wrong": 0.42857142857142855,
"completion_length": 267.8571472167969,
"epoch": 0.1958041958041958,
"grad_norm": 5.502551920924967,
"kl": 0.01953125,
"learning_rate": 9.083471912997108e-07,
"loss": 0.0008,
"reward": 1.1821428537368774,
"reward_std": 0.2820361256599426,
"rewards/accuracy_reward": 0.1607142984867096,
"rewards/format_reward": 1.0,
"step": 56,
"temporal_rewards": 0.4285714328289032
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.2857142857142857,
"completion_length": 286.6607360839844,
"epoch": 0.1993006993006993,
"grad_norm": 1.0605637999415247,
"kl": 0.0162353515625,
"learning_rate": 9.051531784814816e-07,
"loss": 0.0007,
"reward": 1.662500023841858,
"reward_std": 0.24488236010074615,
"rewards/accuracy_reward": 0.5714285969734192,
"rewards/format_reward": 0.9464285969734192,
"step": 57,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 388.0357360839844,
"epoch": 0.20279720279720279,
"grad_norm": 2.4145653726848453,
"kl": 0.01318359375,
"learning_rate": 9.019102798817195e-07,
"loss": 0.0005,
"reward": 1.3767857551574707,
"reward_std": 0.5423558950424194,
"rewards/accuracy_reward": 0.4285714626312256,
"rewards/format_reward": 0.8571429252624512,
"step": 58,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 199.3928680419922,
"epoch": 0.2062937062937063,
"grad_norm": 3.84081401020837,
"kl": 0.0224609375,
"learning_rate": 8.986188867885616e-07,
"loss": 0.0009,
"reward": 1.3732143640518188,
"reward_std": 0.3065285086631775,
"rewards/accuracy_reward": 0.3035714328289032,
"rewards/format_reward": 1.0,
"step": 59,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.5714285714285714,
"all_wrong": 0.0,
"completion_length": 220.35714721679688,
"epoch": 0.2097902097902098,
"grad_norm": 2.5367802513821687,
"kl": 0.02001953125,
"learning_rate": 8.952793963414906e-07,
"loss": 0.0008,
"reward": 1.9267857074737549,
"reward_std": 0.3000243604183197,
"rewards/accuracy_reward": 0.7678571939468384,
"rewards/format_reward": 0.9821429252624512,
"step": 60,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.14285714285714285,
"completion_length": 299.0535888671875,
"epoch": 0.21328671328671328,
"grad_norm": 5.034973765435732,
"kl": 0.016845703125,
"learning_rate": 8.918922114834156e-07,
"loss": 0.0007,
"reward": 1.7000000476837158,
"reward_std": 0.38362255692481995,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.9464285969734192,
"step": 61,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.5714285714285714,
"all_wrong": 0.0,
"completion_length": 154.21429443359375,
"epoch": 0.21678321678321677,
"grad_norm": 2.038565297311163,
"kl": 0.0269775390625,
"learning_rate": 8.884577409120535e-07,
"loss": 0.0011,
"reward": 2.1178572177886963,
"reward_std": 0.24774520099163055,
"rewards/accuracy_reward": 0.8928571939468384,
"rewards/format_reward": 1.0,
"step": 62,
"temporal_rewards": 0.9285714030265808
},
{
"all_correct": 0.5714285714285714,
"all_wrong": 0.2857142857142857,
"completion_length": 190.6607208251953,
"epoch": 0.2202797202797203,
"grad_norm": 1.081625687458147,
"kl": 0.02294921875,
"learning_rate": 8.849763990306152e-07,
"loss": 0.0009,
"reward": 1.7767858505249023,
"reward_std": 0.16224628686904907,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.9642857313156128,
"step": 63,
"temporal_rewards": 0.9285714030265808
},
{
"all_correct": 0.0,
"all_wrong": 0.42857142857142855,
"completion_length": 286.1785888671875,
"epoch": 0.22377622377622378,
"grad_norm": 9.904092820497874,
"kl": 0.0238037109375,
"learning_rate": 8.814486058978033e-07,
"loss": 0.001,
"reward": 1.321428656578064,
"reward_std": 0.394111305475235,
"rewards/accuracy_reward": 0.3571428656578064,
"rewards/format_reward": 0.8571429252624512,
"step": 64,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 344.1785888671875,
"epoch": 0.22727272727272727,
"grad_norm": 1.3046406890165314,
"kl": 0.01422119140625,
"learning_rate": 8.778747871771291e-07,
"loss": 0.0006,
"reward": 1.289285659790039,
"reward_std": 0.37180283665657043,
"rewards/accuracy_reward": 0.2857142984867096,
"rewards/format_reward": 0.9464285969734192,
"step": 65,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.0,
"all_wrong": 0.42857142857142855,
"completion_length": 252.8035888671875,
"epoch": 0.23076923076923078,
"grad_norm": 2.5493063713880852,
"kl": 0.017333984375,
"learning_rate": 8.742553740855505e-07,
"loss": 0.0007,
"reward": 1.1375000476837158,
"reward_std": 0.41169679164886475,
"rewards/accuracy_reward": 0.196428582072258,
"rewards/format_reward": 0.910714328289032,
"step": 66,
"temporal_rewards": 0.4285714328289032
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 242.57144165039062,
"epoch": 0.23426573426573427,
"grad_norm": 5.785405122787179,
"kl": 0.021484375,
"learning_rate": 8.705908033414424e-07,
"loss": 0.0009,
"reward": 1.6410715579986572,
"reward_std": 0.3037279546260834,
"rewards/accuracy_reward": 0.5178571939468384,
"rewards/format_reward": 1.0,
"step": 67,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.0,
"all_wrong": 0.5714285714285714,
"completion_length": 249.73214721679688,
"epoch": 0.23776223776223776,
"grad_norm": 2.926217120408116,
"kl": 0.0189208984375,
"learning_rate": 8.668815171119019e-07,
"loss": 0.0008,
"reward": 1.1678571701049805,
"reward_std": 0.30887019634246826,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.8750000596046448,
"step": 68,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.0,
"all_wrong": 0.14285714285714285,
"completion_length": 237.94644165039062,
"epoch": 0.24125874125874125,
"grad_norm": 7.506566104564083,
"kl": 0.0230712890625,
"learning_rate": 8.631279629593966e-07,
"loss": 0.0009,
"reward": 1.35535728931427,
"reward_std": 0.43541181087493896,
"rewards/accuracy_reward": 0.3214285969734192,
"rewards/format_reward": 0.9642857313156128,
"step": 69,
"temporal_rewards": 0.4285714328289032
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 245.5357208251953,
"epoch": 0.24475524475524477,
"grad_norm": 3.175217838092476,
"kl": 0.0198974609375,
"learning_rate": 8.593305937877613e-07,
"loss": 0.0008,
"reward": 1.4392857551574707,
"reward_std": 0.328709214925766,
"rewards/accuracy_reward": 0.3750000298023224,
"rewards/format_reward": 1.0,
"step": 70,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.7142857142857143,
"completion_length": 236.2678680419922,
"epoch": 0.24825174825174826,
"grad_norm": 1.0790210342784936,
"kl": 0.019775390625,
"learning_rate": 8.554898677875508e-07,
"loss": 0.0008,
"reward": 1.3071428537368774,
"reward_std": 0.13647663593292236,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.9821429252624512,
"step": 71,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.7142857142857143,
"all_wrong": 0.0,
"completion_length": 148.4107208251953,
"epoch": 0.2517482517482518,
"grad_norm": 7.833869612393734,
"kl": 0.027587890625,
"learning_rate": 8.516062483807554e-07,
"loss": 0.0011,
"reward": 2.1125001907348633,
"reward_std": 0.1543826460838318,
"rewards/accuracy_reward": 0.8928571939468384,
"rewards/format_reward": 1.0,
"step": 72,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.42857142857142855,
"completion_length": 171.07144165039062,
"epoch": 0.25524475524475526,
"grad_norm": 1.8151636448140855,
"kl": 0.0240478515625,
"learning_rate": 8.476802041648831e-07,
"loss": 0.001,
"reward": 1.5642857551574707,
"reward_std": 0.1599045991897583,
"rewards/accuracy_reward": 0.4464285969734192,
"rewards/format_reward": 1.0,
"step": 73,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 217.07144165039062,
"epoch": 0.25874125874125875,
"grad_norm": 4.21791710784553,
"kl": 0.01953125,
"learning_rate": 8.437122088564197e-07,
"loss": 0.0008,
"reward": 1.673214316368103,
"reward_std": 0.43455594778060913,
"rewards/accuracy_reward": 0.535714328289032,
"rewards/format_reward": 1.0,
"step": 74,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.0,
"all_wrong": 0.42857142857142855,
"completion_length": 243.8928680419922,
"epoch": 0.26223776223776224,
"grad_norm": 2.3789653415719485,
"kl": 0.0189208984375,
"learning_rate": 8.39702741233669e-07,
"loss": 0.0008,
"reward": 1.2517858743667603,
"reward_std": 0.2406107783317566,
"rewards/accuracy_reward": 0.3571428656578064,
"rewards/format_reward": 0.8571429252624512,
"step": 75,
"temporal_rewards": 0.357142835855484
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.0,
"completion_length": 255.62501525878906,
"epoch": 0.26573426573426573,
"grad_norm": 3.628990949164193,
"kl": 0.022705078125,
"learning_rate": 8.356522850789851e-07,
"loss": 0.0009,
"reward": 1.4482142925262451,
"reward_std": 0.44743552803993225,
"rewards/accuracy_reward": 0.4642857313156128,
"rewards/format_reward": 0.8571429252624512,
"step": 76,
"temporal_rewards": 0.8571428656578064
},
{
"all_correct": 0.0,
"all_wrong": 0.2857142857142857,
"completion_length": 224.71429443359375,
"epoch": 0.2692307692307692,
"grad_norm": 3.98575994223548,
"kl": 0.0213623046875,
"learning_rate": 8.315613291203976e-07,
"loss": 0.0009,
"reward": 1.4410713911056519,
"reward_std": 0.4627973139286041,
"rewards/accuracy_reward": 0.3571428656578064,
"rewards/format_reward": 0.9821429252624512,
"step": 77,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.7142857142857143,
"all_wrong": 0.14285714285714285,
"completion_length": 174.82144165039062,
"epoch": 0.2727272727272727,
"grad_norm": 1.9033544021109488,
"kl": 0.021240234375,
"learning_rate": 8.274303669726426e-07,
"loss": 0.0008,
"reward": 1.9803571701049805,
"reward_std": 0.10934228450059891,
"rewards/accuracy_reward": 0.8035714626312256,
"rewards/format_reward": 1.0,
"step": 78,
"temporal_rewards": 0.8571428656578064
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.0,
"completion_length": 132.1607208251953,
"epoch": 0.2762237762237762,
"grad_norm": 5.777704604421436,
"kl": 0.0255126953125,
"learning_rate": 8.232598970776026e-07,
"loss": 0.001,
"reward": 1.8160715103149414,
"reward_std": 0.5395211577415466,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 1.0,
"step": 79,
"temporal_rewards": 1.0
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 235.1428680419922,
"epoch": 0.27972027972027974,
"grad_norm": 1.593925925571494,
"kl": 0.0177001953125,
"learning_rate": 8.190504226441653e-07,
"loss": 0.0007,
"reward": 1.4946428537368774,
"reward_std": 0.44739413261413574,
"rewards/accuracy_reward": 0.4285714626312256,
"rewards/format_reward": 0.9821429252624512,
"step": 80,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.0,
"all_wrong": 0.42857142857142855,
"completion_length": 262.3214416503906,
"epoch": 0.28321678321678323,
"grad_norm": 2.761975886137325,
"kl": 0.0169677734375,
"learning_rate": 8.148024515875056e-07,
"loss": 0.0007,
"reward": 1.3357144594192505,
"reward_std": 0.3343726098537445,
"rewards/accuracy_reward": 0.267857164144516,
"rewards/format_reward": 1.0,
"step": 81,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 351.4285888671875,
"epoch": 0.2867132867132867,
"grad_norm": 1.0204064062267335,
"kl": 0.01080322265625,
"learning_rate": 8.105164964678009e-07,
"loss": 0.0004,
"reward": 1.4267858266830444,
"reward_std": 0.32611167430877686,
"rewards/accuracy_reward": 0.392857164144516,
"rewards/format_reward": 0.973214328289032,
"step": 82,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 215.2857208251953,
"epoch": 0.2902097902097902,
"grad_norm": 8.030465893244708,
"kl": 0.02294921875,
"learning_rate": 8.061930744283854e-07,
"loss": 0.0009,
"reward": 1.4482142925262451,
"reward_std": 0.26826655864715576,
"rewards/accuracy_reward": 0.3571428656578064,
"rewards/format_reward": 1.0,
"step": 83,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.42857142857142855,
"completion_length": 251.44644165039062,
"epoch": 0.2937062937062937,
"grad_norm": 1.8690472432712648,
"kl": 0.0181884765625,
"learning_rate": 8.01832707133352e-07,
"loss": 0.0007,
"reward": 1.289285659790039,
"reward_std": 0.2658645808696747,
"rewards/accuracy_reward": 0.3214285969734192,
"rewards/format_reward": 0.9196429252624512,
"step": 84,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.0,
"all_wrong": 0.14285714285714285,
"completion_length": 262.2321472167969,
"epoch": 0.2972027972027972,
"grad_norm": 1.818292460461868,
"kl": 0.0166015625,
"learning_rate": 7.97435920704608e-07,
"loss": 0.0007,
"reward": 1.446428656578064,
"reward_std": 0.4745182394981384,
"rewards/accuracy_reward": 0.392857164144516,
"rewards/format_reward": 0.9821429252624512,
"step": 85,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.5714285714285714,
"all_wrong": 0.14285714285714285,
"completion_length": 226.62501525878906,
"epoch": 0.3006993006993007,
"grad_norm": 2.994767446502154,
"kl": 0.022705078125,
"learning_rate": 7.930032456583931e-07,
"loss": 0.0009,
"reward": 1.8410714864730835,
"reward_std": 0.1394950896501541,
"rewards/accuracy_reward": 0.7142857313156128,
"rewards/format_reward": 1.0,
"step": 86,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.42857142857142855,
"all_wrong": 0.2857142857142857,
"completion_length": 176.9107208251953,
"epoch": 0.3041958041958042,
"grad_norm": 1.7391609508300692,
"kl": 0.019775390625,
"learning_rate": 7.885352168412675e-07,
"loss": 0.0008,
"reward": 1.6928571462631226,
"reward_std": 0.14970263838768005,
"rewards/accuracy_reward": 0.5535714626312256,
"rewards/format_reward": 1.0,
"step": 87,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.42857142857142855,
"completion_length": 292.5714416503906,
"epoch": 0.3076923076923077,
"grad_norm": 0.8888793089998056,
"kl": 0.0208740234375,
"learning_rate": 7.840323733655778e-07,
"loss": 0.0008,
"reward": 1.4910714626312256,
"reward_std": 0.33167630434036255,
"rewards/accuracy_reward": 0.4642857313156128,
"rewards/format_reward": 0.910714328289032,
"step": 88,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 262.8214416503906,
"epoch": 0.3111888111888112,
"grad_norm": 3.3773560408418652,
"kl": 0.015625,
"learning_rate": 7.794952585444067e-07,
"loss": 0.0006,
"reward": 1.7625000476837158,
"reward_std": 0.39009442925453186,
"rewards/accuracy_reward": 0.6071428656578064,
"rewards/format_reward": 1.0,
"step": 89,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.42857142857142855,
"all_wrong": 0.0,
"completion_length": 205.73214721679688,
"epoch": 0.3146853146853147,
"grad_norm": 4.902554291103843,
"kl": 0.022216796875,
"learning_rate": 7.749244198260174e-07,
"loss": 0.0009,
"reward": 1.725000023841858,
"reward_std": 0.3247165381908417,
"rewards/accuracy_reward": 0.6071428656578064,
"rewards/format_reward": 1.0,
"step": 90,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.0,
"all_wrong": 0.0,
"completion_length": 194.50001525878906,
"epoch": 0.3181818181818182,
"grad_norm": 3.125738974344353,
"kl": 0.01953125,
"learning_rate": 7.703204087277988e-07,
"loss": 0.0008,
"reward": 1.5461667776107788,
"reward_std": 0.5128087401390076,
"rewards/accuracy_reward": 0.44795241951942444,
"rewards/format_reward": 1.0,
"step": 91,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.2857142857142857,
"completion_length": 201.3928680419922,
"epoch": 0.32167832167832167,
"grad_norm": 2.142383489890374,
"kl": 0.02490234375,
"learning_rate": 7.656837807697186e-07,
"loss": 0.001,
"reward": 1.692857265472412,
"reward_std": 0.3641102910041809,
"rewards/accuracy_reward": 0.5535714626312256,
"rewards/format_reward": 0.973214328289032,
"step": 92,
"temporal_rewards": 0.8571428656578064
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 277.25,
"epoch": 0.32517482517482516,
"grad_norm": 1.4155785147812223,
"kl": 0.017822265625,
"learning_rate": 7.610150954072952e-07,
"loss": 0.0007,
"reward": 1.5071431398391724,
"reward_std": 0.3736386001110077,
"rewards/accuracy_reward": 0.4464285969734192,
"rewards/format_reward": 0.9821429252624512,
"step": 93,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 301.4285888671875,
"epoch": 0.32867132867132864,
"grad_norm": 1.1616934363002724,
"kl": 0.0162353515625,
"learning_rate": 7.563149159640928e-07,
"loss": 0.0006,
"reward": 1.7053571939468384,
"reward_std": 0.33641549944877625,
"rewards/accuracy_reward": 0.5892857313156128,
"rewards/format_reward": 1.0,
"step": 94,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 267.21429443359375,
"epoch": 0.3321678321678322,
"grad_norm": 3.0109070395231203,
"kl": 0.0179443359375,
"learning_rate": 7.515838095637518e-07,
"loss": 0.0007,
"reward": 1.5267857313156128,
"reward_std": 0.2861282229423523,
"rewards/accuracy_reward": 0.4642857313156128,
"rewards/format_reward": 0.9910714626312256,
"step": 95,
"temporal_rewards": 0.4285714328289032
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.2857142857142857,
"completion_length": 166.55357360839844,
"epoch": 0.3356643356643357,
"grad_norm": 9.975231124520647,
"kl": 0.024658203125,
"learning_rate": 7.468223470615592e-07,
"loss": 0.001,
"reward": 1.5607143640518188,
"reward_std": 0.2182290256023407,
"rewards/accuracy_reward": 0.4642857313156128,
"rewards/format_reward": 1.0,
"step": 96,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.0,
"all_wrong": 0.42857142857142855,
"completion_length": 212.83929443359375,
"epoch": 0.33916083916083917,
"grad_norm": 1.8684746698680486,
"kl": 0.017578125,
"learning_rate": 7.420311029755687e-07,
"loss": 0.0007,
"reward": 1.283928632736206,
"reward_std": 0.3540671467781067,
"rewards/accuracy_reward": 0.2321428656578064,
"rewards/format_reward": 1.0,
"step": 97,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.0,
"all_wrong": 0.42857142857142855,
"completion_length": 342.3571472167969,
"epoch": 0.34265734265734266,
"grad_norm": 2.56805644763744,
"kl": 0.0162353515625,
"learning_rate": 7.372106554172801e-07,
"loss": 0.0007,
"reward": 1.3767857551574707,
"reward_std": 0.5145957469940186,
"rewards/accuracy_reward": 0.3750000298023224,
"rewards/format_reward": 0.910714328289032,
"step": 98,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.14285714285714285,
"completion_length": 219.7857208251953,
"epoch": 0.34615384615384615,
"grad_norm": 2.8558528617729753,
"kl": 0.019287109375,
"learning_rate": 7.323615860218842e-07,
"loss": 0.0008,
"reward": 1.6142857074737549,
"reward_std": 0.32218724489212036,
"rewards/accuracy_reward": 0.4821428656578064,
"rewards/format_reward": 1.0,
"step": 99,
"temporal_rewards": 0.8571428656578064
},
{
"all_correct": 0.0,
"all_wrong": 0.42857142857142855,
"completion_length": 349.51788330078125,
"epoch": 0.34965034965034963,
"grad_norm": 1.2509389603357361,
"kl": 0.011474609375,
"learning_rate": 7.274844798780825e-07,
"loss": 0.0005,
"reward": 1.1142858266830444,
"reward_std": 0.3019092082977295,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.9821429252624512,
"step": 100,
"temporal_rewards": 0.4285714328289032
},
{
"all_correct": 0.0,
"all_wrong": 0.2857142857142857,
"completion_length": 277.0357360839844,
"epoch": 0.3531468531468531,
"grad_norm": 2.7916868507565837,
"kl": 0.0179443359375,
"learning_rate": 7.225799254574903e-07,
"loss": 0.0007,
"reward": 1.5017857551574707,
"reward_std": 0.37953513860702515,
"rewards/accuracy_reward": 0.4107142984867096,
"rewards/format_reward": 1.0,
"step": 101,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.2857142857142857,
"completion_length": 168.60714721679688,
"epoch": 0.35664335664335667,
"grad_norm": 2.2707554849609437,
"kl": 0.02392578125,
"learning_rate": 7.176485145436324e-07,
"loss": 0.001,
"reward": 1.662500023841858,
"reward_std": 0.23105838894844055,
"rewards/accuracy_reward": 0.5178571939468384,
"rewards/format_reward": 1.0,
"step": 102,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.5714285714285714,
"completion_length": 334.26788330078125,
"epoch": 0.36013986013986016,
"grad_norm": 2.234886017179779,
"kl": 0.01904296875,
"learning_rate": 7.126908421605374e-07,
"loss": 0.0008,
"reward": 1.4928572177886963,
"reward_std": 0.18182747066020966,
"rewards/accuracy_reward": 0.392857164144516,
"rewards/format_reward": 0.9821429252624512,
"step": 103,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.42857142857142855,
"completion_length": 258.6785888671875,
"epoch": 0.36363636363636365,
"grad_norm": 1.1966647862465727,
"kl": 0.022705078125,
"learning_rate": 7.077075065009433e-07,
"loss": 0.0009,
"reward": 1.25,
"reward_std": 0.2374918907880783,
"rewards/accuracy_reward": 0.2142857313156128,
"rewards/format_reward": 0.9821429252624512,
"step": 104,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.0,
"all_wrong": 0.42857142857142855,
"completion_length": 181.96429443359375,
"epoch": 0.36713286713286714,
"grad_norm": 2.485952277761037,
"kl": 0.03173828125,
"learning_rate": 7.026991088541183e-07,
"loss": 0.0013,
"reward": 1.255357265472412,
"reward_std": 0.35542815923690796,
"rewards/accuracy_reward": 0.2321428656578064,
"rewards/format_reward": 0.9821429252624512,
"step": 105,
"temporal_rewards": 0.5
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.42857142857142855,
"completion_length": 234.71429443359375,
"epoch": 0.3706293706293706,
"grad_norm": 1.2916342200641437,
"kl": 0.0189208984375,
"learning_rate": 6.976662535333107e-07,
"loss": 0.0008,
"reward": 1.446428656578064,
"reward_std": 0.2671079635620117,
"rewards/accuracy_reward": 0.3750000298023224,
"rewards/format_reward": 0.9821429252624512,
"step": 106,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.5714285714285714,
"all_wrong": 0.0,
"completion_length": 256.9821472167969,
"epoch": 0.3741258741258741,
"grad_norm": 1.4490597935648555,
"kl": 0.0201416015625,
"learning_rate": 6.926095478028311e-07,
"loss": 0.0008,
"reward": 1.9785715341567993,
"reward_std": 0.28172439336776733,
"rewards/accuracy_reward": 0.8035714626312256,
"rewards/format_reward": 1.0,
"step": 107,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.0,
"all_wrong": 0.14285714285714285,
"completion_length": 229.75001525878906,
"epoch": 0.3776223776223776,
"grad_norm": 6.256933238343451,
"kl": 0.027587890625,
"learning_rate": 6.875296018047809e-07,
"loss": 0.0011,
"reward": 1.5000001192092896,
"reward_std": 0.5110898017883301,
"rewards/accuracy_reward": 0.4464285969734192,
"rewards/format_reward": 0.9464285969734192,
"step": 108,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.42857142857142855,
"all_wrong": 0.2857142857142857,
"completion_length": 210.96429443359375,
"epoch": 0.3811188811188811,
"grad_norm": 7.77848974566679,
"kl": 0.0250244140625,
"learning_rate": 6.824270284854317e-07,
"loss": 0.001,
"reward": 1.7500001192092896,
"reward_std": 0.21283701062202454,
"rewards/accuracy_reward": 0.6071428656578064,
"rewards/format_reward": 0.9821429252624512,
"step": 109,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 283.1607360839844,
"epoch": 0.38461538461538464,
"grad_norm": 2.5557165123350445,
"kl": 0.01953125,
"learning_rate": 6.773024435212677e-07,
"loss": 0.0008,
"reward": 1.4250000715255737,
"reward_std": 0.3038181662559509,
"rewards/accuracy_reward": 0.392857164144516,
"rewards/format_reward": 0.9821429252624512,
"step": 110,
"temporal_rewards": 0.357142835855484
},
{
"all_correct": 0.42857142857142855,
"all_wrong": 0.14285714285714285,
"completion_length": 161.0357208251953,
"epoch": 0.3881118881118881,
"grad_norm": 2.4569515405580624,
"kl": 0.030029296875,
"learning_rate": 6.721564652446987e-07,
"loss": 0.0012,
"reward": 1.8303571939468384,
"reward_std": 0.21543601155281067,
"rewards/accuracy_reward": 0.6428571939468384,
"rewards/format_reward": 1.0,
"step": 111,
"temporal_rewards": 0.9285714030265808
},
{
"all_correct": 0.42857142857142855,
"all_wrong": 0.0,
"completion_length": 245.69644165039062,
"epoch": 0.3916083916083916,
"grad_norm": 1.6931573328130867,
"kl": 0.025390625,
"learning_rate": 6.669897145694506e-07,
"loss": 0.001,
"reward": 1.798214316368103,
"reward_std": 0.2900663912296295,
"rewards/accuracy_reward": 0.6428571939468384,
"rewards/format_reward": 1.0,
"step": 112,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.0,
"completion_length": 267.26788330078125,
"epoch": 0.3951048951048951,
"grad_norm": 3.226106128346649,
"kl": 0.0194091796875,
"learning_rate": 6.618028149156478e-07,
"loss": 0.0008,
"reward": 1.7946429252624512,
"reward_std": 0.4278576970100403,
"rewards/accuracy_reward": 0.6428571939468384,
"rewards/format_reward": 1.0,
"step": 113,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.42857142857142855,
"completion_length": 214.71429443359375,
"epoch": 0.3986013986013986,
"grad_norm": 2.474466887500513,
"kl": 0.023681640625,
"learning_rate": 6.565963921345895e-07,
"loss": 0.0009,
"reward": 1.3232142925262451,
"reward_std": 0.2196773886680603,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 1.0,
"step": 114,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 206.46429443359375,
"epoch": 0.4020979020979021,
"grad_norm": 12.40427149026429,
"kl": 0.0235595703125,
"learning_rate": 6.51371074433236e-07,
"loss": 0.0009,
"reward": 1.5821430683135986,
"reward_std": 0.3218930661678314,
"rewards/accuracy_reward": 0.4821428656578064,
"rewards/format_reward": 0.9821429252624512,
"step": 115,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.42857142857142855,
"all_wrong": 0.14285714285714285,
"completion_length": 275.33929443359375,
"epoch": 0.40559440559440557,
"grad_norm": 4.132371759712955,
"kl": 0.0185546875,
"learning_rate": 6.461274922984086e-07,
"loss": 0.0007,
"reward": 1.9821428060531616,
"reward_std": 0.24592465162277222,
"rewards/accuracy_reward": 0.785714328289032,
"rewards/format_reward": 1.0,
"step": 116,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.42857142857142855,
"all_wrong": 0.0,
"completion_length": 231.98214721679688,
"epoch": 0.4090909090909091,
"grad_norm": 2.6149401846082427,
"kl": 0.020263671875,
"learning_rate": 6.408662784207149e-07,
"loss": 0.0008,
"reward": 1.9267858266830444,
"reward_std": 0.36489078402519226,
"rewards/accuracy_reward": 0.7321428656578064,
"rewards/format_reward": 0.9910714626312256,
"step": 117,
"temporal_rewards": 0.8571428656578064
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 222.0535888671875,
"epoch": 0.4125874125874126,
"grad_norm": 2.8131894560697983,
"kl": 0.018310546875,
"learning_rate": 6.355880676182085e-07,
"loss": 0.0007,
"reward": 1.48035728931427,
"reward_std": 0.3505924344062805,
"rewards/accuracy_reward": 0.4107142984867096,
"rewards/format_reward": 0.9821429252624512,
"step": 118,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.2857142857142857,
"completion_length": 333.1071472167969,
"epoch": 0.4160839160839161,
"grad_norm": 1.8222547759328855,
"kl": 0.01556396484375,
"learning_rate": 6.302934967597922e-07,
"loss": 0.0006,
"reward": 1.4839287996292114,
"reward_std": 0.27154284715652466,
"rewards/accuracy_reward": 0.4107142984867096,
"rewards/format_reward": 1.0,
"step": 119,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.0,
"all_wrong": 0.2857142857142857,
"completion_length": 315.2321472167969,
"epoch": 0.4195804195804196,
"grad_norm": 24.56012717937078,
"kl": 0.0164794921875,
"learning_rate": 6.249832046883729e-07,
"loss": 0.0007,
"reward": 1.3053572177886963,
"reward_std": 0.44285857677459717,
"rewards/accuracy_reward": 0.267857164144516,
"rewards/format_reward": 0.973214328289032,
"step": 120,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 246.4285888671875,
"epoch": 0.4230769230769231,
"grad_norm": 1.9426589659666265,
"kl": 0.0203857421875,
"learning_rate": 6.196578321437789e-07,
"loss": 0.0008,
"reward": 1.4303573369979858,
"reward_std": 0.2891465723514557,
"rewards/accuracy_reward": 0.3392857313156128,
"rewards/format_reward": 1.0,
"step": 121,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.2857142857142857,
"completion_length": 204.07144165039062,
"epoch": 0.42657342657342656,
"grad_norm": 3.0118903186149075,
"kl": 0.0186767578125,
"learning_rate": 6.143180216854486e-07,
"loss": 0.0007,
"reward": 1.571428656578064,
"reward_std": 0.24399259686470032,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.9821429252624512,
"step": 122,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.2857142857142857,
"completion_length": 162.0,
"epoch": 0.43006993006993005,
"grad_norm": 2.712916811219852,
"kl": 0.0213623046875,
"learning_rate": 6.089644176148991e-07,
"loss": 0.0009,
"reward": 1.5285714864730835,
"reward_std": 0.30187511444091797,
"rewards/accuracy_reward": 0.4285714626312256,
"rewards/format_reward": 0.9821429252624512,
"step": 123,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 456.982177734375,
"epoch": 0.43356643356643354,
"grad_norm": 0.8620894348694859,
"kl": 0.007232666015625,
"learning_rate": 6.035976658979846e-07,
"loss": 0.0003,
"reward": 1.4910714626312256,
"reward_std": 0.3568207919597626,
"rewards/accuracy_reward": 0.5535714626312256,
"rewards/format_reward": 0.8660714626312256,
"step": 124,
"temporal_rewards": 0.5
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.14285714285714285,
"completion_length": 269.625,
"epoch": 0.4370629370629371,
"grad_norm": 3.5704979471789198,
"kl": 0.0206298828125,
"learning_rate": 5.982184140869538e-07,
"loss": 0.0008,
"reward": 1.4535715579986572,
"reward_std": 0.31924429535865784,
"rewards/accuracy_reward": 0.4821428656578064,
"rewards/format_reward": 0.8750000596046448,
"step": 125,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.42857142857142855,
"completion_length": 301.7857360839844,
"epoch": 0.4405594405594406,
"grad_norm": 1.6422657775507825,
"kl": 0.0174560546875,
"learning_rate": 5.928273112423176e-07,
"loss": 0.0007,
"reward": 1.4392857551574707,
"reward_std": 0.3684910237789154,
"rewards/accuracy_reward": 0.4107142984867096,
"rewards/format_reward": 0.9375000596046448,
"step": 126,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.42857142857142855,
"all_wrong": 0.14285714285714285,
"completion_length": 263.4464416503906,
"epoch": 0.44405594405594406,
"grad_norm": 1.7374569295760933,
"kl": 0.022705078125,
"learning_rate": 5.874250078545322e-07,
"loss": 0.0009,
"reward": 1.5571428537368774,
"reward_std": 0.2348182499408722,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.9196429252624512,
"step": 127,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.0,
"completion_length": 267.7321472167969,
"epoch": 0.44755244755244755,
"grad_norm": 42.33841040347552,
"kl": 0.018310546875,
"learning_rate": 5.820121557655108e-07,
"loss": 0.0007,
"reward": 1.6982142925262451,
"reward_std": 0.4108830988407135,
"rewards/accuracy_reward": 0.5892857313156128,
"rewards/format_reward": 0.9821429252624512,
"step": 128,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.0,
"all_wrong": 0.0,
"completion_length": 234.57144165039062,
"epoch": 0.45104895104895104,
"grad_norm": 2.480074327680305,
"kl": 0.0255126953125,
"learning_rate": 5.765894080899739e-07,
"loss": 0.001,
"reward": 1.6923317909240723,
"reward_std": 0.5358164310455322,
"rewards/accuracy_reward": 0.542331874370575,
"rewards/format_reward": 0.9821429252624512,
"step": 129,
"temporal_rewards": 0.9285714030265808
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 308.1964416503906,
"epoch": 0.45454545454545453,
"grad_norm": 1.5015855581589252,
"kl": 0.0146484375,
"learning_rate": 5.711574191366427e-07,
"loss": 0.0006,
"reward": 1.7357144355773926,
"reward_std": 0.2828700840473175,
"rewards/accuracy_reward": 0.6071428656578064,
"rewards/format_reward": 0.9910714626312256,
"step": 130,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.5714285714285714,
"all_wrong": 0.14285714285714285,
"completion_length": 267.2321472167969,
"epoch": 0.458041958041958,
"grad_norm": 3.0878458835082476,
"kl": 0.0155029296875,
"learning_rate": 5.657168443292908e-07,
"loss": 0.0006,
"reward": 1.9000000953674316,
"reward_std": 0.12276037037372589,
"rewards/accuracy_reward": 0.7321428656578064,
"rewards/format_reward": 1.0,
"step": 131,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.42857142857142855,
"completion_length": 167.375,
"epoch": 0.46153846153846156,
"grad_norm": 1.9269536633290627,
"kl": 0.02783203125,
"learning_rate": 5.602683401276614e-07,
"loss": 0.0011,
"reward": 1.6267857551574707,
"reward_std": 0.1649283766746521,
"rewards/accuracy_reward": 0.4821428656578064,
"rewards/format_reward": 1.0,
"step": 132,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.14285714285714285,
"completion_length": 235.1785888671875,
"epoch": 0.46503496503496505,
"grad_norm": 6.598227061709475,
"kl": 0.018310546875,
"learning_rate": 5.548125639482586e-07,
"loss": 0.0007,
"reward": 1.5928571224212646,
"reward_std": 0.38033661246299744,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.9910714626312256,
"step": 133,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.0,
"all_wrong": 0.14285714285714285,
"completion_length": 308.39288330078125,
"epoch": 0.46853146853146854,
"grad_norm": 2.195511154443261,
"kl": 0.0145263671875,
"learning_rate": 5.493501740850227e-07,
"loss": 0.0006,
"reward": 1.5334078073501587,
"reward_std": 0.3899558186531067,
"rewards/accuracy_reward": 0.4476935863494873,
"rewards/format_reward": 1.0,
"step": 134,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.42857142857142855,
"all_wrong": 0.14285714285714285,
"completion_length": 321.7321472167969,
"epoch": 0.47202797202797203,
"grad_norm": 0.5731888444733847,
"kl": 0.017822265625,
"learning_rate": 5.438818296299015e-07,
"loss": 0.0007,
"reward": 1.782142996788025,
"reward_std": 0.2987748980522156,
"rewards/accuracy_reward": 0.660714328289032,
"rewards/format_reward": 0.9642857313156128,
"step": 135,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.0,
"completion_length": 356.2500305175781,
"epoch": 0.4755244755244755,
"grad_norm": 2.1615424381726127,
"kl": 0.01324462890625,
"learning_rate": 5.384081903933234e-07,
"loss": 0.0005,
"reward": 1.582142949104309,
"reward_std": 0.4733910858631134,
"rewards/accuracy_reward": 0.535714328289032,
"rewards/format_reward": 0.9464285969734192,
"step": 136,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.0,
"all_wrong": 0.42857142857142855,
"completion_length": 298.8571472167969,
"epoch": 0.479020979020979,
"grad_norm": 2.9906630275434645,
"kl": 0.0159912109375,
"learning_rate": 5.329299168245856e-07,
"loss": 0.0006,
"reward": 1.1678571701049805,
"reward_std": 0.38480064272880554,
"rewards/accuracy_reward": 0.1785714328289032,
"rewards/format_reward": 0.9464285969734192,
"step": 137,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.0,
"completion_length": 288.1071472167969,
"epoch": 0.4825174825174825,
"grad_norm": 2.8507410343947908,
"kl": 0.0174560546875,
"learning_rate": 5.274476699321637e-07,
"loss": 0.0007,
"reward": 1.9285714626312256,
"reward_std": 0.4282780587673187,
"rewards/accuracy_reward": 0.785714328289032,
"rewards/format_reward": 0.9821429252624512,
"step": 138,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.42857142857142855,
"all_wrong": 0.0,
"completion_length": 153.875,
"epoch": 0.486013986013986,
"grad_norm": 10.325838545352347,
"kl": 0.0242919921875,
"learning_rate": 5.219621112039543e-07,
"loss": 0.001,
"reward": 1.6428571939468384,
"reward_std": 0.24190634489059448,
"rewards/accuracy_reward": 0.5178571939468384,
"rewards/format_reward": 0.9910714626312256,
"step": 139,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.0,
"completion_length": 139.5,
"epoch": 0.48951048951048953,
"grad_norm": 2.9463926068589656,
"kl": 0.02294921875,
"learning_rate": 5.164739025274604e-07,
"loss": 0.0009,
"reward": 1.7178571224212646,
"reward_std": 0.3234609365463257,
"rewards/accuracy_reward": 0.5892857313156128,
"rewards/format_reward": 1.0,
"step": 140,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.5714285714285714,
"all_wrong": 0.0,
"completion_length": 146.4107208251953,
"epoch": 0.493006993006993,
"grad_norm": 4.320723121584521,
"kl": 0.028564453125,
"learning_rate": 5.109837061099273e-07,
"loss": 0.0011,
"reward": 2.0714287757873535,
"reward_std": 0.28830307722091675,
"rewards/accuracy_reward": 0.8214285969734192,
"rewards/format_reward": 1.0,
"step": 141,
"temporal_rewards": 1.0
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.14285714285714285,
"completion_length": 342.6607360839844,
"epoch": 0.4965034965034965,
"grad_norm": 1.4042609008851548,
"kl": 0.0169677734375,
"learning_rate": 5.054921843984417e-07,
"loss": 0.0007,
"reward": 1.3928571939468384,
"reward_std": 0.36577218770980835,
"rewards/accuracy_reward": 0.4107142984867096,
"rewards/format_reward": 0.8750000596046448,
"step": 142,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.0,
"all_wrong": 0.42857142857142855,
"completion_length": 230.83929443359375,
"epoch": 0.5,
"grad_norm": 2.760203392117073,
"kl": 0.018310546875,
"learning_rate": 5e-07,
"loss": 0.0007,
"reward": 1.3535715341567993,
"reward_std": 0.39483919739723206,
"rewards/accuracy_reward": 0.3035714328289032,
"rewards/format_reward": 0.9642857313156128,
"step": 143,
"temporal_rewards": 0.8571428656578064
},
{
"all_correct": 0.0,
"all_wrong": 0.14285714285714285,
"completion_length": 174.50001525878906,
"epoch": 0.5034965034965035,
"grad_norm": 3.445363780486131,
"kl": 0.02294921875,
"learning_rate": 4.945078156015581e-07,
"loss": 0.0009,
"reward": 1.5750000476837158,
"reward_std": 0.4295726716518402,
"rewards/accuracy_reward": 0.4464285969734192,
"rewards/format_reward": 1.0,
"step": 144,
"temporal_rewards": 0.9285714030265808
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.14285714285714285,
"completion_length": 324.625,
"epoch": 0.506993006993007,
"grad_norm": 2.070258986628781,
"kl": 0.0155029296875,
"learning_rate": 4.890162938900726e-07,
"loss": 0.0006,
"reward": 1.6482144594192505,
"reward_std": 0.47856929898262024,
"rewards/accuracy_reward": 0.5892857313156128,
"rewards/format_reward": 0.9285714626312256,
"step": 145,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.42857142857142855,
"completion_length": 326.1071472167969,
"epoch": 0.5104895104895105,
"grad_norm": 2.8351227359716944,
"kl": 0.01318359375,
"learning_rate": 4.835260974725397e-07,
"loss": 0.0005,
"reward": 1.394642949104309,
"reward_std": 0.23566605150699615,
"rewards/accuracy_reward": 0.4464285969734192,
"rewards/format_reward": 0.8660714626312256,
"step": 146,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 303.58929443359375,
"epoch": 0.513986013986014,
"grad_norm": 4.877396417730794,
"kl": 0.012939453125,
"learning_rate": 4.780378887960458e-07,
"loss": 0.0005,
"reward": 1.5142858028411865,
"reward_std": 0.27789801359176636,
"rewards/accuracy_reward": 0.4107142984867096,
"rewards/format_reward": 0.9910714626312256,
"step": 147,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.42857142857142855,
"completion_length": 304.625,
"epoch": 0.5174825174825175,
"grad_norm": 1.7202582205652632,
"kl": 0.0142822265625,
"learning_rate": 4.7255233006783624e-07,
"loss": 0.0006,
"reward": 1.5250000953674316,
"reward_std": 0.3306018114089966,
"rewards/accuracy_reward": 0.4285714626312256,
"rewards/format_reward": 0.9642857313156128,
"step": 148,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 360.0000305175781,
"epoch": 0.5209790209790209,
"grad_norm": 2.0232478358662336,
"kl": 0.0118408203125,
"learning_rate": 4.6707008317541443e-07,
"loss": 0.0005,
"reward": 1.2178571224212646,
"reward_std": 0.34071677923202515,
"rewards/accuracy_reward": 0.3214285969734192,
"rewards/format_reward": 0.8125000596046448,
"step": 149,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.0,
"all_wrong": 0.8571428571428571,
"completion_length": 356.6250305175781,
"epoch": 0.5244755244755245,
"grad_norm": 2.1985564262725608,
"kl": 0.01324462890625,
"learning_rate": 4.6159180960667654e-07,
"loss": 0.0005,
"reward": 1.0017857551574707,
"reward_std": 0.11616755276918411,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.8392857313156128,
"step": 150,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.42857142857142855,
"completion_length": 255.3035888671875,
"epoch": 0.527972027972028,
"grad_norm": 2.4205533677608644,
"kl": 0.017578125,
"learning_rate": 4.561181703700985e-07,
"loss": 0.0007,
"reward": 1.6035714149475098,
"reward_std": 0.16177618503570557,
"rewards/accuracy_reward": 0.4642857313156128,
"rewards/format_reward": 1.0,
"step": 151,
"temporal_rewards": 0.8571428656578064
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.42857142857142855,
"completion_length": 285.0,
"epoch": 0.5314685314685315,
"grad_norm": 2.418272976416168,
"kl": 0.0120849609375,
"learning_rate": 4.506498259149773e-07,
"loss": 0.0005,
"reward": 1.4500000476837158,
"reward_std": 0.3321867883205414,
"rewards/accuracy_reward": 0.4285714626312256,
"rewards/format_reward": 0.9375000596046448,
"step": 152,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 260.5535888671875,
"epoch": 0.534965034965035,
"grad_norm": 2.658673471345604,
"kl": 0.017822265625,
"learning_rate": 4.451874360517413e-07,
"loss": 0.0007,
"reward": 1.5267857313156128,
"reward_std": 0.38476046919822693,
"rewards/accuracy_reward": 0.5178571939468384,
"rewards/format_reward": 0.8750000596046448,
"step": 153,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.42857142857142855,
"completion_length": 349.51788330078125,
"epoch": 0.5384615384615384,
"grad_norm": 0.9636631029145603,
"kl": 0.01129150390625,
"learning_rate": 4.397316598723385e-07,
"loss": 0.0005,
"reward": 1.4875000715255737,
"reward_std": 0.19214513897895813,
"rewards/accuracy_reward": 0.392857164144516,
"rewards/format_reward": 1.0,
"step": 154,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.42857142857142855,
"completion_length": 331.7857360839844,
"epoch": 0.541958041958042,
"grad_norm": 1.0144129009185325,
"kl": 0.01190185546875,
"learning_rate": 4.3428315567070923e-07,
"loss": 0.0005,
"reward": 1.5767858028411865,
"reward_std": 0.20042386651039124,
"rewards/accuracy_reward": 0.4642857313156128,
"rewards/format_reward": 0.9821429252624512,
"step": 155,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.0,
"completion_length": 151.5178680419922,
"epoch": 0.5454545454545454,
"grad_norm": 7.232186073668779,
"kl": 0.0198974609375,
"learning_rate": 4.2884258086335745e-07,
"loss": 0.0008,
"reward": 1.8571429252624512,
"reward_std": 0.4870792329311371,
"rewards/accuracy_reward": 0.6964285969734192,
"rewards/format_reward": 1.0,
"step": 156,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.5714285714285714,
"all_wrong": 0.2857142857142857,
"completion_length": 271.4107360839844,
"epoch": 0.548951048951049,
"grad_norm": 0.5118344272065345,
"kl": 0.016845703125,
"learning_rate": 4.234105919100261e-07,
"loss": 0.0007,
"reward": 1.6857142448425293,
"reward_std": 0.14707830548286438,
"rewards/accuracy_reward": 0.6071428656578064,
"rewards/format_reward": 0.9464285969734192,
"step": 157,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.14285714285714285,
"completion_length": 227.48214721679688,
"epoch": 0.5524475524475524,
"grad_norm": 1.9067107696166985,
"kl": 0.016845703125,
"learning_rate": 4.179878442344892e-07,
"loss": 0.0007,
"reward": 1.8517857789993286,
"reward_std": 0.2894127070903778,
"rewards/accuracy_reward": 0.6785714626312256,
"rewards/format_reward": 1.0,
"step": 158,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.0,
"all_wrong": 0.42857142857142855,
"completion_length": 238.62501525878906,
"epoch": 0.5559440559440559,
"grad_norm": 2.9066876939888244,
"kl": 0.017333984375,
"learning_rate": 4.1257499214546785e-07,
"loss": 0.0007,
"reward": 1.1750000715255737,
"reward_std": 0.3523494303226471,
"rewards/accuracy_reward": 0.1785714328289032,
"rewards/format_reward": 0.9642857313156128,
"step": 159,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 272.1607360839844,
"epoch": 0.5594405594405595,
"grad_norm": 1.9955742394796372,
"kl": 0.01531982421875,
"learning_rate": 4.071726887576822e-07,
"loss": 0.0006,
"reward": 1.5125000476837158,
"reward_std": 0.4271492063999176,
"rewards/accuracy_reward": 0.4464285969734192,
"rewards/format_reward": 0.9285714626312256,
"step": 160,
"temporal_rewards": 0.8571428656578064
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.2857142857142857,
"completion_length": 272.08929443359375,
"epoch": 0.5629370629370629,
"grad_norm": 1.3449257046018408,
"kl": 0.015380859375,
"learning_rate": 4.017815859130461e-07,
"loss": 0.0006,
"reward": 1.5839287042617798,
"reward_std": 0.223629429936409,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.9910714626312256,
"step": 161,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.0,
"all_wrong": 0.2857142857142857,
"completion_length": 328.3214416503906,
"epoch": 0.5664335664335665,
"grad_norm": 2.3894294160789715,
"kl": 0.0140380859375,
"learning_rate": 3.964023341020155e-07,
"loss": 0.0006,
"reward": 1.2482143640518188,
"reward_std": 0.3738020062446594,
"rewards/accuracy_reward": 0.2321428656578064,
"rewards/format_reward": 0.9821429252624512,
"step": 162,
"temporal_rewards": 0.5
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.42857142857142855,
"completion_length": 195.17857360839844,
"epoch": 0.5699300699300699,
"grad_norm": 2.3389942848580025,
"kl": 0.019287109375,
"learning_rate": 3.9103558238510083e-07,
"loss": 0.0008,
"reward": 1.4928572177886963,
"reward_std": 0.3350135385990143,
"rewards/accuracy_reward": 0.392857164144516,
"rewards/format_reward": 0.9821429252624512,
"step": 163,
"temporal_rewards": 0.9285714030265808
},
{
"all_correct": 0.5714285714285714,
"all_wrong": 0.2857142857142857,
"completion_length": 288.26788330078125,
"epoch": 0.5734265734265734,
"grad_norm": 1.4855251481569927,
"kl": 0.01519775390625,
"learning_rate": 3.856819783145514e-07,
"loss": 0.0006,
"reward": 1.7053571939468384,
"reward_std": 0.15428219735622406,
"rewards/accuracy_reward": 0.5892857313156128,
"rewards/format_reward": 0.9821429252624512,
"step": 164,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.42857142857142855,
"completion_length": 309.4821472167969,
"epoch": 0.5769230769230769,
"grad_norm": 2.75844544825569,
"kl": 0.01434326171875,
"learning_rate": 3.8034216785622125e-07,
"loss": 0.0006,
"reward": 1.3563262224197388,
"reward_std": 0.26625069975852966,
"rewards/accuracy_reward": 0.3259689211845398,
"rewards/format_reward": 0.9285714626312256,
"step": 165,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 321.625,
"epoch": 0.5804195804195804,
"grad_norm": 1.3890774295899448,
"kl": 0.01361083984375,
"learning_rate": 3.750167953116272e-07,
"loss": 0.0005,
"reward": 1.692857265472412,
"reward_std": 0.4047864079475403,
"rewards/accuracy_reward": 0.5892857313156128,
"rewards/format_reward": 0.9553571939468384,
"step": 166,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.42857142857142855,
"all_wrong": 0.2857142857142857,
"completion_length": 238.19644165039062,
"epoch": 0.583916083916084,
"grad_norm": 1.2961019229402087,
"kl": 0.015869140625,
"learning_rate": 3.697065032402078e-07,
"loss": 0.0006,
"reward": 1.7660715579986572,
"reward_std": 0.21566565334796906,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.9821429252624512,
"step": 167,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 202.7678680419922,
"epoch": 0.5874125874125874,
"grad_norm": 4.597615166391679,
"kl": 0.0169677734375,
"learning_rate": 3.6441193238179146e-07,
"loss": 0.0007,
"reward": 1.4142857789993286,
"reward_std": 0.26091986894607544,
"rewards/accuracy_reward": 0.3392857313156128,
"rewards/format_reward": 1.0,
"step": 168,
"temporal_rewards": 0.5
},
{
"all_correct": 0.42857142857142855,
"all_wrong": 0.2857142857142857,
"completion_length": 210.48214721679688,
"epoch": 0.5909090909090909,
"grad_norm": 1.624305794360812,
"kl": 0.0164794921875,
"learning_rate": 3.591337215792851e-07,
"loss": 0.0007,
"reward": 1.8267858028411865,
"reward_std": 0.1543826460838318,
"rewards/accuracy_reward": 0.6428571939468384,
"rewards/format_reward": 1.0,
"step": 169,
"temporal_rewards": 0.8571428656578064
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.5714285714285714,
"completion_length": 354.46429443359375,
"epoch": 0.5944055944055944,
"grad_norm": 1.4548678641257315,
"kl": 0.01190185546875,
"learning_rate": 3.538725077015915e-07,
"loss": 0.0005,
"reward": 1.1678571701049805,
"reward_std": 0.22238534688949585,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.848214328289032,
"step": 170,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.14285714285714285,
"completion_length": 278.8214416503906,
"epoch": 0.5979020979020979,
"grad_norm": 1.1660346476914807,
"kl": 0.01434326171875,
"learning_rate": 3.486289255667639e-07,
"loss": 0.0006,
"reward": 1.6160714626312256,
"reward_std": 0.2827339172363281,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.9910714626312256,
"step": 171,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.2857142857142857,
"completion_length": 244.5357208251953,
"epoch": 0.6013986013986014,
"grad_norm": 5.594264279100422,
"kl": 0.0164794921875,
"learning_rate": 3.434036078654106e-07,
"loss": 0.0007,
"reward": 1.5910714864730835,
"reward_std": 0.23809079825878143,
"rewards/accuracy_reward": 0.4821428656578064,
"rewards/format_reward": 0.9910714626312256,
"step": 172,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.5714285714285714,
"completion_length": 256.6071472167969,
"epoch": 0.6048951048951049,
"grad_norm": 1.3773808667435987,
"kl": 0.012451171875,
"learning_rate": 3.3819718508435226e-07,
"loss": 0.0005,
"reward": 1.355357050895691,
"reward_std": 0.2153625637292862,
"rewards/accuracy_reward": 0.3035714328289032,
"rewards/format_reward": 0.9821429252624512,
"step": 173,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.42857142857142855,
"completion_length": 281.6607360839844,
"epoch": 0.6083916083916084,
"grad_norm": 1.8959658874179899,
"kl": 0.014404296875,
"learning_rate": 3.330102854305493e-07,
"loss": 0.0006,
"reward": 1.4553571939468384,
"reward_std": 0.22538065910339355,
"rewards/accuracy_reward": 0.392857164144516,
"rewards/format_reward": 1.0,
"step": 174,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 269.875,
"epoch": 0.6118881118881119,
"grad_norm": 2.2981475604340047,
"kl": 0.0166015625,
"learning_rate": 3.2784353475530135e-07,
"loss": 0.0007,
"reward": 1.469642996788025,
"reward_std": 0.4389226734638214,
"rewards/accuracy_reward": 0.392857164144516,
"rewards/format_reward": 0.9821429252624512,
"step": 175,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.0,
"all_wrong": 0.42857142857142855,
"completion_length": 214.7678680419922,
"epoch": 0.6153846153846154,
"grad_norm": 2.5195555976201436,
"kl": 0.02001953125,
"learning_rate": 3.2269755647873214e-07,
"loss": 0.0008,
"reward": 1.3071428537368774,
"reward_std": 0.42797765135765076,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.9821429252624512,
"step": 176,
"temporal_rewards": 0.8571428656578064
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.14285714285714285,
"completion_length": 233.33929443359375,
"epoch": 0.6188811188811189,
"grad_norm": 2.055673511375122,
"kl": 0.01434326171875,
"learning_rate": 3.175729715145684e-07,
"loss": 0.0006,
"reward": 1.7625000476837158,
"reward_std": 0.3995477557182312,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.9821429252624512,
"step": 177,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 280.89288330078125,
"epoch": 0.6223776223776224,
"grad_norm": 1.6236247273765754,
"kl": 0.0162353515625,
"learning_rate": 3.12470398195219e-07,
"loss": 0.0006,
"reward": 1.4589285850524902,
"reward_std": 0.32778358459472656,
"rewards/accuracy_reward": 0.392857164144516,
"rewards/format_reward": 0.9821429252624512,
"step": 178,
"temporal_rewards": 0.5
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.42857142857142855,
"completion_length": 304.58929443359375,
"epoch": 0.6258741258741258,
"grad_norm": 1.7193136176143193,
"kl": 0.01519775390625,
"learning_rate": 3.0739045219716884e-07,
"loss": 0.0006,
"reward": 1.4285714626312256,
"reward_std": 0.2324400097131729,
"rewards/accuracy_reward": 0.4107142984867096,
"rewards/format_reward": 0.9642857313156128,
"step": 179,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 300.8214416503906,
"epoch": 0.6293706293706294,
"grad_norm": 4.499489721815655,
"kl": 0.01348876953125,
"learning_rate": 3.023337464666893e-07,
"loss": 0.0005,
"reward": 1.4296077489852905,
"reward_std": 0.2895960509777069,
"rewards/accuracy_reward": 0.3421076536178589,
"rewards/format_reward": 1.0,
"step": 180,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 259.3571472167969,
"epoch": 0.6328671328671329,
"grad_norm": 2.1837077874780966,
"kl": 0.0157470703125,
"learning_rate": 2.9730089114588157e-07,
"loss": 0.0006,
"reward": 1.3535715341567993,
"reward_std": 0.37215283513069153,
"rewards/accuracy_reward": 0.3214285969734192,
"rewards/format_reward": 0.9642857313156128,
"step": 181,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.42857142857142855,
"completion_length": 199.46429443359375,
"epoch": 0.6363636363636364,
"grad_norm": 1.3553227080052574,
"kl": 0.0220947265625,
"learning_rate": 2.922924934990568e-07,
"loss": 0.0009,
"reward": 1.4964287281036377,
"reward_std": 0.11663764715194702,
"rewards/accuracy_reward": 0.4107142984867096,
"rewards/format_reward": 1.0,
"step": 182,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.42857142857142855,
"all_wrong": 0.14285714285714285,
"completion_length": 174.82144165039062,
"epoch": 0.6398601398601399,
"grad_norm": 1.904515168520845,
"kl": 0.020751953125,
"learning_rate": 2.873091578394626e-07,
"loss": 0.0008,
"reward": 1.7767858505249023,
"reward_std": 0.21223808825016022,
"rewards/accuracy_reward": 0.6071428656578064,
"rewards/format_reward": 1.0,
"step": 183,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.0,
"completion_length": 209.21429443359375,
"epoch": 0.6433566433566433,
"grad_norm": 5.4284279958225605,
"kl": 0.0191650390625,
"learning_rate": 2.823514854563677e-07,
"loss": 0.0008,
"reward": 1.8017858266830444,
"reward_std": 0.5190714597702026,
"rewards/accuracy_reward": 0.6428571939468384,
"rewards/format_reward": 1.0,
"step": 184,
"temporal_rewards": 0.8571428656578064
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.42857142857142855,
"completion_length": 232.5357208251953,
"epoch": 0.6468531468531469,
"grad_norm": 2.3909656732286817,
"kl": 0.015625,
"learning_rate": 2.774200745425096e-07,
"loss": 0.0006,
"reward": 1.3982144594192505,
"reward_std": 0.2684382200241089,
"rewards/accuracy_reward": 0.3035714328289032,
"rewards/format_reward": 1.0,
"step": 185,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.0,
"all_wrong": 0.8571428571428571,
"completion_length": 324.26788330078125,
"epoch": 0.6503496503496503,
"grad_norm": 1.1467278777382235,
"kl": 0.0130615234375,
"learning_rate": 2.725155201219176e-07,
"loss": 0.0005,
"reward": 1.1267857551574707,
"reward_std": 0.1317899227142334,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.9642857313156128,
"step": 186,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.42857142857142855,
"completion_length": 362.7500305175781,
"epoch": 0.6538461538461539,
"grad_norm": 1.467850477853026,
"kl": 0.01495361328125,
"learning_rate": 2.676384139781157e-07,
"loss": 0.0006,
"reward": 1.289285659790039,
"reward_std": 0.2565459609031677,
"rewards/accuracy_reward": 0.267857164144516,
"rewards/format_reward": 0.9821429252624512,
"step": 187,
"temporal_rewards": 0.5
},
{
"all_correct": 0.42857142857142855,
"all_wrong": 0.2857142857142857,
"completion_length": 221.21429443359375,
"epoch": 0.6573426573426573,
"grad_norm": 1.8678333900967852,
"kl": 0.0218505859375,
"learning_rate": 2.6278934458271996e-07,
"loss": 0.0009,
"reward": 1.6946427822113037,
"reward_std": 0.2205595076084137,
"rewards/accuracy_reward": 0.5714285969734192,
"rewards/format_reward": 0.9821429252624512,
"step": 188,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.14285714285714285,
"completion_length": 252.19644165039062,
"epoch": 0.6608391608391608,
"grad_norm": 3.030907404699144,
"kl": 0.0157470703125,
"learning_rate": 2.5796889702443123e-07,
"loss": 0.0006,
"reward": 1.7250001430511475,
"reward_std": 0.3633832633495331,
"rewards/accuracy_reward": 0.6071428656578064,
"rewards/format_reward": 0.9821429252624512,
"step": 189,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 197.0357208251953,
"epoch": 0.6643356643356644,
"grad_norm": 2.934544973257425,
"kl": 0.019775390625,
"learning_rate": 2.5317765293844067e-07,
"loss": 0.0008,
"reward": 1.787500023841858,
"reward_std": 0.276123046875,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 1.0,
"step": 190,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.42857142857142855,
"completion_length": 308.96429443359375,
"epoch": 0.6678321678321678,
"grad_norm": 2.9496641260720375,
"kl": 0.01336669921875,
"learning_rate": 2.4841619043624806e-07,
"loss": 0.0005,
"reward": 1.412500023841858,
"reward_std": 0.34019771218299866,
"rewards/accuracy_reward": 0.3750000298023224,
"rewards/format_reward": 0.9464285969734192,
"step": 191,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.42857142857142855,
"completion_length": 354.2321472167969,
"epoch": 0.6713286713286714,
"grad_norm": 0.8446216299416934,
"kl": 0.01226806640625,
"learning_rate": 2.4368508403590725e-07,
"loss": 0.0005,
"reward": 1.4000000953674316,
"reward_std": 0.24233335256576538,
"rewards/accuracy_reward": 0.3571428656578064,
"rewards/format_reward": 0.9821429252624512,
"step": 192,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.2857142857142857,
"completion_length": 205.23214721679688,
"epoch": 0.6748251748251748,
"grad_norm": 3.9294557546039433,
"kl": 0.0166015625,
"learning_rate": 2.389849045927049e-07,
"loss": 0.0007,
"reward": 1.7482143640518188,
"reward_std": 0.2757527232170105,
"rewards/accuracy_reward": 0.5892857313156128,
"rewards/format_reward": 0.9910714626312256,
"step": 193,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 243.12501525878906,
"epoch": 0.6783216783216783,
"grad_norm": 2.2731968890998067,
"kl": 0.021240234375,
"learning_rate": 2.3431621923028144e-07,
"loss": 0.0008,
"reward": 1.7178571224212646,
"reward_std": 0.34315115213394165,
"rewards/accuracy_reward": 0.5714285969734192,
"rewards/format_reward": 0.9910714626312256,
"step": 194,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.42857142857142855,
"completion_length": 207.9107208251953,
"epoch": 0.6818181818181818,
"grad_norm": 2.027560604023802,
"kl": 0.0181884765625,
"learning_rate": 2.2967959127220137e-07,
"loss": 0.0007,
"reward": 1.4964287281036377,
"reward_std": 0.24646392464637756,
"rewards/accuracy_reward": 0.392857164144516,
"rewards/format_reward": 1.0,
"step": 195,
"temporal_rewards": 0.8571428656578064
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.0,
"completion_length": 187.82144165039062,
"epoch": 0.6853146853146853,
"grad_norm": 3.738312976747544,
"kl": 0.020263671875,
"learning_rate": 2.250755801739826e-07,
"loss": 0.0008,
"reward": 1.826785683631897,
"reward_std": 0.49625372886657715,
"rewards/accuracy_reward": 0.660714328289032,
"rewards/format_reward": 0.973214328289032,
"step": 196,
"temporal_rewards": 0.9285714030265808
},
{
"all_correct": 0.0,
"all_wrong": 0.2857142857142857,
"completion_length": 218.05357360839844,
"epoch": 0.6888111888111889,
"grad_norm": 6.936122875948517,
"kl": 0.021240234375,
"learning_rate": 2.2050474145559323e-07,
"loss": 0.0008,
"reward": 1.3642857074737549,
"reward_std": 0.46779900789260864,
"rewards/accuracy_reward": 0.3214285969734192,
"rewards/format_reward": 0.9642857313156128,
"step": 197,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.0,
"all_wrong": 0.7142857142857143,
"completion_length": 246.4285888671875,
"epoch": 0.6923076923076923,
"grad_norm": 5.92965203555489,
"kl": 0.027587890625,
"learning_rate": 2.1596762663442213e-07,
"loss": 0.0011,
"reward": 1.2428572177886963,
"reward_std": 0.23482269048690796,
"rewards/accuracy_reward": 0.2321428656578064,
"rewards/format_reward": 0.9553571939468384,
"step": 198,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.0,
"all_wrong": 0.2857142857142857,
"completion_length": 167.30357360839844,
"epoch": 0.6958041958041958,
"grad_norm": 9.899413362253103,
"kl": 0.0206298828125,
"learning_rate": 2.1146478315873233e-07,
"loss": 0.0008,
"reward": 1.4642857313156128,
"reward_std": 0.4399777352809906,
"rewards/accuracy_reward": 0.3571428656578064,
"rewards/format_reward": 1.0,
"step": 199,
"temporal_rewards": 0.9285714030265808
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.14285714285714285,
"completion_length": 179.67857360839844,
"epoch": 0.6993006993006993,
"grad_norm": 2.965286701962895,
"kl": 0.0235595703125,
"learning_rate": 2.0699675434160695e-07,
"loss": 0.0009,
"reward": 1.7607142925262451,
"reward_std": 0.328709214925766,
"rewards/accuracy_reward": 0.5892857313156128,
"rewards/format_reward": 1.0,
"step": 200,
"temporal_rewards": 0.7857142686843872
}
],
"logging_steps": 1.0,
"max_steps": 286,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}