| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.6993006993006993, |
| "eval_steps": 500, |
| "global_step": 200, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.7142857142857143, |
| "completion_length": 310.1607360839844, |
| "epoch": 0.0034965034965034965, |
| "grad_norm": 2.380299597679801, |
| "kl": 0.0, |
| "learning_rate": 9.999698350006064e-07, |
| "loss": -0.0, |
| "reward": 0.7892857193946838, |
| "reward_std": 0.2723116874694824, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.4642857313156128, |
| "step": 1, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 327.83929443359375, |
| "epoch": 0.006993006993006993, |
| "grad_norm": 1.9019752792945082, |
| "kl": 0.000507354736328125, |
| "learning_rate": 9.99879343642134e-07, |
| "loss": 0.0, |
| "reward": 0.8178572058677673, |
| "reward_std": 0.5231723785400391, |
| "rewards/accuracy_reward": 0.267857164144516, |
| "rewards/format_reward": 0.5178571939468384, |
| "step": 2, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.0, |
| "completion_length": 202.2678680419922, |
| "epoch": 0.01048951048951049, |
| "grad_norm": 4.450412510480093, |
| "kl": 0.00102996826171875, |
| "learning_rate": 9.997285368432701e-07, |
| "loss": 0.0, |
| "reward": 1.4160715341567993, |
| "reward_std": 0.5284246206283569, |
| "rewards/accuracy_reward": 0.6964285969734192, |
| "rewards/format_reward": 0.5892857313156128, |
| "step": 3, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 270.08929443359375, |
| "epoch": 0.013986013986013986, |
| "grad_norm": 2.315112630577657, |
| "kl": 0.00173187255859375, |
| "learning_rate": 9.99517432800363e-07, |
| "loss": 0.0001, |
| "reward": 0.8560110330581665, |
| "reward_std": 0.5568417906761169, |
| "rewards/accuracy_reward": 0.300653874874115, |
| "rewards/format_reward": 0.4910714626312256, |
| "step": 4, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.0, |
| "completion_length": 188.57144165039062, |
| "epoch": 0.017482517482517484, |
| "grad_norm": 2.724486071626872, |
| "kl": 0.0050048828125, |
| "learning_rate": 9.992460569852254e-07, |
| "loss": 0.0002, |
| "reward": 1.4839285612106323, |
| "reward_std": 0.5056161880493164, |
| "rewards/accuracy_reward": 0.5535714626312256, |
| "rewards/format_reward": 0.7767857313156128, |
| "step": 5, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.0, |
| "completion_length": 219.7678680419922, |
| "epoch": 0.02097902097902098, |
| "grad_norm": 2.8351172787613836, |
| "kl": 0.00860595703125, |
| "learning_rate": 9.989144421420628e-07, |
| "loss": 0.0003, |
| "reward": 1.4946428537368774, |
| "reward_std": 0.729103147983551, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.8660714626312256, |
| "step": 6, |
| "temporal_rewards": 0.8571428656578064 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 293.9107360839844, |
| "epoch": 0.024475524475524476, |
| "grad_norm": 2.0246275476434126, |
| "kl": 0.00592041015625, |
| "learning_rate": 9.985226282835216e-07, |
| "loss": 0.0002, |
| "reward": 1.1946429014205933, |
| "reward_std": 0.29120072722435, |
| "rewards/accuracy_reward": 0.267857164144516, |
| "rewards/format_reward": 0.8660714626312256, |
| "step": 7, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 220.85714721679688, |
| "epoch": 0.027972027972027972, |
| "grad_norm": 7.642786587891259, |
| "kl": 0.00732421875, |
| "learning_rate": 9.980706626858607e-07, |
| "loss": 0.0003, |
| "reward": 1.4571430683135986, |
| "reward_std": 0.3087652921676636, |
| "rewards/accuracy_reward": 0.3750000298023224, |
| "rewards/format_reward": 0.9910714626312256, |
| "step": 8, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 319.51788330078125, |
| "epoch": 0.03146853146853147, |
| "grad_norm": 0.9767130753502494, |
| "kl": 0.00543212890625, |
| "learning_rate": 9.975585998832479e-07, |
| "loss": 0.0002, |
| "reward": 1.462499976158142, |
| "reward_std": 0.24715806543827057, |
| "rewards/accuracy_reward": 0.4464285969734192, |
| "rewards/format_reward": 0.9196429252624512, |
| "step": 9, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 212.07144165039062, |
| "epoch": 0.03496503496503497, |
| "grad_norm": 4.547473584939765, |
| "kl": 0.01116943359375, |
| "learning_rate": 9.9698650166118e-07, |
| "loss": 0.0004, |
| "reward": 1.4553571939468384, |
| "reward_std": 0.4356800317764282, |
| "rewards/accuracy_reward": 0.4285714626312256, |
| "rewards/format_reward": 0.9464285969734192, |
| "step": 10, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 240.62501525878906, |
| "epoch": 0.038461538461538464, |
| "grad_norm": 0.9309831321101983, |
| "kl": 0.0140380859375, |
| "learning_rate": 9.963544370490268e-07, |
| "loss": 0.0006, |
| "reward": 1.567857265472412, |
| "reward_std": 0.1529129594564438, |
| "rewards/accuracy_reward": 0.535714328289032, |
| "rewards/format_reward": 0.9464285969734192, |
| "step": 11, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 183.42857360839844, |
| "epoch": 0.04195804195804196, |
| "grad_norm": 2.703233395202908, |
| "kl": 0.01611328125, |
| "learning_rate": 9.956624823117034e-07, |
| "loss": 0.0006, |
| "reward": 1.5553573369979858, |
| "reward_std": 0.48100805282592773, |
| "rewards/accuracy_reward": 0.535714328289032, |
| "rewards/format_reward": 0.9196429252624512, |
| "step": 12, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 282.0, |
| "epoch": 0.045454545454545456, |
| "grad_norm": 4.458836776433339, |
| "kl": 0.0230712890625, |
| "learning_rate": 9.949107209404663e-07, |
| "loss": 0.0009, |
| "reward": 1.3767857551574707, |
| "reward_std": 0.49624696373939514, |
| "rewards/accuracy_reward": 0.3571428656578064, |
| "rewards/format_reward": 0.9285714626312256, |
| "step": 13, |
| "temporal_rewards": 0.8571428656578064 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 194.57144165039062, |
| "epoch": 0.04895104895104895, |
| "grad_norm": 3.5177838483972175, |
| "kl": 0.037353515625, |
| "learning_rate": 9.940992436428409e-07, |
| "loss": 0.0015, |
| "reward": 1.6607143878936768, |
| "reward_std": 0.33305349946022034, |
| "rewards/accuracy_reward": 0.535714328289032, |
| "rewards/format_reward": 0.9910714626312256, |
| "step": 14, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 211.55357360839844, |
| "epoch": 0.05244755244755245, |
| "grad_norm": 1.9694919360921588, |
| "kl": 0.034912109375, |
| "learning_rate": 9.932281483316758e-07, |
| "loss": 0.0014, |
| "reward": 1.350000023841858, |
| "reward_std": 0.3625797629356384, |
| "rewards/accuracy_reward": 0.3035714328289032, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 15, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.0, |
| "completion_length": 160.9107208251953, |
| "epoch": 0.055944055944055944, |
| "grad_norm": 8.09731279933887, |
| "kl": 0.032958984375, |
| "learning_rate": 9.922975401133292e-07, |
| "loss": 0.0013, |
| "reward": 1.7142858505249023, |
| "reward_std": 0.534223735332489, |
| "rewards/accuracy_reward": 0.5714285969734192, |
| "rewards/format_reward": 1.0, |
| "step": 16, |
| "temporal_rewards": 0.9285714030265808 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.5714285714285714, |
| "completion_length": 157.75, |
| "epoch": 0.05944055944055944, |
| "grad_norm": 2.860877842453824, |
| "kl": 0.0262451171875, |
| "learning_rate": 9.913075312749865e-07, |
| "loss": 0.001, |
| "reward": 1.1875, |
| "reward_std": 0.24128873646259308, |
| "rewards/accuracy_reward": 0.1607142984867096, |
| "rewards/format_reward": 1.0, |
| "step": 17, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 329.9285888671875, |
| "epoch": 0.06293706293706294, |
| "grad_norm": 1.8234744714725273, |
| "kl": 0.020751953125, |
| "learning_rate": 9.902582412711118e-07, |
| "loss": 0.0008, |
| "reward": 1.321428656578064, |
| "reward_std": 0.36161118745803833, |
| "rewards/accuracy_reward": 0.3214285969734192, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 18, |
| "temporal_rewards": 0.4285714328289032 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 236.12501525878906, |
| "epoch": 0.06643356643356643, |
| "grad_norm": 4.288921643679896, |
| "kl": 0.0203857421875, |
| "learning_rate": 9.891497967090343e-07, |
| "loss": 0.0008, |
| "reward": 1.383928656578064, |
| "reward_std": 0.4247966706752777, |
| "rewards/accuracy_reward": 0.3571428656578064, |
| "rewards/format_reward": 0.9196429252624512, |
| "step": 19, |
| "temporal_rewards": 0.8571428656578064 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.5714285714285714, |
| "completion_length": 294.375, |
| "epoch": 0.06993006993006994, |
| "grad_norm": 4.6505006273887135, |
| "kl": 0.0223388671875, |
| "learning_rate": 9.879823313336722e-07, |
| "loss": 0.0009, |
| "reward": 1.196428656578064, |
| "reward_std": 0.39018020033836365, |
| "rewards/accuracy_reward": 0.267857164144516, |
| "rewards/format_reward": 0.848214328289032, |
| "step": 20, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 167.2857208251953, |
| "epoch": 0.07342657342657342, |
| "grad_norm": 4.184221344881508, |
| "kl": 0.03173828125, |
| "learning_rate": 9.86755986011395e-07, |
| "loss": 0.0013, |
| "reward": 1.5053571462631226, |
| "reward_std": 0.3639127314090729, |
| "rewards/accuracy_reward": 0.392857164144516, |
| "rewards/format_reward": 1.0, |
| "step": 21, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.0, |
| "completion_length": 201.19644165039062, |
| "epoch": 0.07692307692307693, |
| "grad_norm": 2.690611018179203, |
| "kl": 0.0279541015625, |
| "learning_rate": 9.85470908713026e-07, |
| "loss": 0.0011, |
| "reward": 1.7660715579986572, |
| "reward_std": 0.40759509801864624, |
| "rewards/accuracy_reward": 0.6428571939468384, |
| "rewards/format_reward": 1.0, |
| "step": 22, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 335.1964416503906, |
| "epoch": 0.08041958041958042, |
| "grad_norm": 2.7267639716406697, |
| "kl": 0.01708984375, |
| "learning_rate": 9.84127254495989e-07, |
| "loss": 0.0007, |
| "reward": 1.1553571224212646, |
| "reward_std": 0.48766034841537476, |
| "rewards/accuracy_reward": 0.196428582072258, |
| "rewards/format_reward": 0.9285714626312256, |
| "step": 23, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 213.2857208251953, |
| "epoch": 0.08391608391608392, |
| "grad_norm": 4.208996844683101, |
| "kl": 0.0230712890625, |
| "learning_rate": 9.82725185485599e-07, |
| "loss": 0.0009, |
| "reward": 1.5232144594192505, |
| "reward_std": 0.3175846040248871, |
| "rewards/accuracy_reward": 0.4285714626312256, |
| "rewards/format_reward": 0.9910714626312256, |
| "step": 24, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 213.85714721679688, |
| "epoch": 0.08741258741258741, |
| "grad_norm": 3.3203724401356522, |
| "kl": 0.026611328125, |
| "learning_rate": 9.81264870855499e-07, |
| "loss": 0.0011, |
| "reward": 1.5660713911056519, |
| "reward_std": 0.3996223509311676, |
| "rewards/accuracy_reward": 0.4642857313156128, |
| "rewards/format_reward": 1.0, |
| "step": 25, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 143.83929443359375, |
| "epoch": 0.09090909090909091, |
| "grad_norm": 5.297436012938182, |
| "kl": 0.035888671875, |
| "learning_rate": 9.797464868072486e-07, |
| "loss": 0.0014, |
| "reward": 1.5696429014205933, |
| "reward_std": 0.3823689818382263, |
| "rewards/accuracy_reward": 0.4464285969734192, |
| "rewards/format_reward": 1.0, |
| "step": 26, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 169.0357208251953, |
| "epoch": 0.0944055944055944, |
| "grad_norm": 2.83355376716199, |
| "kl": 0.034423828125, |
| "learning_rate": 9.781702165490637e-07, |
| "loss": 0.0014, |
| "reward": 1.783928632736206, |
| "reward_std": 0.2857024669647217, |
| "rewards/accuracy_reward": 0.6071428656578064, |
| "rewards/format_reward": 1.0, |
| "step": 27, |
| "temporal_rewards": 0.8571428656578064 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.5714285714285714, |
| "completion_length": 208.35714721679688, |
| "epoch": 0.0979020979020979, |
| "grad_norm": 1.024100633083295, |
| "kl": 0.0272216796875, |
| "learning_rate": 9.765362502737097e-07, |
| "loss": 0.0011, |
| "reward": 1.399999976158142, |
| "reward_std": 0.16669097542762756, |
| "rewards/accuracy_reward": 0.3214285969734192, |
| "rewards/format_reward": 1.0, |
| "step": 28, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.5714285714285714, |
| "completion_length": 325.6071472167969, |
| "epoch": 0.10139860139860139, |
| "grad_norm": 1.5716694001154234, |
| "kl": 0.016845703125, |
| "learning_rate": 9.748447851355533e-07, |
| "loss": 0.0007, |
| "reward": 1.4339287281036377, |
| "reward_std": 0.26826655864715576, |
| "rewards/accuracy_reward": 0.3750000298023224, |
| "rewards/format_reward": 0.9464285969734192, |
| "step": 29, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.0, |
| "completion_length": 269.5357360839844, |
| "epoch": 0.1048951048951049, |
| "grad_norm": 2.5384743751593186, |
| "kl": 0.02294921875, |
| "learning_rate": 9.730960252267742e-07, |
| "loss": 0.0009, |
| "reward": 1.7035715579986572, |
| "reward_std": 0.45422056317329407, |
| "rewards/accuracy_reward": 0.6071428656578064, |
| "rewards/format_reward": 0.9910714626312256, |
| "step": 30, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 262.2321472167969, |
| "epoch": 0.10839160839160839, |
| "grad_norm": 2.0666478679768003, |
| "kl": 0.02294921875, |
| "learning_rate": 9.712901815527385e-07, |
| "loss": 0.0009, |
| "reward": 1.6035715341567993, |
| "reward_std": 0.2705501914024353, |
| "rewards/accuracy_reward": 0.4821428656578064, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 31, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.0, |
| "completion_length": 134.4107208251953, |
| "epoch": 0.11188811188811189, |
| "grad_norm": 2.5344699018238, |
| "kl": 0.03662109375, |
| "learning_rate": 9.694274720065398e-07, |
| "loss": 0.0015, |
| "reward": 1.892857313156128, |
| "reward_std": 0.3375154435634613, |
| "rewards/accuracy_reward": 0.7321428656578064, |
| "rewards/format_reward": 1.0, |
| "step": 32, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.0, |
| "completion_length": 191.83929443359375, |
| "epoch": 0.11538461538461539, |
| "grad_norm": 4.114325907280972, |
| "kl": 0.0245361328125, |
| "learning_rate": 9.675081213427074e-07, |
| "loss": 0.001, |
| "reward": 1.8464285135269165, |
| "reward_std": 0.3899296522140503, |
| "rewards/accuracy_reward": 0.6785714626312256, |
| "rewards/format_reward": 1.0, |
| "step": 33, |
| "temporal_rewards": 0.8571428656578064 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 283.9285888671875, |
| "epoch": 0.11888111888111888, |
| "grad_norm": 2.057519800361947, |
| "kl": 0.0166015625, |
| "learning_rate": 9.655323611500873e-07, |
| "loss": 0.0007, |
| "reward": 1.2660715579986572, |
| "reward_std": 0.2641655504703522, |
| "rewards/accuracy_reward": 0.2321428656578064, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 34, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 263.08929443359375, |
| "epoch": 0.12237762237762238, |
| "grad_norm": 1.6784752125291083, |
| "kl": 0.01470947265625, |
| "learning_rate": 9.635004298239002e-07, |
| "loss": 0.0006, |
| "reward": 1.3589285612106323, |
| "reward_std": 0.31579551100730896, |
| "rewards/accuracy_reward": 0.3035714328289032, |
| "rewards/format_reward": 1.0, |
| "step": 35, |
| "temporal_rewards": 0.4285714328289032 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 209.21429443359375, |
| "epoch": 0.1258741258741259, |
| "grad_norm": 3.2262252650312333, |
| "kl": 0.0191650390625, |
| "learning_rate": 9.614125725369745e-07, |
| "loss": 0.0008, |
| "reward": 1.4517858028411865, |
| "reward_std": 0.3842121958732605, |
| "rewards/accuracy_reward": 0.392857164144516, |
| "rewards/format_reward": 0.9910714626312256, |
| "step": 36, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 254.5357208251953, |
| "epoch": 0.12937062937062938, |
| "grad_norm": 2.461213562184743, |
| "kl": 0.0172119140625, |
| "learning_rate": 9.592690412101657e-07, |
| "loss": 0.0007, |
| "reward": 1.7017858028411865, |
| "reward_std": 0.44310158491134644, |
| "rewards/accuracy_reward": 0.5892857313156128, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 37, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 226.35714721679688, |
| "epoch": 0.13286713286713286, |
| "grad_norm": 3.674434851466659, |
| "kl": 0.0172119140625, |
| "learning_rate": 9.570700944819582e-07, |
| "loss": 0.0007, |
| "reward": 1.3678572177886963, |
| "reward_std": 0.14704714715480804, |
| "rewards/accuracy_reward": 0.3571428656578064, |
| "rewards/format_reward": 1.0, |
| "step": 38, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.5714285714285714, |
| "completion_length": 317.2321472167969, |
| "epoch": 0.13636363636363635, |
| "grad_norm": 1.9427116571962744, |
| "kl": 0.01251220703125, |
| "learning_rate": 9.548159976772592e-07, |
| "loss": 0.0005, |
| "reward": 1.2660715579986572, |
| "reward_std": 0.3043068051338196, |
| "rewards/accuracy_reward": 0.267857164144516, |
| "rewards/format_reward": 0.9285714626312256, |
| "step": 39, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.5714285714285714, |
| "all_wrong": 0.0, |
| "completion_length": 178.9107208251953, |
| "epoch": 0.13986013986013987, |
| "grad_norm": 1.4904776222689382, |
| "kl": 0.01708984375, |
| "learning_rate": 9.525070227753833e-07, |
| "loss": 0.0007, |
| "reward": 1.9839286804199219, |
| "reward_std": 0.20042385160923004, |
| "rewards/accuracy_reward": 0.7678571939468384, |
| "rewards/format_reward": 1.0, |
| "step": 40, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 232.73214721679688, |
| "epoch": 0.14335664335664336, |
| "grad_norm": 3.261090873434451, |
| "kl": 0.0159912109375, |
| "learning_rate": 9.50143448377237e-07, |
| "loss": 0.0006, |
| "reward": 1.692857265472412, |
| "reward_std": 0.45394080877304077, |
| "rewards/accuracy_reward": 0.5892857313156128, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 41, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 261.64288330078125, |
| "epoch": 0.14685314685314685, |
| "grad_norm": 3.8796318426121963, |
| "kl": 0.0174560546875, |
| "learning_rate": 9.477255596717011e-07, |
| "loss": 0.0007, |
| "reward": 1.3589285612106323, |
| "reward_std": 0.31432804465293884, |
| "rewards/accuracy_reward": 0.3035714328289032, |
| "rewards/format_reward": 0.973214328289032, |
| "step": 42, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 314.01788330078125, |
| "epoch": 0.15034965034965034, |
| "grad_norm": 1.9128498151636428, |
| "kl": 0.0133056640625, |
| "learning_rate": 9.452536484012212e-07, |
| "loss": 0.0005, |
| "reward": 1.4642857313156128, |
| "reward_std": 0.3462127447128296, |
| "rewards/accuracy_reward": 0.4107142984867096, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 43, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 265.0714416503906, |
| "epoch": 0.15384615384615385, |
| "grad_norm": 2.8462472228365914, |
| "kl": 0.0174560546875, |
| "learning_rate": 9.427280128266049e-07, |
| "loss": 0.0007, |
| "reward": 1.485714316368103, |
| "reward_std": 0.41046157479286194, |
| "rewards/accuracy_reward": 0.4642857313156128, |
| "rewards/format_reward": 0.8928571939468384, |
| "step": 44, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 185.5178680419922, |
| "epoch": 0.15734265734265734, |
| "grad_norm": 3.3769919224351135, |
| "kl": 0.0216064453125, |
| "learning_rate": 9.401489576910348e-07, |
| "loss": 0.0009, |
| "reward": 1.255357265472412, |
| "reward_std": 0.3892122805118561, |
| "rewards/accuracy_reward": 0.196428582072258, |
| "rewards/format_reward": 1.0, |
| "step": 45, |
| "temporal_rewards": 0.9285714030265808 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 310.3571472167969, |
| "epoch": 0.16083916083916083, |
| "grad_norm": 3.5940256384708946, |
| "kl": 0.0137939453125, |
| "learning_rate": 9.375167941832973e-07, |
| "loss": 0.0006, |
| "reward": 1.5750000476837158, |
| "reward_std": 0.41830208897590637, |
| "rewards/accuracy_reward": 0.4821428656578064, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 46, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 222.12501525878906, |
| "epoch": 0.16433566433566432, |
| "grad_norm": 3.3517712947005456, |
| "kl": 0.0159912109375, |
| "learning_rate": 9.348318399002345e-07, |
| "loss": 0.0006, |
| "reward": 1.3285715579986572, |
| "reward_std": 0.34704774618148804, |
| "rewards/accuracy_reward": 0.267857164144516, |
| "rewards/format_reward": 1.0, |
| "step": 47, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.0, |
| "completion_length": 239.0357208251953, |
| "epoch": 0.16783216783216784, |
| "grad_norm": 1.5367703370703922, |
| "kl": 0.0230712890625, |
| "learning_rate": 9.320944188084241e-07, |
| "loss": 0.0009, |
| "reward": 1.6750000715255737, |
| "reward_std": 0.33668023347854614, |
| "rewards/accuracy_reward": 0.535714328289032, |
| "rewards/format_reward": 1.0, |
| "step": 48, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 174.7678680419922, |
| "epoch": 0.17132867132867133, |
| "grad_norm": 2.4741938852675536, |
| "kl": 0.020263671875, |
| "learning_rate": 9.293048612050883e-07, |
| "loss": 0.0008, |
| "reward": 1.4267858266830444, |
| "reward_std": 0.3323812484741211, |
| "rewards/accuracy_reward": 0.3571428656578064, |
| "rewards/format_reward": 1.0, |
| "step": 49, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 246.62501525878906, |
| "epoch": 0.17482517482517482, |
| "grad_norm": 1.6031937748680414, |
| "kl": 0.0189208984375, |
| "learning_rate": 9.264635036782405e-07, |
| "loss": 0.0008, |
| "reward": 1.3767857551574707, |
| "reward_std": 0.23027247190475464, |
| "rewards/accuracy_reward": 0.3392857313156128, |
| "rewards/format_reward": 1.0, |
| "step": 50, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.42857142857142855, |
| "all_wrong": 0.0, |
| "completion_length": 172.92857360839844, |
| "epoch": 0.17832167832167833, |
| "grad_norm": 2.1468520628879197, |
| "kl": 0.0201416015625, |
| "learning_rate": 9.235706890660732e-07, |
| "loss": 0.0008, |
| "reward": 2.0625, |
| "reward_std": 0.2902068495750427, |
| "rewards/accuracy_reward": 0.8571429252624512, |
| "rewards/format_reward": 1.0, |
| "step": 51, |
| "temporal_rewards": 0.8571428656578064 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 225.3928680419922, |
| "epoch": 0.18181818181818182, |
| "grad_norm": 2.041359786136277, |
| "kl": 0.0166015625, |
| "learning_rate": 9.206267664155906e-07, |
| "loss": 0.0007, |
| "reward": 1.5571428537368774, |
| "reward_std": 0.16162440180778503, |
| "rewards/accuracy_reward": 0.4464285969734192, |
| "rewards/format_reward": 1.0, |
| "step": 52, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 219.5178680419922, |
| "epoch": 0.1853146853146853, |
| "grad_norm": 2.1860526381838405, |
| "kl": 0.01953125, |
| "learning_rate": 9.176320909404923e-07, |
| "loss": 0.0008, |
| "reward": 1.4892857074737549, |
| "reward_std": 0.34200409054756165, |
| "rewards/accuracy_reward": 0.4285714626312256, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 53, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 146.5, |
| "epoch": 0.1888111888111888, |
| "grad_norm": 7.135493678756918, |
| "kl": 0.0250244140625, |
| "learning_rate": 9.145870239783141e-07, |
| "loss": 0.001, |
| "reward": 1.889285683631897, |
| "reward_std": 0.3193110227584839, |
| "rewards/accuracy_reward": 0.7142857313156128, |
| "rewards/format_reward": 1.0, |
| "step": 54, |
| "temporal_rewards": 0.9285714030265808 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 199.71429443359375, |
| "epoch": 0.19230769230769232, |
| "grad_norm": 1.6651581992503819, |
| "kl": 0.018798828125, |
| "learning_rate": 9.114919329468282e-07, |
| "loss": 0.0008, |
| "reward": 1.8017858266830444, |
| "reward_std": 0.29444825649261475, |
| "rewards/accuracy_reward": 0.6428571939468384, |
| "rewards/format_reward": 1.0, |
| "step": 55, |
| "temporal_rewards": 0.8571428656578064 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 267.8571472167969, |
| "epoch": 0.1958041958041958, |
| "grad_norm": 5.502551920924967, |
| "kl": 0.01953125, |
| "learning_rate": 9.083471912997108e-07, |
| "loss": 0.0008, |
| "reward": 1.1821428537368774, |
| "reward_std": 0.2820361256599426, |
| "rewards/accuracy_reward": 0.1607142984867096, |
| "rewards/format_reward": 1.0, |
| "step": 56, |
| "temporal_rewards": 0.4285714328289032 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 286.6607360839844, |
| "epoch": 0.1993006993006993, |
| "grad_norm": 1.0605637999415247, |
| "kl": 0.0162353515625, |
| "learning_rate": 9.051531784814816e-07, |
| "loss": 0.0007, |
| "reward": 1.662500023841858, |
| "reward_std": 0.24488236010074615, |
| "rewards/accuracy_reward": 0.5714285969734192, |
| "rewards/format_reward": 0.9464285969734192, |
| "step": 57, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 388.0357360839844, |
| "epoch": 0.20279720279720279, |
| "grad_norm": 2.4145653726848453, |
| "kl": 0.01318359375, |
| "learning_rate": 9.019102798817195e-07, |
| "loss": 0.0005, |
| "reward": 1.3767857551574707, |
| "reward_std": 0.5423558950424194, |
| "rewards/accuracy_reward": 0.4285714626312256, |
| "rewards/format_reward": 0.8571429252624512, |
| "step": 58, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 199.3928680419922, |
| "epoch": 0.2062937062937063, |
| "grad_norm": 3.84081401020837, |
| "kl": 0.0224609375, |
| "learning_rate": 8.986188867885616e-07, |
| "loss": 0.0009, |
| "reward": 1.3732143640518188, |
| "reward_std": 0.3065285086631775, |
| "rewards/accuracy_reward": 0.3035714328289032, |
| "rewards/format_reward": 1.0, |
| "step": 59, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.5714285714285714, |
| "all_wrong": 0.0, |
| "completion_length": 220.35714721679688, |
| "epoch": 0.2097902097902098, |
| "grad_norm": 2.5367802513821687, |
| "kl": 0.02001953125, |
| "learning_rate": 8.952793963414906e-07, |
| "loss": 0.0008, |
| "reward": 1.9267857074737549, |
| "reward_std": 0.3000243604183197, |
| "rewards/accuracy_reward": 0.7678571939468384, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 60, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 299.0535888671875, |
| "epoch": 0.21328671328671328, |
| "grad_norm": 5.034973765435732, |
| "kl": 0.016845703125, |
| "learning_rate": 8.918922114834156e-07, |
| "loss": 0.0007, |
| "reward": 1.7000000476837158, |
| "reward_std": 0.38362255692481995, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.9464285969734192, |
| "step": 61, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.5714285714285714, |
| "all_wrong": 0.0, |
| "completion_length": 154.21429443359375, |
| "epoch": 0.21678321678321677, |
| "grad_norm": 2.038565297311163, |
| "kl": 0.0269775390625, |
| "learning_rate": 8.884577409120535e-07, |
| "loss": 0.0011, |
| "reward": 2.1178572177886963, |
| "reward_std": 0.24774520099163055, |
| "rewards/accuracy_reward": 0.8928571939468384, |
| "rewards/format_reward": 1.0, |
| "step": 62, |
| "temporal_rewards": 0.9285714030265808 |
| }, |
| { |
| "all_correct": 0.5714285714285714, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 190.6607208251953, |
| "epoch": 0.2202797202797203, |
| "grad_norm": 1.081625687458147, |
| "kl": 0.02294921875, |
| "learning_rate": 8.849763990306152e-07, |
| "loss": 0.0009, |
| "reward": 1.7767858505249023, |
| "reward_std": 0.16224628686904907, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 63, |
| "temporal_rewards": 0.9285714030265808 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 286.1785888671875, |
| "epoch": 0.22377622377622378, |
| "grad_norm": 9.904092820497874, |
| "kl": 0.0238037109375, |
| "learning_rate": 8.814486058978033e-07, |
| "loss": 0.001, |
| "reward": 1.321428656578064, |
| "reward_std": 0.394111305475235, |
| "rewards/accuracy_reward": 0.3571428656578064, |
| "rewards/format_reward": 0.8571429252624512, |
| "step": 64, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 344.1785888671875, |
| "epoch": 0.22727272727272727, |
| "grad_norm": 1.3046406890165314, |
| "kl": 0.01422119140625, |
| "learning_rate": 8.778747871771291e-07, |
| "loss": 0.0006, |
| "reward": 1.289285659790039, |
| "reward_std": 0.37180283665657043, |
| "rewards/accuracy_reward": 0.2857142984867096, |
| "rewards/format_reward": 0.9464285969734192, |
| "step": 65, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 252.8035888671875, |
| "epoch": 0.23076923076923078, |
| "grad_norm": 2.5493063713880852, |
| "kl": 0.017333984375, |
| "learning_rate": 8.742553740855505e-07, |
| "loss": 0.0007, |
| "reward": 1.1375000476837158, |
| "reward_std": 0.41169679164886475, |
| "rewards/accuracy_reward": 0.196428582072258, |
| "rewards/format_reward": 0.910714328289032, |
| "step": 66, |
| "temporal_rewards": 0.4285714328289032 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 242.57144165039062, |
| "epoch": 0.23426573426573427, |
| "grad_norm": 5.785405122787179, |
| "kl": 0.021484375, |
| "learning_rate": 8.705908033414424e-07, |
| "loss": 0.0009, |
| "reward": 1.6410715579986572, |
| "reward_std": 0.3037279546260834, |
| "rewards/accuracy_reward": 0.5178571939468384, |
| "rewards/format_reward": 1.0, |
| "step": 67, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.5714285714285714, |
| "completion_length": 249.73214721679688, |
| "epoch": 0.23776223776223776, |
| "grad_norm": 2.926217120408116, |
| "kl": 0.0189208984375, |
| "learning_rate": 8.668815171119019e-07, |
| "loss": 0.0008, |
| "reward": 1.1678571701049805, |
| "reward_std": 0.30887019634246826, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.8750000596046448, |
| "step": 68, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 237.94644165039062, |
| "epoch": 0.24125874125874125, |
| "grad_norm": 7.506566104564083, |
| "kl": 0.0230712890625, |
| "learning_rate": 8.631279629593966e-07, |
| "loss": 0.0009, |
| "reward": 1.35535728931427, |
| "reward_std": 0.43541181087493896, |
| "rewards/accuracy_reward": 0.3214285969734192, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 69, |
| "temporal_rewards": 0.4285714328289032 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 245.5357208251953, |
| "epoch": 0.24475524475524477, |
| "grad_norm": 3.175217838092476, |
| "kl": 0.0198974609375, |
| "learning_rate": 8.593305937877613e-07, |
| "loss": 0.0008, |
| "reward": 1.4392857551574707, |
| "reward_std": 0.328709214925766, |
| "rewards/accuracy_reward": 0.3750000298023224, |
| "rewards/format_reward": 1.0, |
| "step": 70, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.7142857142857143, |
| "completion_length": 236.2678680419922, |
| "epoch": 0.24825174825174826, |
| "grad_norm": 1.0790210342784936, |
| "kl": 0.019775390625, |
| "learning_rate": 8.554898677875508e-07, |
| "loss": 0.0008, |
| "reward": 1.3071428537368774, |
| "reward_std": 0.13647663593292236, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 71, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.7142857142857143, |
| "all_wrong": 0.0, |
| "completion_length": 148.4107208251953, |
| "epoch": 0.2517482517482518, |
| "grad_norm": 7.833869612393734, |
| "kl": 0.027587890625, |
| "learning_rate": 8.516062483807554e-07, |
| "loss": 0.0011, |
| "reward": 2.1125001907348633, |
| "reward_std": 0.1543826460838318, |
| "rewards/accuracy_reward": 0.8928571939468384, |
| "rewards/format_reward": 1.0, |
| "step": 72, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 171.07144165039062, |
| "epoch": 0.25524475524475526, |
| "grad_norm": 1.8151636448140855, |
| "kl": 0.0240478515625, |
| "learning_rate": 8.476802041648831e-07, |
| "loss": 0.001, |
| "reward": 1.5642857551574707, |
| "reward_std": 0.1599045991897583, |
| "rewards/accuracy_reward": 0.4464285969734192, |
| "rewards/format_reward": 1.0, |
| "step": 73, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 217.07144165039062, |
| "epoch": 0.25874125874125875, |
| "grad_norm": 4.21791710784553, |
| "kl": 0.01953125, |
| "learning_rate": 8.437122088564197e-07, |
| "loss": 0.0008, |
| "reward": 1.673214316368103, |
| "reward_std": 0.43455594778060913, |
| "rewards/accuracy_reward": 0.535714328289032, |
| "rewards/format_reward": 1.0, |
| "step": 74, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 243.8928680419922, |
| "epoch": 0.26223776223776224, |
| "grad_norm": 2.3789653415719485, |
| "kl": 0.0189208984375, |
| "learning_rate": 8.39702741233669e-07, |
| "loss": 0.0008, |
| "reward": 1.2517858743667603, |
| "reward_std": 0.2406107783317566, |
| "rewards/accuracy_reward": 0.3571428656578064, |
| "rewards/format_reward": 0.8571429252624512, |
| "step": 75, |
| "temporal_rewards": 0.357142835855484 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.0, |
| "completion_length": 255.62501525878906, |
| "epoch": 0.26573426573426573, |
| "grad_norm": 3.628990949164193, |
| "kl": 0.022705078125, |
| "learning_rate": 8.356522850789851e-07, |
| "loss": 0.0009, |
| "reward": 1.4482142925262451, |
| "reward_std": 0.44743552803993225, |
| "rewards/accuracy_reward": 0.4642857313156128, |
| "rewards/format_reward": 0.8571429252624512, |
| "step": 76, |
| "temporal_rewards": 0.8571428656578064 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 224.71429443359375, |
| "epoch": 0.2692307692307692, |
| "grad_norm": 3.98575994223548, |
| "kl": 0.0213623046875, |
| "learning_rate": 8.315613291203976e-07, |
| "loss": 0.0009, |
| "reward": 1.4410713911056519, |
| "reward_std": 0.4627973139286041, |
| "rewards/accuracy_reward": 0.3571428656578064, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 77, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.7142857142857143, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 174.82144165039062, |
| "epoch": 0.2727272727272727, |
| "grad_norm": 1.9033544021109488, |
| "kl": 0.021240234375, |
| "learning_rate": 8.274303669726426e-07, |
| "loss": 0.0008, |
| "reward": 1.9803571701049805, |
| "reward_std": 0.10934228450059891, |
| "rewards/accuracy_reward": 0.8035714626312256, |
| "rewards/format_reward": 1.0, |
| "step": 78, |
| "temporal_rewards": 0.8571428656578064 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.0, |
| "completion_length": 132.1607208251953, |
| "epoch": 0.2762237762237762, |
| "grad_norm": 5.777704604421436, |
| "kl": 0.0255126953125, |
| "learning_rate": 8.232598970776026e-07, |
| "loss": 0.001, |
| "reward": 1.8160715103149414, |
| "reward_std": 0.5395211577415466, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 1.0, |
| "step": 79, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 235.1428680419922, |
| "epoch": 0.27972027972027974, |
| "grad_norm": 1.593925925571494, |
| "kl": 0.0177001953125, |
| "learning_rate": 8.190504226441653e-07, |
| "loss": 0.0007, |
| "reward": 1.4946428537368774, |
| "reward_std": 0.44739413261413574, |
| "rewards/accuracy_reward": 0.4285714626312256, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 80, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 262.3214416503906, |
| "epoch": 0.28321678321678323, |
| "grad_norm": 2.761975886137325, |
| "kl": 0.0169677734375, |
| "learning_rate": 8.148024515875056e-07, |
| "loss": 0.0007, |
| "reward": 1.3357144594192505, |
| "reward_std": 0.3343726098537445, |
| "rewards/accuracy_reward": 0.267857164144516, |
| "rewards/format_reward": 1.0, |
| "step": 81, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 351.4285888671875, |
| "epoch": 0.2867132867132867, |
| "grad_norm": 1.0204064062267335, |
| "kl": 0.01080322265625, |
| "learning_rate": 8.105164964678009e-07, |
| "loss": 0.0004, |
| "reward": 1.4267858266830444, |
| "reward_std": 0.32611167430877686, |
| "rewards/accuracy_reward": 0.392857164144516, |
| "rewards/format_reward": 0.973214328289032, |
| "step": 82, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 215.2857208251953, |
| "epoch": 0.2902097902097902, |
| "grad_norm": 8.030465893244708, |
| "kl": 0.02294921875, |
| "learning_rate": 8.061930744283854e-07, |
| "loss": 0.0009, |
| "reward": 1.4482142925262451, |
| "reward_std": 0.26826655864715576, |
| "rewards/accuracy_reward": 0.3571428656578064, |
| "rewards/format_reward": 1.0, |
| "step": 83, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 251.44644165039062, |
| "epoch": 0.2937062937062937, |
| "grad_norm": 1.8690472432712648, |
| "kl": 0.0181884765625, |
| "learning_rate": 8.01832707133352e-07, |
| "loss": 0.0007, |
| "reward": 1.289285659790039, |
| "reward_std": 0.2658645808696747, |
| "rewards/accuracy_reward": 0.3214285969734192, |
| "rewards/format_reward": 0.9196429252624512, |
| "step": 84, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 262.2321472167969, |
| "epoch": 0.2972027972027972, |
| "grad_norm": 1.818292460461868, |
| "kl": 0.0166015625, |
| "learning_rate": 7.97435920704608e-07, |
| "loss": 0.0007, |
| "reward": 1.446428656578064, |
| "reward_std": 0.4745182394981384, |
| "rewards/accuracy_reward": 0.392857164144516, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 85, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.5714285714285714, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 226.62501525878906, |
| "epoch": 0.3006993006993007, |
| "grad_norm": 2.994767446502154, |
| "kl": 0.022705078125, |
| "learning_rate": 7.930032456583931e-07, |
| "loss": 0.0009, |
| "reward": 1.8410714864730835, |
| "reward_std": 0.1394950896501541, |
| "rewards/accuracy_reward": 0.7142857313156128, |
| "rewards/format_reward": 1.0, |
| "step": 86, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.42857142857142855, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 176.9107208251953, |
| "epoch": 0.3041958041958042, |
| "grad_norm": 1.7391609508300692, |
| "kl": 0.019775390625, |
| "learning_rate": 7.885352168412675e-07, |
| "loss": 0.0008, |
| "reward": 1.6928571462631226, |
| "reward_std": 0.14970263838768005, |
| "rewards/accuracy_reward": 0.5535714626312256, |
| "rewards/format_reward": 1.0, |
| "step": 87, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 292.5714416503906, |
| "epoch": 0.3076923076923077, |
| "grad_norm": 0.8888793089998056, |
| "kl": 0.0208740234375, |
| "learning_rate": 7.840323733655778e-07, |
| "loss": 0.0008, |
| "reward": 1.4910714626312256, |
| "reward_std": 0.33167630434036255, |
| "rewards/accuracy_reward": 0.4642857313156128, |
| "rewards/format_reward": 0.910714328289032, |
| "step": 88, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 262.8214416503906, |
| "epoch": 0.3111888111888112, |
| "grad_norm": 3.3773560408418652, |
| "kl": 0.015625, |
| "learning_rate": 7.794952585444067e-07, |
| "loss": 0.0006, |
| "reward": 1.7625000476837158, |
| "reward_std": 0.39009442925453186, |
| "rewards/accuracy_reward": 0.6071428656578064, |
| "rewards/format_reward": 1.0, |
| "step": 89, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.42857142857142855, |
| "all_wrong": 0.0, |
| "completion_length": 205.73214721679688, |
| "epoch": 0.3146853146853147, |
| "grad_norm": 4.902554291103843, |
| "kl": 0.022216796875, |
| "learning_rate": 7.749244198260174e-07, |
| "loss": 0.0009, |
| "reward": 1.725000023841858, |
| "reward_std": 0.3247165381908417, |
| "rewards/accuracy_reward": 0.6071428656578064, |
| "rewards/format_reward": 1.0, |
| "step": 90, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.0, |
| "completion_length": 194.50001525878906, |
| "epoch": 0.3181818181818182, |
| "grad_norm": 3.125738974344353, |
| "kl": 0.01953125, |
| "learning_rate": 7.703204087277988e-07, |
| "loss": 0.0008, |
| "reward": 1.5461667776107788, |
| "reward_std": 0.5128087401390076, |
| "rewards/accuracy_reward": 0.44795241951942444, |
| "rewards/format_reward": 1.0, |
| "step": 91, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 201.3928680419922, |
| "epoch": 0.32167832167832167, |
| "grad_norm": 2.142383489890374, |
| "kl": 0.02490234375, |
| "learning_rate": 7.656837807697186e-07, |
| "loss": 0.001, |
| "reward": 1.692857265472412, |
| "reward_std": 0.3641102910041809, |
| "rewards/accuracy_reward": 0.5535714626312256, |
| "rewards/format_reward": 0.973214328289032, |
| "step": 92, |
| "temporal_rewards": 0.8571428656578064 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 277.25, |
| "epoch": 0.32517482517482516, |
| "grad_norm": 1.4155785147812223, |
| "kl": 0.017822265625, |
| "learning_rate": 7.610150954072952e-07, |
| "loss": 0.0007, |
| "reward": 1.5071431398391724, |
| "reward_std": 0.3736386001110077, |
| "rewards/accuracy_reward": 0.4464285969734192, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 93, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 301.4285888671875, |
| "epoch": 0.32867132867132864, |
| "grad_norm": 1.1616934363002724, |
| "kl": 0.0162353515625, |
| "learning_rate": 7.563149159640928e-07, |
| "loss": 0.0006, |
| "reward": 1.7053571939468384, |
| "reward_std": 0.33641549944877625, |
| "rewards/accuracy_reward": 0.5892857313156128, |
| "rewards/format_reward": 1.0, |
| "step": 94, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 267.21429443359375, |
| "epoch": 0.3321678321678322, |
| "grad_norm": 3.0109070395231203, |
| "kl": 0.0179443359375, |
| "learning_rate": 7.515838095637518e-07, |
| "loss": 0.0007, |
| "reward": 1.5267857313156128, |
| "reward_std": 0.2861282229423523, |
| "rewards/accuracy_reward": 0.4642857313156128, |
| "rewards/format_reward": 0.9910714626312256, |
| "step": 95, |
| "temporal_rewards": 0.4285714328289032 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 166.55357360839844, |
| "epoch": 0.3356643356643357, |
| "grad_norm": 9.975231124520647, |
| "kl": 0.024658203125, |
| "learning_rate": 7.468223470615592e-07, |
| "loss": 0.001, |
| "reward": 1.5607143640518188, |
| "reward_std": 0.2182290256023407, |
| "rewards/accuracy_reward": 0.4642857313156128, |
| "rewards/format_reward": 1.0, |
| "step": 96, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 212.83929443359375, |
| "epoch": 0.33916083916083917, |
| "grad_norm": 1.8684746698680486, |
| "kl": 0.017578125, |
| "learning_rate": 7.420311029755687e-07, |
| "loss": 0.0007, |
| "reward": 1.283928632736206, |
| "reward_std": 0.3540671467781067, |
| "rewards/accuracy_reward": 0.2321428656578064, |
| "rewards/format_reward": 1.0, |
| "step": 97, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 342.3571472167969, |
| "epoch": 0.34265734265734266, |
| "grad_norm": 2.56805644763744, |
| "kl": 0.0162353515625, |
| "learning_rate": 7.372106554172801e-07, |
| "loss": 0.0007, |
| "reward": 1.3767857551574707, |
| "reward_std": 0.5145957469940186, |
| "rewards/accuracy_reward": 0.3750000298023224, |
| "rewards/format_reward": 0.910714328289032, |
| "step": 98, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 219.7857208251953, |
| "epoch": 0.34615384615384615, |
| "grad_norm": 2.8558528617729753, |
| "kl": 0.019287109375, |
| "learning_rate": 7.323615860218842e-07, |
| "loss": 0.0008, |
| "reward": 1.6142857074737549, |
| "reward_std": 0.32218724489212036, |
| "rewards/accuracy_reward": 0.4821428656578064, |
| "rewards/format_reward": 1.0, |
| "step": 99, |
| "temporal_rewards": 0.8571428656578064 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 349.51788330078125, |
| "epoch": 0.34965034965034963, |
| "grad_norm": 1.2509389603357361, |
| "kl": 0.011474609375, |
| "learning_rate": 7.274844798780825e-07, |
| "loss": 0.0005, |
| "reward": 1.1142858266830444, |
| "reward_std": 0.3019092082977295, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 100, |
| "temporal_rewards": 0.4285714328289032 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 277.0357360839844, |
| "epoch": 0.3531468531468531, |
| "grad_norm": 2.7916868507565837, |
| "kl": 0.0179443359375, |
| "learning_rate": 7.225799254574903e-07, |
| "loss": 0.0007, |
| "reward": 1.5017857551574707, |
| "reward_std": 0.37953513860702515, |
| "rewards/accuracy_reward": 0.4107142984867096, |
| "rewards/format_reward": 1.0, |
| "step": 101, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 168.60714721679688, |
| "epoch": 0.35664335664335667, |
| "grad_norm": 2.2707554849609437, |
| "kl": 0.02392578125, |
| "learning_rate": 7.176485145436324e-07, |
| "loss": 0.001, |
| "reward": 1.662500023841858, |
| "reward_std": 0.23105838894844055, |
| "rewards/accuracy_reward": 0.5178571939468384, |
| "rewards/format_reward": 1.0, |
| "step": 102, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.5714285714285714, |
| "completion_length": 334.26788330078125, |
| "epoch": 0.36013986013986016, |
| "grad_norm": 2.234886017179779, |
| "kl": 0.01904296875, |
| "learning_rate": 7.126908421605374e-07, |
| "loss": 0.0008, |
| "reward": 1.4928572177886963, |
| "reward_std": 0.18182747066020966, |
| "rewards/accuracy_reward": 0.392857164144516, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 103, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 258.6785888671875, |
| "epoch": 0.36363636363636365, |
| "grad_norm": 1.1966647862465727, |
| "kl": 0.022705078125, |
| "learning_rate": 7.077075065009433e-07, |
| "loss": 0.0009, |
| "reward": 1.25, |
| "reward_std": 0.2374918907880783, |
| "rewards/accuracy_reward": 0.2142857313156128, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 104, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 181.96429443359375, |
| "epoch": 0.36713286713286714, |
| "grad_norm": 2.485952277761037, |
| "kl": 0.03173828125, |
| "learning_rate": 7.026991088541183e-07, |
| "loss": 0.0013, |
| "reward": 1.255357265472412, |
| "reward_std": 0.35542815923690796, |
| "rewards/accuracy_reward": 0.2321428656578064, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 105, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 234.71429443359375, |
| "epoch": 0.3706293706293706, |
| "grad_norm": 1.2916342200641437, |
| "kl": 0.0189208984375, |
| "learning_rate": 6.976662535333107e-07, |
| "loss": 0.0008, |
| "reward": 1.446428656578064, |
| "reward_std": 0.2671079635620117, |
| "rewards/accuracy_reward": 0.3750000298023224, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 106, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.5714285714285714, |
| "all_wrong": 0.0, |
| "completion_length": 256.9821472167969, |
| "epoch": 0.3741258741258741, |
| "grad_norm": 1.4490597935648555, |
| "kl": 0.0201416015625, |
| "learning_rate": 6.926095478028311e-07, |
| "loss": 0.0008, |
| "reward": 1.9785715341567993, |
| "reward_std": 0.28172439336776733, |
| "rewards/accuracy_reward": 0.8035714626312256, |
| "rewards/format_reward": 1.0, |
| "step": 107, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 229.75001525878906, |
| "epoch": 0.3776223776223776, |
| "grad_norm": 6.256933238343451, |
| "kl": 0.027587890625, |
| "learning_rate": 6.875296018047809e-07, |
| "loss": 0.0011, |
| "reward": 1.5000001192092896, |
| "reward_std": 0.5110898017883301, |
| "rewards/accuracy_reward": 0.4464285969734192, |
| "rewards/format_reward": 0.9464285969734192, |
| "step": 108, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.42857142857142855, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 210.96429443359375, |
| "epoch": 0.3811188811188811, |
| "grad_norm": 7.77848974566679, |
| "kl": 0.0250244140625, |
| "learning_rate": 6.824270284854317e-07, |
| "loss": 0.001, |
| "reward": 1.7500001192092896, |
| "reward_std": 0.21283701062202454, |
| "rewards/accuracy_reward": 0.6071428656578064, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 109, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 283.1607360839844, |
| "epoch": 0.38461538461538464, |
| "grad_norm": 2.5557165123350445, |
| "kl": 0.01953125, |
| "learning_rate": 6.773024435212677e-07, |
| "loss": 0.0008, |
| "reward": 1.4250000715255737, |
| "reward_std": 0.3038181662559509, |
| "rewards/accuracy_reward": 0.392857164144516, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 110, |
| "temporal_rewards": 0.357142835855484 |
| }, |
| { |
| "all_correct": 0.42857142857142855, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 161.0357208251953, |
| "epoch": 0.3881118881118881, |
| "grad_norm": 2.4569515405580624, |
| "kl": 0.030029296875, |
| "learning_rate": 6.721564652446987e-07, |
| "loss": 0.0012, |
| "reward": 1.8303571939468384, |
| "reward_std": 0.21543601155281067, |
| "rewards/accuracy_reward": 0.6428571939468384, |
| "rewards/format_reward": 1.0, |
| "step": 111, |
| "temporal_rewards": 0.9285714030265808 |
| }, |
| { |
| "all_correct": 0.42857142857142855, |
| "all_wrong": 0.0, |
| "completion_length": 245.69644165039062, |
| "epoch": 0.3916083916083916, |
| "grad_norm": 1.6931573328130867, |
| "kl": 0.025390625, |
| "learning_rate": 6.669897145694506e-07, |
| "loss": 0.001, |
| "reward": 1.798214316368103, |
| "reward_std": 0.2900663912296295, |
| "rewards/accuracy_reward": 0.6428571939468384, |
| "rewards/format_reward": 1.0, |
| "step": 112, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.0, |
| "completion_length": 267.26788330078125, |
| "epoch": 0.3951048951048951, |
| "grad_norm": 3.226106128346649, |
| "kl": 0.0194091796875, |
| "learning_rate": 6.618028149156478e-07, |
| "loss": 0.0008, |
| "reward": 1.7946429252624512, |
| "reward_std": 0.4278576970100403, |
| "rewards/accuracy_reward": 0.6428571939468384, |
| "rewards/format_reward": 1.0, |
| "step": 113, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 214.71429443359375, |
| "epoch": 0.3986013986013986, |
| "grad_norm": 2.474466887500513, |
| "kl": 0.023681640625, |
| "learning_rate": 6.565963921345895e-07, |
| "loss": 0.0009, |
| "reward": 1.3232142925262451, |
| "reward_std": 0.2196773886680603, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 1.0, |
| "step": 114, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 206.46429443359375, |
| "epoch": 0.4020979020979021, |
| "grad_norm": 12.40427149026429, |
| "kl": 0.0235595703125, |
| "learning_rate": 6.51371074433236e-07, |
| "loss": 0.0009, |
| "reward": 1.5821430683135986, |
| "reward_std": 0.3218930661678314, |
| "rewards/accuracy_reward": 0.4821428656578064, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 115, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.42857142857142855, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 275.33929443359375, |
| "epoch": 0.40559440559440557, |
| "grad_norm": 4.132371759712955, |
| "kl": 0.0185546875, |
| "learning_rate": 6.461274922984086e-07, |
| "loss": 0.0007, |
| "reward": 1.9821428060531616, |
| "reward_std": 0.24592465162277222, |
| "rewards/accuracy_reward": 0.785714328289032, |
| "rewards/format_reward": 1.0, |
| "step": 116, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.42857142857142855, |
| "all_wrong": 0.0, |
| "completion_length": 231.98214721679688, |
| "epoch": 0.4090909090909091, |
| "grad_norm": 2.6149401846082427, |
| "kl": 0.020263671875, |
| "learning_rate": 6.408662784207149e-07, |
| "loss": 0.0008, |
| "reward": 1.9267858266830444, |
| "reward_std": 0.36489078402519226, |
| "rewards/accuracy_reward": 0.7321428656578064, |
| "rewards/format_reward": 0.9910714626312256, |
| "step": 117, |
| "temporal_rewards": 0.8571428656578064 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 222.0535888671875, |
| "epoch": 0.4125874125874126, |
| "grad_norm": 2.8131894560697983, |
| "kl": 0.018310546875, |
| "learning_rate": 6.355880676182085e-07, |
| "loss": 0.0007, |
| "reward": 1.48035728931427, |
| "reward_std": 0.3505924344062805, |
| "rewards/accuracy_reward": 0.4107142984867096, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 118, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 333.1071472167969, |
| "epoch": 0.4160839160839161, |
| "grad_norm": 1.8222547759328855, |
| "kl": 0.01556396484375, |
| "learning_rate": 6.302934967597922e-07, |
| "loss": 0.0006, |
| "reward": 1.4839287996292114, |
| "reward_std": 0.27154284715652466, |
| "rewards/accuracy_reward": 0.4107142984867096, |
| "rewards/format_reward": 1.0, |
| "step": 119, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 315.2321472167969, |
| "epoch": 0.4195804195804196, |
| "grad_norm": 24.56012717937078, |
| "kl": 0.0164794921875, |
| "learning_rate": 6.249832046883729e-07, |
| "loss": 0.0007, |
| "reward": 1.3053572177886963, |
| "reward_std": 0.44285857677459717, |
| "rewards/accuracy_reward": 0.267857164144516, |
| "rewards/format_reward": 0.973214328289032, |
| "step": 120, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 246.4285888671875, |
| "epoch": 0.4230769230769231, |
| "grad_norm": 1.9426589659666265, |
| "kl": 0.0203857421875, |
| "learning_rate": 6.196578321437789e-07, |
| "loss": 0.0008, |
| "reward": 1.4303573369979858, |
| "reward_std": 0.2891465723514557, |
| "rewards/accuracy_reward": 0.3392857313156128, |
| "rewards/format_reward": 1.0, |
| "step": 121, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 204.07144165039062, |
| "epoch": 0.42657342657342656, |
| "grad_norm": 3.0118903186149075, |
| "kl": 0.0186767578125, |
| "learning_rate": 6.143180216854486e-07, |
| "loss": 0.0007, |
| "reward": 1.571428656578064, |
| "reward_std": 0.24399259686470032, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 122, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 162.0, |
| "epoch": 0.43006993006993005, |
| "grad_norm": 2.712916811219852, |
| "kl": 0.0213623046875, |
| "learning_rate": 6.089644176148991e-07, |
| "loss": 0.0009, |
| "reward": 1.5285714864730835, |
| "reward_std": 0.30187511444091797, |
| "rewards/accuracy_reward": 0.4285714626312256, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 123, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 456.982177734375, |
| "epoch": 0.43356643356643354, |
| "grad_norm": 0.8620894348694859, |
| "kl": 0.007232666015625, |
| "learning_rate": 6.035976658979846e-07, |
| "loss": 0.0003, |
| "reward": 1.4910714626312256, |
| "reward_std": 0.3568207919597626, |
| "rewards/accuracy_reward": 0.5535714626312256, |
| "rewards/format_reward": 0.8660714626312256, |
| "step": 124, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 269.625, |
| "epoch": 0.4370629370629371, |
| "grad_norm": 3.5704979471789198, |
| "kl": 0.0206298828125, |
| "learning_rate": 5.982184140869538e-07, |
| "loss": 0.0008, |
| "reward": 1.4535715579986572, |
| "reward_std": 0.31924429535865784, |
| "rewards/accuracy_reward": 0.4821428656578064, |
| "rewards/format_reward": 0.8750000596046448, |
| "step": 125, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 301.7857360839844, |
| "epoch": 0.4405594405594406, |
| "grad_norm": 1.6422657775507825, |
| "kl": 0.0174560546875, |
| "learning_rate": 5.928273112423176e-07, |
| "loss": 0.0007, |
| "reward": 1.4392857551574707, |
| "reward_std": 0.3684910237789154, |
| "rewards/accuracy_reward": 0.4107142984867096, |
| "rewards/format_reward": 0.9375000596046448, |
| "step": 126, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.42857142857142855, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 263.4464416503906, |
| "epoch": 0.44405594405594406, |
| "grad_norm": 1.7374569295760933, |
| "kl": 0.022705078125, |
| "learning_rate": 5.874250078545322e-07, |
| "loss": 0.0009, |
| "reward": 1.5571428537368774, |
| "reward_std": 0.2348182499408722, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.9196429252624512, |
| "step": 127, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.0, |
| "completion_length": 267.7321472167969, |
| "epoch": 0.44755244755244755, |
| "grad_norm": 42.33841040347552, |
| "kl": 0.018310546875, |
| "learning_rate": 5.820121557655108e-07, |
| "loss": 0.0007, |
| "reward": 1.6982142925262451, |
| "reward_std": 0.4108830988407135, |
| "rewards/accuracy_reward": 0.5892857313156128, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 128, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.0, |
| "completion_length": 234.57144165039062, |
| "epoch": 0.45104895104895104, |
| "grad_norm": 2.480074327680305, |
| "kl": 0.0255126953125, |
| "learning_rate": 5.765894080899739e-07, |
| "loss": 0.001, |
| "reward": 1.6923317909240723, |
| "reward_std": 0.5358164310455322, |
| "rewards/accuracy_reward": 0.542331874370575, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 129, |
| "temporal_rewards": 0.9285714030265808 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 308.1964416503906, |
| "epoch": 0.45454545454545453, |
| "grad_norm": 1.5015855581589252, |
| "kl": 0.0146484375, |
| "learning_rate": 5.711574191366427e-07, |
| "loss": 0.0006, |
| "reward": 1.7357144355773926, |
| "reward_std": 0.2828700840473175, |
| "rewards/accuracy_reward": 0.6071428656578064, |
| "rewards/format_reward": 0.9910714626312256, |
| "step": 130, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.5714285714285714, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 267.2321472167969, |
| "epoch": 0.458041958041958, |
| "grad_norm": 3.0878458835082476, |
| "kl": 0.0155029296875, |
| "learning_rate": 5.657168443292908e-07, |
| "loss": 0.0006, |
| "reward": 1.9000000953674316, |
| "reward_std": 0.12276037037372589, |
| "rewards/accuracy_reward": 0.7321428656578064, |
| "rewards/format_reward": 1.0, |
| "step": 131, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 167.375, |
| "epoch": 0.46153846153846156, |
| "grad_norm": 1.9269536633290627, |
| "kl": 0.02783203125, |
| "learning_rate": 5.602683401276614e-07, |
| "loss": 0.0011, |
| "reward": 1.6267857551574707, |
| "reward_std": 0.1649283766746521, |
| "rewards/accuracy_reward": 0.4821428656578064, |
| "rewards/format_reward": 1.0, |
| "step": 132, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 235.1785888671875, |
| "epoch": 0.46503496503496505, |
| "grad_norm": 6.598227061709475, |
| "kl": 0.018310546875, |
| "learning_rate": 5.548125639482586e-07, |
| "loss": 0.0007, |
| "reward": 1.5928571224212646, |
| "reward_std": 0.38033661246299744, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.9910714626312256, |
| "step": 133, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 308.39288330078125, |
| "epoch": 0.46853146853146854, |
| "grad_norm": 2.195511154443261, |
| "kl": 0.0145263671875, |
| "learning_rate": 5.493501740850227e-07, |
| "loss": 0.0006, |
| "reward": 1.5334078073501587, |
| "reward_std": 0.3899558186531067, |
| "rewards/accuracy_reward": 0.4476935863494873, |
| "rewards/format_reward": 1.0, |
| "step": 134, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.42857142857142855, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 321.7321472167969, |
| "epoch": 0.47202797202797203, |
| "grad_norm": 0.5731888444733847, |
| "kl": 0.017822265625, |
| "learning_rate": 5.438818296299015e-07, |
| "loss": 0.0007, |
| "reward": 1.782142996788025, |
| "reward_std": 0.2987748980522156, |
| "rewards/accuracy_reward": 0.660714328289032, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 135, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.0, |
| "completion_length": 356.2500305175781, |
| "epoch": 0.4755244755244755, |
| "grad_norm": 2.1615424381726127, |
| "kl": 0.01324462890625, |
| "learning_rate": 5.384081903933234e-07, |
| "loss": 0.0005, |
| "reward": 1.582142949104309, |
| "reward_std": 0.4733910858631134, |
| "rewards/accuracy_reward": 0.535714328289032, |
| "rewards/format_reward": 0.9464285969734192, |
| "step": 136, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 298.8571472167969, |
| "epoch": 0.479020979020979, |
| "grad_norm": 2.9906630275434645, |
| "kl": 0.0159912109375, |
| "learning_rate": 5.329299168245856e-07, |
| "loss": 0.0006, |
| "reward": 1.1678571701049805, |
| "reward_std": 0.38480064272880554, |
| "rewards/accuracy_reward": 0.1785714328289032, |
| "rewards/format_reward": 0.9464285969734192, |
| "step": 137, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.0, |
| "completion_length": 288.1071472167969, |
| "epoch": 0.4825174825174825, |
| "grad_norm": 2.8507410343947908, |
| "kl": 0.0174560546875, |
| "learning_rate": 5.274476699321637e-07, |
| "loss": 0.0007, |
| "reward": 1.9285714626312256, |
| "reward_std": 0.4282780587673187, |
| "rewards/accuracy_reward": 0.785714328289032, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 138, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.42857142857142855, |
| "all_wrong": 0.0, |
| "completion_length": 153.875, |
| "epoch": 0.486013986013986, |
| "grad_norm": 10.325838545352347, |
| "kl": 0.0242919921875, |
| "learning_rate": 5.219621112039543e-07, |
| "loss": 0.001, |
| "reward": 1.6428571939468384, |
| "reward_std": 0.24190634489059448, |
| "rewards/accuracy_reward": 0.5178571939468384, |
| "rewards/format_reward": 0.9910714626312256, |
| "step": 139, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.0, |
| "completion_length": 139.5, |
| "epoch": 0.48951048951048953, |
| "grad_norm": 2.9463926068589656, |
| "kl": 0.02294921875, |
| "learning_rate": 5.164739025274604e-07, |
| "loss": 0.0009, |
| "reward": 1.7178571224212646, |
| "reward_std": 0.3234609365463257, |
| "rewards/accuracy_reward": 0.5892857313156128, |
| "rewards/format_reward": 1.0, |
| "step": 140, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.5714285714285714, |
| "all_wrong": 0.0, |
| "completion_length": 146.4107208251953, |
| "epoch": 0.493006993006993, |
| "grad_norm": 4.320723121584521, |
| "kl": 0.028564453125, |
| "learning_rate": 5.109837061099273e-07, |
| "loss": 0.0011, |
| "reward": 2.0714287757873535, |
| "reward_std": 0.28830307722091675, |
| "rewards/accuracy_reward": 0.8214285969734192, |
| "rewards/format_reward": 1.0, |
| "step": 141, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 342.6607360839844, |
| "epoch": 0.4965034965034965, |
| "grad_norm": 1.4042609008851548, |
| "kl": 0.0169677734375, |
| "learning_rate": 5.054921843984417e-07, |
| "loss": 0.0007, |
| "reward": 1.3928571939468384, |
| "reward_std": 0.36577218770980835, |
| "rewards/accuracy_reward": 0.4107142984867096, |
| "rewards/format_reward": 0.8750000596046448, |
| "step": 142, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 230.83929443359375, |
| "epoch": 0.5, |
| "grad_norm": 2.760203392117073, |
| "kl": 0.018310546875, |
| "learning_rate": 5e-07, |
| "loss": 0.0007, |
| "reward": 1.3535715341567993, |
| "reward_std": 0.39483919739723206, |
| "rewards/accuracy_reward": 0.3035714328289032, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 143, |
| "temporal_rewards": 0.8571428656578064 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 174.50001525878906, |
| "epoch": 0.5034965034965035, |
| "grad_norm": 3.445363780486131, |
| "kl": 0.02294921875, |
| "learning_rate": 4.945078156015581e-07, |
| "loss": 0.0009, |
| "reward": 1.5750000476837158, |
| "reward_std": 0.4295726716518402, |
| "rewards/accuracy_reward": 0.4464285969734192, |
| "rewards/format_reward": 1.0, |
| "step": 144, |
| "temporal_rewards": 0.9285714030265808 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 324.625, |
| "epoch": 0.506993006993007, |
| "grad_norm": 2.070258986628781, |
| "kl": 0.0155029296875, |
| "learning_rate": 4.890162938900726e-07, |
| "loss": 0.0006, |
| "reward": 1.6482144594192505, |
| "reward_std": 0.47856929898262024, |
| "rewards/accuracy_reward": 0.5892857313156128, |
| "rewards/format_reward": 0.9285714626312256, |
| "step": 145, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 326.1071472167969, |
| "epoch": 0.5104895104895105, |
| "grad_norm": 2.8351227359716944, |
| "kl": 0.01318359375, |
| "learning_rate": 4.835260974725397e-07, |
| "loss": 0.0005, |
| "reward": 1.394642949104309, |
| "reward_std": 0.23566605150699615, |
| "rewards/accuracy_reward": 0.4464285969734192, |
| "rewards/format_reward": 0.8660714626312256, |
| "step": 146, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 303.58929443359375, |
| "epoch": 0.513986013986014, |
| "grad_norm": 4.877396417730794, |
| "kl": 0.012939453125, |
| "learning_rate": 4.780378887960458e-07, |
| "loss": 0.0005, |
| "reward": 1.5142858028411865, |
| "reward_std": 0.27789801359176636, |
| "rewards/accuracy_reward": 0.4107142984867096, |
| "rewards/format_reward": 0.9910714626312256, |
| "step": 147, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 304.625, |
| "epoch": 0.5174825174825175, |
| "grad_norm": 1.7202582205652632, |
| "kl": 0.0142822265625, |
| "learning_rate": 4.7255233006783624e-07, |
| "loss": 0.0006, |
| "reward": 1.5250000953674316, |
| "reward_std": 0.3306018114089966, |
| "rewards/accuracy_reward": 0.4285714626312256, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 148, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 360.0000305175781, |
| "epoch": 0.5209790209790209, |
| "grad_norm": 2.0232478358662336, |
| "kl": 0.0118408203125, |
| "learning_rate": 4.6707008317541443e-07, |
| "loss": 0.0005, |
| "reward": 1.2178571224212646, |
| "reward_std": 0.34071677923202515, |
| "rewards/accuracy_reward": 0.3214285969734192, |
| "rewards/format_reward": 0.8125000596046448, |
| "step": 149, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.8571428571428571, |
| "completion_length": 356.6250305175781, |
| "epoch": 0.5244755244755245, |
| "grad_norm": 2.1985564262725608, |
| "kl": 0.01324462890625, |
| "learning_rate": 4.6159180960667654e-07, |
| "loss": 0.0005, |
| "reward": 1.0017857551574707, |
| "reward_std": 0.11616755276918411, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.8392857313156128, |
| "step": 150, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 255.3035888671875, |
| "epoch": 0.527972027972028, |
| "grad_norm": 2.4205533677608644, |
| "kl": 0.017578125, |
| "learning_rate": 4.561181703700985e-07, |
| "loss": 0.0007, |
| "reward": 1.6035714149475098, |
| "reward_std": 0.16177618503570557, |
| "rewards/accuracy_reward": 0.4642857313156128, |
| "rewards/format_reward": 1.0, |
| "step": 151, |
| "temporal_rewards": 0.8571428656578064 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 285.0, |
| "epoch": 0.5314685314685315, |
| "grad_norm": 2.418272976416168, |
| "kl": 0.0120849609375, |
| "learning_rate": 4.506498259149773e-07, |
| "loss": 0.0005, |
| "reward": 1.4500000476837158, |
| "reward_std": 0.3321867883205414, |
| "rewards/accuracy_reward": 0.4285714626312256, |
| "rewards/format_reward": 0.9375000596046448, |
| "step": 152, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 260.5535888671875, |
| "epoch": 0.534965034965035, |
| "grad_norm": 2.658673471345604, |
| "kl": 0.017822265625, |
| "learning_rate": 4.451874360517413e-07, |
| "loss": 0.0007, |
| "reward": 1.5267857313156128, |
| "reward_std": 0.38476046919822693, |
| "rewards/accuracy_reward": 0.5178571939468384, |
| "rewards/format_reward": 0.8750000596046448, |
| "step": 153, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 349.51788330078125, |
| "epoch": 0.5384615384615384, |
| "grad_norm": 0.9636631029145603, |
| "kl": 0.01129150390625, |
| "learning_rate": 4.397316598723385e-07, |
| "loss": 0.0005, |
| "reward": 1.4875000715255737, |
| "reward_std": 0.19214513897895813, |
| "rewards/accuracy_reward": 0.392857164144516, |
| "rewards/format_reward": 1.0, |
| "step": 154, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 331.7857360839844, |
| "epoch": 0.541958041958042, |
| "grad_norm": 1.0144129009185325, |
| "kl": 0.01190185546875, |
| "learning_rate": 4.3428315567070923e-07, |
| "loss": 0.0005, |
| "reward": 1.5767858028411865, |
| "reward_std": 0.20042386651039124, |
| "rewards/accuracy_reward": 0.4642857313156128, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 155, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.0, |
| "completion_length": 151.5178680419922, |
| "epoch": 0.5454545454545454, |
| "grad_norm": 7.232186073668779, |
| "kl": 0.0198974609375, |
| "learning_rate": 4.2884258086335745e-07, |
| "loss": 0.0008, |
| "reward": 1.8571429252624512, |
| "reward_std": 0.4870792329311371, |
| "rewards/accuracy_reward": 0.6964285969734192, |
| "rewards/format_reward": 1.0, |
| "step": 156, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.5714285714285714, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 271.4107360839844, |
| "epoch": 0.548951048951049, |
| "grad_norm": 0.5118344272065345, |
| "kl": 0.016845703125, |
| "learning_rate": 4.234105919100261e-07, |
| "loss": 0.0007, |
| "reward": 1.6857142448425293, |
| "reward_std": 0.14707830548286438, |
| "rewards/accuracy_reward": 0.6071428656578064, |
| "rewards/format_reward": 0.9464285969734192, |
| "step": 157, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 227.48214721679688, |
| "epoch": 0.5524475524475524, |
| "grad_norm": 1.9067107696166985, |
| "kl": 0.016845703125, |
| "learning_rate": 4.179878442344892e-07, |
| "loss": 0.0007, |
| "reward": 1.8517857789993286, |
| "reward_std": 0.2894127070903778, |
| "rewards/accuracy_reward": 0.6785714626312256, |
| "rewards/format_reward": 1.0, |
| "step": 158, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 238.62501525878906, |
| "epoch": 0.5559440559440559, |
| "grad_norm": 2.9066876939888244, |
| "kl": 0.017333984375, |
| "learning_rate": 4.1257499214546785e-07, |
| "loss": 0.0007, |
| "reward": 1.1750000715255737, |
| "reward_std": 0.3523494303226471, |
| "rewards/accuracy_reward": 0.1785714328289032, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 159, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 272.1607360839844, |
| "epoch": 0.5594405594405595, |
| "grad_norm": 1.9955742394796372, |
| "kl": 0.01531982421875, |
| "learning_rate": 4.071726887576822e-07, |
| "loss": 0.0006, |
| "reward": 1.5125000476837158, |
| "reward_std": 0.4271492063999176, |
| "rewards/accuracy_reward": 0.4464285969734192, |
| "rewards/format_reward": 0.9285714626312256, |
| "step": 160, |
| "temporal_rewards": 0.8571428656578064 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 272.08929443359375, |
| "epoch": 0.5629370629370629, |
| "grad_norm": 1.3449257046018408, |
| "kl": 0.015380859375, |
| "learning_rate": 4.017815859130461e-07, |
| "loss": 0.0006, |
| "reward": 1.5839287042617798, |
| "reward_std": 0.223629429936409, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.9910714626312256, |
| "step": 161, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 328.3214416503906, |
| "epoch": 0.5664335664335665, |
| "grad_norm": 2.3894294160789715, |
| "kl": 0.0140380859375, |
| "learning_rate": 3.964023341020155e-07, |
| "loss": 0.0006, |
| "reward": 1.2482143640518188, |
| "reward_std": 0.3738020062446594, |
| "rewards/accuracy_reward": 0.2321428656578064, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 162, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 195.17857360839844, |
| "epoch": 0.5699300699300699, |
| "grad_norm": 2.3389942848580025, |
| "kl": 0.019287109375, |
| "learning_rate": 3.9103558238510083e-07, |
| "loss": 0.0008, |
| "reward": 1.4928572177886963, |
| "reward_std": 0.3350135385990143, |
| "rewards/accuracy_reward": 0.392857164144516, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 163, |
| "temporal_rewards": 0.9285714030265808 |
| }, |
| { |
| "all_correct": 0.5714285714285714, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 288.26788330078125, |
| "epoch": 0.5734265734265734, |
| "grad_norm": 1.4855251481569927, |
| "kl": 0.01519775390625, |
| "learning_rate": 3.856819783145514e-07, |
| "loss": 0.0006, |
| "reward": 1.7053571939468384, |
| "reward_std": 0.15428219735622406, |
| "rewards/accuracy_reward": 0.5892857313156128, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 164, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 309.4821472167969, |
| "epoch": 0.5769230769230769, |
| "grad_norm": 2.75844544825569, |
| "kl": 0.01434326171875, |
| "learning_rate": 3.8034216785622125e-07, |
| "loss": 0.0006, |
| "reward": 1.3563262224197388, |
| "reward_std": 0.26625069975852966, |
| "rewards/accuracy_reward": 0.3259689211845398, |
| "rewards/format_reward": 0.9285714626312256, |
| "step": 165, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 321.625, |
| "epoch": 0.5804195804195804, |
| "grad_norm": 1.3890774295899448, |
| "kl": 0.01361083984375, |
| "learning_rate": 3.750167953116272e-07, |
| "loss": 0.0005, |
| "reward": 1.692857265472412, |
| "reward_std": 0.4047864079475403, |
| "rewards/accuracy_reward": 0.5892857313156128, |
| "rewards/format_reward": 0.9553571939468384, |
| "step": 166, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.42857142857142855, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 238.19644165039062, |
| "epoch": 0.583916083916084, |
| "grad_norm": 1.2961019229402087, |
| "kl": 0.015869140625, |
| "learning_rate": 3.697065032402078e-07, |
| "loss": 0.0006, |
| "reward": 1.7660715579986572, |
| "reward_std": 0.21566565334796906, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 167, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 202.7678680419922, |
| "epoch": 0.5874125874125874, |
| "grad_norm": 4.597615166391679, |
| "kl": 0.0169677734375, |
| "learning_rate": 3.6441193238179146e-07, |
| "loss": 0.0007, |
| "reward": 1.4142857789993286, |
| "reward_std": 0.26091986894607544, |
| "rewards/accuracy_reward": 0.3392857313156128, |
| "rewards/format_reward": 1.0, |
| "step": 168, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.42857142857142855, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 210.48214721679688, |
| "epoch": 0.5909090909090909, |
| "grad_norm": 1.624305794360812, |
| "kl": 0.0164794921875, |
| "learning_rate": 3.591337215792851e-07, |
| "loss": 0.0007, |
| "reward": 1.8267858028411865, |
| "reward_std": 0.1543826460838318, |
| "rewards/accuracy_reward": 0.6428571939468384, |
| "rewards/format_reward": 1.0, |
| "step": 169, |
| "temporal_rewards": 0.8571428656578064 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.5714285714285714, |
| "completion_length": 354.46429443359375, |
| "epoch": 0.5944055944055944, |
| "grad_norm": 1.4548678641257315, |
| "kl": 0.01190185546875, |
| "learning_rate": 3.538725077015915e-07, |
| "loss": 0.0005, |
| "reward": 1.1678571701049805, |
| "reward_std": 0.22238534688949585, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.848214328289032, |
| "step": 170, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 278.8214416503906, |
| "epoch": 0.5979020979020979, |
| "grad_norm": 1.1660346476914807, |
| "kl": 0.01434326171875, |
| "learning_rate": 3.486289255667639e-07, |
| "loss": 0.0006, |
| "reward": 1.6160714626312256, |
| "reward_std": 0.2827339172363281, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.9910714626312256, |
| "step": 171, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 244.5357208251953, |
| "epoch": 0.6013986013986014, |
| "grad_norm": 5.594264279100422, |
| "kl": 0.0164794921875, |
| "learning_rate": 3.434036078654106e-07, |
| "loss": 0.0007, |
| "reward": 1.5910714864730835, |
| "reward_std": 0.23809079825878143, |
| "rewards/accuracy_reward": 0.4821428656578064, |
| "rewards/format_reward": 0.9910714626312256, |
| "step": 172, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.5714285714285714, |
| "completion_length": 256.6071472167969, |
| "epoch": 0.6048951048951049, |
| "grad_norm": 1.3773808667435987, |
| "kl": 0.012451171875, |
| "learning_rate": 3.3819718508435226e-07, |
| "loss": 0.0005, |
| "reward": 1.355357050895691, |
| "reward_std": 0.2153625637292862, |
| "rewards/accuracy_reward": 0.3035714328289032, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 173, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 281.6607360839844, |
| "epoch": 0.6083916083916084, |
| "grad_norm": 1.8959658874179899, |
| "kl": 0.014404296875, |
| "learning_rate": 3.330102854305493e-07, |
| "loss": 0.0006, |
| "reward": 1.4553571939468384, |
| "reward_std": 0.22538065910339355, |
| "rewards/accuracy_reward": 0.392857164144516, |
| "rewards/format_reward": 1.0, |
| "step": 174, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 269.875, |
| "epoch": 0.6118881118881119, |
| "grad_norm": 2.2981475604340047, |
| "kl": 0.0166015625, |
| "learning_rate": 3.2784353475530135e-07, |
| "loss": 0.0007, |
| "reward": 1.469642996788025, |
| "reward_std": 0.4389226734638214, |
| "rewards/accuracy_reward": 0.392857164144516, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 175, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 214.7678680419922, |
| "epoch": 0.6153846153846154, |
| "grad_norm": 2.5195555976201436, |
| "kl": 0.02001953125, |
| "learning_rate": 3.2269755647873214e-07, |
| "loss": 0.0008, |
| "reward": 1.3071428537368774, |
| "reward_std": 0.42797765135765076, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 176, |
| "temporal_rewards": 0.8571428656578064 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 233.33929443359375, |
| "epoch": 0.6188811188811189, |
| "grad_norm": 2.055673511375122, |
| "kl": 0.01434326171875, |
| "learning_rate": 3.175729715145684e-07, |
| "loss": 0.0006, |
| "reward": 1.7625000476837158, |
| "reward_std": 0.3995477557182312, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 177, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 280.89288330078125, |
| "epoch": 0.6223776223776224, |
| "grad_norm": 1.6236247273765754, |
| "kl": 0.0162353515625, |
| "learning_rate": 3.12470398195219e-07, |
| "loss": 0.0006, |
| "reward": 1.4589285850524902, |
| "reward_std": 0.32778358459472656, |
| "rewards/accuracy_reward": 0.392857164144516, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 178, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 304.58929443359375, |
| "epoch": 0.6258741258741258, |
| "grad_norm": 1.7193136176143193, |
| "kl": 0.01519775390625, |
| "learning_rate": 3.0739045219716884e-07, |
| "loss": 0.0006, |
| "reward": 1.4285714626312256, |
| "reward_std": 0.2324400097131729, |
| "rewards/accuracy_reward": 0.4107142984867096, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 179, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 300.8214416503906, |
| "epoch": 0.6293706293706294, |
| "grad_norm": 4.499489721815655, |
| "kl": 0.01348876953125, |
| "learning_rate": 3.023337464666893e-07, |
| "loss": 0.0005, |
| "reward": 1.4296077489852905, |
| "reward_std": 0.2895960509777069, |
| "rewards/accuracy_reward": 0.3421076536178589, |
| "rewards/format_reward": 1.0, |
| "step": 180, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 259.3571472167969, |
| "epoch": 0.6328671328671329, |
| "grad_norm": 2.1837077874780966, |
| "kl": 0.0157470703125, |
| "learning_rate": 2.9730089114588157e-07, |
| "loss": 0.0006, |
| "reward": 1.3535715341567993, |
| "reward_std": 0.37215283513069153, |
| "rewards/accuracy_reward": 0.3214285969734192, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 181, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 199.46429443359375, |
| "epoch": 0.6363636363636364, |
| "grad_norm": 1.3553227080052574, |
| "kl": 0.0220947265625, |
| "learning_rate": 2.922924934990568e-07, |
| "loss": 0.0009, |
| "reward": 1.4964287281036377, |
| "reward_std": 0.11663764715194702, |
| "rewards/accuracy_reward": 0.4107142984867096, |
| "rewards/format_reward": 1.0, |
| "step": 182, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.42857142857142855, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 174.82144165039062, |
| "epoch": 0.6398601398601399, |
| "grad_norm": 1.904515168520845, |
| "kl": 0.020751953125, |
| "learning_rate": 2.873091578394626e-07, |
| "loss": 0.0008, |
| "reward": 1.7767858505249023, |
| "reward_std": 0.21223808825016022, |
| "rewards/accuracy_reward": 0.6071428656578064, |
| "rewards/format_reward": 1.0, |
| "step": 183, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.0, |
| "completion_length": 209.21429443359375, |
| "epoch": 0.6433566433566433, |
| "grad_norm": 5.4284279958225605, |
| "kl": 0.0191650390625, |
| "learning_rate": 2.823514854563677e-07, |
| "loss": 0.0008, |
| "reward": 1.8017858266830444, |
| "reward_std": 0.5190714597702026, |
| "rewards/accuracy_reward": 0.6428571939468384, |
| "rewards/format_reward": 1.0, |
| "step": 184, |
| "temporal_rewards": 0.8571428656578064 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 232.5357208251953, |
| "epoch": 0.6468531468531469, |
| "grad_norm": 2.3909656732286817, |
| "kl": 0.015625, |
| "learning_rate": 2.774200745425096e-07, |
| "loss": 0.0006, |
| "reward": 1.3982144594192505, |
| "reward_std": 0.2684382200241089, |
| "rewards/accuracy_reward": 0.3035714328289032, |
| "rewards/format_reward": 1.0, |
| "step": 185, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.8571428571428571, |
| "completion_length": 324.26788330078125, |
| "epoch": 0.6503496503496503, |
| "grad_norm": 1.1467278777382235, |
| "kl": 0.0130615234375, |
| "learning_rate": 2.725155201219176e-07, |
| "loss": 0.0005, |
| "reward": 1.1267857551574707, |
| "reward_std": 0.1317899227142334, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 186, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 362.7500305175781, |
| "epoch": 0.6538461538461539, |
| "grad_norm": 1.467850477853026, |
| "kl": 0.01495361328125, |
| "learning_rate": 2.676384139781157e-07, |
| "loss": 0.0006, |
| "reward": 1.289285659790039, |
| "reward_std": 0.2565459609031677, |
| "rewards/accuracy_reward": 0.267857164144516, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 187, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.42857142857142855, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 221.21429443359375, |
| "epoch": 0.6573426573426573, |
| "grad_norm": 1.8678333900967852, |
| "kl": 0.0218505859375, |
| "learning_rate": 2.6278934458271996e-07, |
| "loss": 0.0009, |
| "reward": 1.6946427822113037, |
| "reward_std": 0.2205595076084137, |
| "rewards/accuracy_reward": 0.5714285969734192, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 188, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 252.19644165039062, |
| "epoch": 0.6608391608391608, |
| "grad_norm": 3.030907404699144, |
| "kl": 0.0157470703125, |
| "learning_rate": 2.5796889702443123e-07, |
| "loss": 0.0006, |
| "reward": 1.7250001430511475, |
| "reward_std": 0.3633832633495331, |
| "rewards/accuracy_reward": 0.6071428656578064, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 189, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 197.0357208251953, |
| "epoch": 0.6643356643356644, |
| "grad_norm": 2.934544973257425, |
| "kl": 0.019775390625, |
| "learning_rate": 2.5317765293844067e-07, |
| "loss": 0.0008, |
| "reward": 1.787500023841858, |
| "reward_std": 0.276123046875, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 1.0, |
| "step": 190, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 308.96429443359375, |
| "epoch": 0.6678321678321678, |
| "grad_norm": 2.9496641260720375, |
| "kl": 0.01336669921875, |
| "learning_rate": 2.4841619043624806e-07, |
| "loss": 0.0005, |
| "reward": 1.412500023841858, |
| "reward_std": 0.34019771218299866, |
| "rewards/accuracy_reward": 0.3750000298023224, |
| "rewards/format_reward": 0.9464285969734192, |
| "step": 191, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 354.2321472167969, |
| "epoch": 0.6713286713286714, |
| "grad_norm": 0.8446216299416934, |
| "kl": 0.01226806640625, |
| "learning_rate": 2.4368508403590725e-07, |
| "loss": 0.0005, |
| "reward": 1.4000000953674316, |
| "reward_std": 0.24233335256576538, |
| "rewards/accuracy_reward": 0.3571428656578064, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 192, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 205.23214721679688, |
| "epoch": 0.6748251748251748, |
| "grad_norm": 3.9294557546039433, |
| "kl": 0.0166015625, |
| "learning_rate": 2.389849045927049e-07, |
| "loss": 0.0007, |
| "reward": 1.7482143640518188, |
| "reward_std": 0.2757527232170105, |
| "rewards/accuracy_reward": 0.5892857313156128, |
| "rewards/format_reward": 0.9910714626312256, |
| "step": 193, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 243.12501525878906, |
| "epoch": 0.6783216783216783, |
| "grad_norm": 2.2731968890998067, |
| "kl": 0.021240234375, |
| "learning_rate": 2.3431621923028144e-07, |
| "loss": 0.0008, |
| "reward": 1.7178571224212646, |
| "reward_std": 0.34315115213394165, |
| "rewards/accuracy_reward": 0.5714285969734192, |
| "rewards/format_reward": 0.9910714626312256, |
| "step": 194, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 207.9107208251953, |
| "epoch": 0.6818181818181818, |
| "grad_norm": 2.027560604023802, |
| "kl": 0.0181884765625, |
| "learning_rate": 2.2967959127220137e-07, |
| "loss": 0.0007, |
| "reward": 1.4964287281036377, |
| "reward_std": 0.24646392464637756, |
| "rewards/accuracy_reward": 0.392857164144516, |
| "rewards/format_reward": 1.0, |
| "step": 195, |
| "temporal_rewards": 0.8571428656578064 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.0, |
| "completion_length": 187.82144165039062, |
| "epoch": 0.6853146853146853, |
| "grad_norm": 3.738312976747544, |
| "kl": 0.020263671875, |
| "learning_rate": 2.250755801739826e-07, |
| "loss": 0.0008, |
| "reward": 1.826785683631897, |
| "reward_std": 0.49625372886657715, |
| "rewards/accuracy_reward": 0.660714328289032, |
| "rewards/format_reward": 0.973214328289032, |
| "step": 196, |
| "temporal_rewards": 0.9285714030265808 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 218.05357360839844, |
| "epoch": 0.6888111888111889, |
| "grad_norm": 6.936122875948517, |
| "kl": 0.021240234375, |
| "learning_rate": 2.2050474145559323e-07, |
| "loss": 0.0008, |
| "reward": 1.3642857074737549, |
| "reward_std": 0.46779900789260864, |
| "rewards/accuracy_reward": 0.3214285969734192, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 197, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.7142857142857143, |
| "completion_length": 246.4285888671875, |
| "epoch": 0.6923076923076923, |
| "grad_norm": 5.92965203555489, |
| "kl": 0.027587890625, |
| "learning_rate": 2.1596762663442213e-07, |
| "loss": 0.0011, |
| "reward": 1.2428572177886963, |
| "reward_std": 0.23482269048690796, |
| "rewards/accuracy_reward": 0.2321428656578064, |
| "rewards/format_reward": 0.9553571939468384, |
| "step": 198, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 167.30357360839844, |
| "epoch": 0.6958041958041958, |
| "grad_norm": 9.899413362253103, |
| "kl": 0.0206298828125, |
| "learning_rate": 2.1146478315873233e-07, |
| "loss": 0.0008, |
| "reward": 1.4642857313156128, |
| "reward_std": 0.4399777352809906, |
| "rewards/accuracy_reward": 0.3571428656578064, |
| "rewards/format_reward": 1.0, |
| "step": 199, |
| "temporal_rewards": 0.9285714030265808 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 179.67857360839844, |
| "epoch": 0.6993006993006993, |
| "grad_norm": 2.965286701962895, |
| "kl": 0.0235595703125, |
| "learning_rate": 2.0699675434160695e-07, |
| "loss": 0.0009, |
| "reward": 1.7607142925262451, |
| "reward_std": 0.328709214925766, |
| "rewards/accuracy_reward": 0.5892857313156128, |
| "rewards/format_reward": 1.0, |
| "step": 200, |
| "temporal_rewards": 0.7857142686843872 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 286, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|