{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9966996699669967, "eval_steps": 10, "global_step": 151, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 761.6875305175781, "epoch": 0.006600660066006601, "grad_norm": 0.09082216769456863, "kl": 0.0, "learning_rate": 1.875e-07, "loss": -0.0159, "reward": 0.2291666679084301, "reward_std": 0.1705273911356926, "rewards/accuracy_reward": 0.2291666679084301, "rewards/format_reward": 0.0, "step": 1 }, { "completion_length": 897.8541717529297, "epoch": 0.013201320132013201, "grad_norm": 0.12771357595920563, "kl": 0.0, "learning_rate": 3.75e-07, "loss": 0.0257, "reward": 0.3750000111758709, "reward_std": 0.2861081659793854, "rewards/accuracy_reward": 0.3750000111758709, "rewards/format_reward": 0.0, "step": 2 }, { "completion_length": 946.2291717529297, "epoch": 0.019801980198019802, "grad_norm": 0.202493816614151, "kl": 0.0002696514129638672, "learning_rate": 5.625e-07, "loss": 0.0767, "reward": 0.43750002048909664, "reward_std": 0.33713919669389725, "rewards/accuracy_reward": 0.43750002048909664, "rewards/format_reward": 0.0, "step": 3 }, { "completion_length": 947.2708740234375, "epoch": 0.026402640264026403, "grad_norm": 0.13421419262886047, "kl": 0.00023603439331054688, "learning_rate": 7.5e-07, "loss": 0.0437, "reward": 0.3541666716337204, "reward_std": 0.4932760149240494, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "step": 4 }, { "completion_length": 949.7708587646484, "epoch": 0.033003300330033, "grad_norm": 0.15579567849636078, "kl": 0.0001881122589111328, "learning_rate": 9.375e-07, "loss": 0.082, "reward": 0.583333358168602, "reward_std": 0.4701542407274246, "rewards/accuracy_reward": 0.583333358168602, "rewards/format_reward": 0.0, "step": 5 }, { "completion_length": 679.7500305175781, "epoch": 0.039603960396039604, "grad_norm": 0.19298173487186432, "kl": 0.0002353191375732422, "learning_rate": 1.125e-06, "loss": -0.055, "reward": 0.7500000149011612, "reward_std": 0.3332235999405384, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 6 }, { "completion_length": 713.4375305175781, "epoch": 0.0462046204620462, "grad_norm": 0.151686891913414, "kl": 0.00026154518127441406, "learning_rate": 1.3125000000000001e-06, "loss": 0.0054, "reward": 0.416666679084301, "reward_std": 0.377695269882679, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.0, "step": 7 }, { "completion_length": 721.6250076293945, "epoch": 0.052805280528052806, "grad_norm": 0.0006056017591618001, "kl": 0.00021958351135253906, "learning_rate": 1.5e-06, "loss": 0.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 8 }, { "completion_length": 756.0000228881836, "epoch": 0.0594059405940594, "grad_norm": 0.14626246690750122, "kl": 0.0002689361572265625, "learning_rate": 1.6875e-06, "loss": -0.0, "reward": 0.33333334140479565, "reward_std": 0.23116152361035347, "rewards/accuracy_reward": 0.33333334140479565, "rewards/format_reward": 0.0, "step": 9 }, { "completion_length": 755.75, "epoch": 0.066006600660066, "grad_norm": 0.14428134262561798, "kl": 0.00023508071899414062, "learning_rate": 1.875e-06, "loss": -0.0386, "reward": 0.7291666865348816, "reward_std": 0.21764282882213593, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 10 }, { "completion_length": 993.1042022705078, "epoch": 0.07260726072607261, "grad_norm": 0.1510692834854126, "kl": 0.0003027915954589844, "learning_rate": 2.0625e-06, "loss": 0.118, "reward": 0.5208333432674408, "reward_std": 0.4932760149240494, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.0, "step": 11 }, { "completion_length": 831.3125152587891, "epoch": 0.07920792079207921, "grad_norm": 0.08587031811475754, "kl": 0.00022935867309570312, "learning_rate": 2.25e-06, "loss": 0.0158, "reward": 0.708333358168602, "reward_std": 0.24859580025076866, "rewards/accuracy_reward": 0.708333358168602, "rewards/format_reward": 0.0, "step": 12 }, { "completion_length": 786.7708435058594, "epoch": 0.0858085808580858, "grad_norm": 0.15709738433361053, "kl": 0.0002751350402832031, "learning_rate": 2.4375e-06, "loss": 0.0416, "reward": 0.4375000149011612, "reward_std": 0.28219255432486534, "rewards/accuracy_reward": 0.4375000149011612, "rewards/format_reward": 0.0, "step": 13 }, { "completion_length": 896.3958587646484, "epoch": 0.0924092409240924, "grad_norm": 0.1950385421514511, "kl": 0.0004210472106933594, "learning_rate": 2.6250000000000003e-06, "loss": -0.0025, "reward": 0.416666679084301, "reward_std": 0.30354245752096176, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.0, "step": 14 }, { "completion_length": 915.1458587646484, "epoch": 0.09900990099009901, "grad_norm": 0.1464419662952423, "kl": 0.00046825408935546875, "learning_rate": 2.8125e-06, "loss": 0.0378, "reward": 0.5208333358168602, "reward_std": 0.3720077723264694, "rewards/accuracy_reward": 0.5208333358168602, "rewards/format_reward": 0.0, "step": 15 }, { "completion_length": 733.4792022705078, "epoch": 0.10561056105610561, "grad_norm": 0.195227712392807, "kl": 0.0011754035949707031, "learning_rate": 3e-06, "loss": -0.026, "reward": 0.645833358168602, "reward_std": 0.41912318766117096, "rewards/accuracy_reward": 0.645833358168602, "rewards/format_reward": 0.0, "step": 16 }, { "completion_length": 864.2083740234375, "epoch": 0.11221122112211221, "grad_norm": 0.29113319516181946, "kl": 0.0006542205810546875, "learning_rate": 2.9995938617691924e-06, "loss": 0.0484, "reward": 0.5000000186264515, "reward_std": 0.4152075983583927, "rewards/accuracy_reward": 0.5000000186264515, "rewards/format_reward": 0.0, "step": 17 }, { "completion_length": 795.2291870117188, "epoch": 0.1188118811881188, "grad_norm": 0.13108719885349274, "kl": 0.0011267662048339844, "learning_rate": 2.998375667007787e-06, "loss": 0.0592, "reward": 0.6666666865348816, "reward_std": 0.3035424277186394, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 18 }, { "completion_length": 722.3542022705078, "epoch": 0.1254125412541254, "grad_norm": 0.17488506436347961, "kl": 0.0010900497436523438, "learning_rate": 2.9963460753897363e-06, "loss": 0.0007, "reward": 0.6458333432674408, "reward_std": 0.29962683096528053, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "step": 19 }, { "completion_length": 776.3750152587891, "epoch": 0.132013201320132, "grad_norm": 0.11859464645385742, "kl": 0.00139617919921875, "learning_rate": 2.9935061859747068e-06, "loss": 0.0307, "reward": 0.3333333432674408, "reward_std": 0.26603007316589355, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 20 }, { "completion_length": 661.0625152587891, "epoch": 0.13861386138613863, "grad_norm": 0.15094302594661713, "kl": 0.0016927719116210938, "learning_rate": 2.989857536612915e-06, "loss": -0.0411, "reward": 0.770833358168602, "reward_std": 0.33713918551802635, "rewards/accuracy_reward": 0.770833358168602, "rewards/format_reward": 0.0, "step": 21 }, { "completion_length": 848.6666870117188, "epoch": 0.14521452145214522, "grad_norm": 0.2171953022480011, "kl": 0.00218963623046875, "learning_rate": 2.9854021031123555e-06, "loss": 0.074, "reward": 0.6041666716337204, "reward_std": 0.18796169012784958, "rewards/accuracy_reward": 0.6041666716337204, "rewards/format_reward": 0.0, "step": 22 }, { "completion_length": 764.2083587646484, "epoch": 0.15181518151815182, "grad_norm": 0.19823089241981506, "kl": 0.004238128662109375, "learning_rate": 2.980142298168869e-06, "loss": 0.0469, "reward": 0.35416667722165585, "reward_std": 0.36417657136917114, "rewards/accuracy_reward": 0.35416667722165585, "rewards/format_reward": 0.0, "step": 23 }, { "completion_length": 1017.3333435058594, "epoch": 0.15841584158415842, "grad_norm": 0.1434755176305771, "kl": 0.006870269775390625, "learning_rate": 2.97408097005962e-06, "loss": 0.0274, "reward": 0.2500000074505806, "reward_std": 0.3506578952074051, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 24 }, { "completion_length": 750.2917022705078, "epoch": 0.16501650165016502, "grad_norm": 0.13002969324588776, "kl": 0.0044384002685546875, "learning_rate": 2.9672214011007086e-06, "loss": 0.0874, "reward": 0.583333358168602, "reward_std": 0.3061862215399742, "rewards/accuracy_reward": 0.583333358168602, "rewards/format_reward": 0.0, "step": 25 }, { "completion_length": 810.7916870117188, "epoch": 0.1716171617161716, "grad_norm": 0.23493841290473938, "kl": 0.008636474609375, "learning_rate": 2.959567305869736e-06, "loss": 0.0438, "reward": 0.4583333432674408, "reward_std": 0.32274864614009857, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 0.0, "step": 26 }, { "completion_length": 814.3750152587891, "epoch": 0.1782178217821782, "grad_norm": 0.319170206785202, "kl": 0.007965087890625, "learning_rate": 2.951122829194296e-06, "loss": -0.0679, "reward": 0.5833333507180214, "reward_std": 0.24859580025076866, "rewards/accuracy_reward": 0.5833333507180214, "rewards/format_reward": 0.0, "step": 27 }, { "completion_length": 758.1041870117188, "epoch": 0.1848184818481848, "grad_norm": 0.27551642060279846, "kl": 0.007470130920410156, "learning_rate": 2.9418925439074784e-06, "loss": 0.0304, "reward": 0.6875000298023224, "reward_std": 0.3842546343803406, "rewards/accuracy_reward": 0.6875000298023224, "rewards/format_reward": 0.0, "step": 28 }, { "completion_length": 950.4375152587891, "epoch": 0.19141914191419143, "grad_norm": 0.10155142843723297, "kl": 0.021270751953125, "learning_rate": 2.9318814483715983e-06, "loss": 0.0413, "reward": 0.3541666679084301, "reward_std": 0.13301505148410797, "rewards/accuracy_reward": 0.3541666679084301, "rewards/format_reward": 0.0, "step": 29 }, { "completion_length": 718.0208587646484, "epoch": 0.19801980198019803, "grad_norm": 0.25280967354774475, "kl": 0.01032257080078125, "learning_rate": 2.921094963771494e-06, "loss": -0.0332, "reward": 0.5416666772216558, "reward_std": 0.2861081622540951, "rewards/accuracy_reward": 0.5416666772216558, "rewards/format_reward": 0.0, "step": 30 }, { "completion_length": 836.3333587646484, "epoch": 0.20462046204620463, "grad_norm": 0.42321765422821045, "kl": 0.0301666259765625, "learning_rate": 2.9095389311788626e-06, "loss": -0.0053, "reward": 0.4583333544433117, "reward_std": 0.3506578803062439, "rewards/accuracy_reward": 0.4583333544433117, "rewards/format_reward": 0.0, "step": 31 }, { "completion_length": 704.520866394043, "epoch": 0.21122112211221122, "grad_norm": 0.23243634402751923, "kl": 0.015577316284179688, "learning_rate": 2.8972196083892137e-06, "loss": 0.0638, "reward": 0.6666666865348816, "reward_std": 0.3035424277186394, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 32 }, { "completion_length": 819.6458587646484, "epoch": 0.21782178217821782, "grad_norm": 0.2788753807544708, "kl": 0.017120361328125, "learning_rate": 2.8841436665331635e-06, "loss": 0.1618, "reward": 0.583333358168602, "reward_std": 0.4326418936252594, "rewards/accuracy_reward": 0.583333358168602, "rewards/format_reward": 0.0, "step": 33 }, { "completion_length": 953.5417175292969, "epoch": 0.22442244224422442, "grad_norm": 0.168825164437294, "kl": 0.0284271240234375, "learning_rate": 2.8703181864639013e-06, "loss": 0.1058, "reward": 0.4375000149011612, "reward_std": 0.43655748665332794, "rewards/accuracy_reward": 0.4375000149011612, "rewards/format_reward": 0.0, "step": 34 }, { "completion_length": 712.7916870117188, "epoch": 0.23102310231023102, "grad_norm": 0.3284476101398468, "kl": 0.02169036865234375, "learning_rate": 2.855750654922781e-06, "loss": 0.0903, "reward": 0.708333358168602, "reward_std": 0.3236205019056797, "rewards/accuracy_reward": 0.708333358168602, "rewards/format_reward": 0.0, "step": 35 }, { "completion_length": 975.9791870117188, "epoch": 0.2376237623762376, "grad_norm": 0.11752771586179733, "kl": 0.04449462890625, "learning_rate": 2.8404489604851183e-06, "loss": 0.0141, "reward": 0.3125000074505806, "reward_std": 0.3720077723264694, "rewards/accuracy_reward": 0.3125000074505806, "rewards/format_reward": 0.0, "step": 36 }, { "completion_length": 727.8125, "epoch": 0.24422442244224424, "grad_norm": 0.20371825993061066, "kl": 0.0565185546875, "learning_rate": 2.8244213892883906e-06, "loss": 0.0715, "reward": 0.6666667014360428, "reward_std": 0.350657869130373, "rewards/accuracy_reward": 0.6666667014360428, "rewards/format_reward": 0.0, "step": 37 }, { "completion_length": 971.9375305175781, "epoch": 0.2508250825082508, "grad_norm": 0.10167910903692245, "kl": 0.0328369140625, "learning_rate": 2.8076766205451433e-06, "loss": 0.0277, "reward": 0.39583334885537624, "reward_std": 0.21764282882213593, "rewards/accuracy_reward": 0.39583334885537624, "rewards/format_reward": 0.0, "step": 38 }, { "completion_length": 996.3750305175781, "epoch": 0.25742574257425743, "grad_norm": 0.10488853603601456, "kl": 0.072265625, "learning_rate": 2.7902237218430485e-06, "loss": 0.0558, "reward": 0.3958333432674408, "reward_std": 0.29962684214115143, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 0.0, "step": 39 }, { "completion_length": 786.3125152587891, "epoch": 0.264026402640264, "grad_norm": 0.11269073933362961, "kl": 0.0630340576171875, "learning_rate": 2.772072144234639e-06, "loss": 0.0412, "reward": 0.5833333432674408, "reward_std": 0.16661179438233376, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "step": 40 }, { "completion_length": 884.9375305175781, "epoch": 0.2706270627062706, "grad_norm": 0.1405339539051056, "kl": 0.0887451171875, "learning_rate": 2.753231717119405e-06, "loss": 0.1139, "reward": 0.6458333432674408, "reward_std": 0.44616057723760605, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "step": 41 }, { "completion_length": 861.7291870117188, "epoch": 0.27722772277227725, "grad_norm": 0.11663912236690521, "kl": 0.08587646484375, "learning_rate": 2.7337126429209934e-06, "loss": 0.1666, "reward": 0.5833333544433117, "reward_std": 0.23116152733564377, "rewards/accuracy_reward": 0.5833333544433117, "rewards/format_reward": 0.0, "step": 42 }, { "completion_length": 920.8958587646484, "epoch": 0.2838283828382838, "grad_norm": 0.1270524263381958, "kl": 0.1448974609375, "learning_rate": 2.713525491562421e-06, "loss": 0.1552, "reward": 0.5208333432674408, "reward_std": 0.28219256550073624, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.0, "step": 43 }, { "completion_length": 1177.687515258789, "epoch": 0.29042904290429045, "grad_norm": 0.17139725387096405, "kl": 0.176513671875, "learning_rate": 2.6926811947422717e-06, "loss": 0.0787, "reward": 0.29166666977107525, "reward_std": 0.2957112528383732, "rewards/accuracy_reward": 0.29166666977107525, "rewards/format_reward": 0.0, "step": 44 }, { "completion_length": 951.2500305175781, "epoch": 0.297029702970297, "grad_norm": 0.11752501130104065, "kl": 0.1422119140625, "learning_rate": 2.671191040014989e-06, "loss": 0.1136, "reward": 0.4375000149011612, "reward_std": 0.309229951351881, "rewards/accuracy_reward": 0.4375000149011612, "rewards/format_reward": 0.0, "step": 45 }, { "completion_length": 922.6458435058594, "epoch": 0.30363036303630364, "grad_norm": 0.16094517707824707, "kl": 0.15997314453125, "learning_rate": 2.649066664678467e-06, "loss": 0.0865, "reward": 0.416666679084301, "reward_std": 0.32274864614009857, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.0, "step": 46 }, { "completion_length": 713.6458740234375, "epoch": 0.3102310231023102, "grad_norm": 0.17246519029140472, "kl": 0.08984375, "learning_rate": 2.626320049472249e-06, "loss": 0.1438, "reward": 0.5000000260770321, "reward_std": 0.3332235999405384, "rewards/accuracy_reward": 0.5000000260770321, "rewards/format_reward": 0.0, "step": 47 }, { "completion_length": 924.0208435058594, "epoch": 0.31683168316831684, "grad_norm": 0.13788333535194397, "kl": 0.11041259765625, "learning_rate": 2.6029635120897432e-06, "loss": 0.1128, "reward": 0.3125000111758709, "reward_std": 0.40168891102075577, "rewards/accuracy_reward": 0.3125000111758709, "rewards/format_reward": 0.0, "step": 48 }, { "completion_length": 1077.2292175292969, "epoch": 0.3234323432343234, "grad_norm": 0.12129193544387817, "kl": 0.18906784057617188, "learning_rate": 2.5790097005079765e-06, "loss": 0.0767, "reward": 0.3125000037252903, "reward_std": 0.2350771278142929, "rewards/accuracy_reward": 0.3125000037252903, "rewards/format_reward": 0.0, "step": 49 }, { "completion_length": 866.0416870117188, "epoch": 0.33003300330033003, "grad_norm": 0.12490338832139969, "kl": 0.1341552734375, "learning_rate": 2.5544715861384928e-06, "loss": 0.1316, "reward": 0.41666667722165585, "reward_std": 0.2957112640142441, "rewards/accuracy_reward": 0.41666667722165585, "rewards/format_reward": 0.0, "step": 50 }, { "completion_length": 851.5208587646484, "epoch": 0.33663366336633666, "grad_norm": 0.1823125034570694, "kl": 0.0992431640625, "learning_rate": 2.529362456803101e-06, "loss": 0.0959, "reward": 0.4791666828095913, "reward_std": 0.43655748665332794, "rewards/accuracy_reward": 0.4791666828095913, "rewards/format_reward": 0.0, "step": 51 }, { "completion_length": 1083.4583435058594, "epoch": 0.3432343234323432, "grad_norm": 0.12086265534162521, "kl": 0.25439453125, "learning_rate": 2.5036959095382875e-06, "loss": 0.1457, "reward": 0.16666667349636555, "reward_std": 0.24859580397605896, "rewards/accuracy_reward": 0.16666667349636555, "rewards/format_reward": 0.0, "step": 52 }, { "completion_length": 663.4583587646484, "epoch": 0.34983498349834985, "grad_norm": 0.1067744717001915, "kl": 0.0916290283203125, "learning_rate": 2.477485843232183e-06, "loss": 0.1077, "reward": 0.6666666716337204, "reward_std": 0.11949635669589043, "rewards/accuracy_reward": 0.6666666716337204, "rewards/format_reward": 0.0, "step": 53 }, { "completion_length": 1239.0834045410156, "epoch": 0.3564356435643564, "grad_norm": 0.1427147090435028, "kl": 0.22607421875, "learning_rate": 2.4507464510980654e-06, "loss": 0.1453, "reward": 0.22916667722165585, "reward_std": 0.23507710918784142, "rewards/accuracy_reward": 0.22916667722165585, "rewards/format_reward": 0.0, "step": 54 }, { "completion_length": 835.8750152587891, "epoch": 0.36303630363036304, "grad_norm": 0.171806201338768, "kl": 0.1243896484375, "learning_rate": 2.4234922129884873e-06, "loss": 0.1779, "reward": 0.5625000055879354, "reward_std": 0.31970490142703056, "rewards/accuracy_reward": 0.5625000055879354, "rewards/format_reward": 0.0, "step": 55 }, { "completion_length": 1060.2916870117188, "epoch": 0.3696369636963696, "grad_norm": 0.11252501606941223, "kl": 0.179443359375, "learning_rate": 2.3957378875541795e-06, "loss": 0.1761, "reward": 0.3333333432674408, "reward_std": 0.30354245379567146, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 56 }, { "completion_length": 1031.6458435058594, "epoch": 0.37623762376237624, "grad_norm": 0.08250569552183151, "kl": 0.1865234375, "learning_rate": 2.36749850425198e-06, "loss": 0.0708, "reward": 0.33333334140479565, "reward_std": 0.31314554437994957, "rewards/accuracy_reward": 0.33333334140479565, "rewards/format_reward": 0.0, "step": 57 }, { "completion_length": 948.4583587646484, "epoch": 0.38283828382838286, "grad_norm": 0.07917948067188263, "kl": 0.1787109375, "learning_rate": 2.3387893552061204e-06, "loss": 0.0915, "reward": 0.35416666977107525, "reward_std": 0.1705274023115635, "rewards/accuracy_reward": 0.35416666977107525, "rewards/format_reward": 0.0, "step": 58 }, { "completion_length": 972.2083587646484, "epoch": 0.38943894389438943, "grad_norm": 0.1615315079689026, "kl": 0.1478271484375, "learning_rate": 2.3096259869272697e-06, "loss": 0.0861, "reward": 0.4375000223517418, "reward_std": 0.40168892592191696, "rewards/accuracy_reward": 0.4375000223517418, "rewards/format_reward": 0.0, "step": 59 }, { "completion_length": 922.2708587646484, "epoch": 0.39603960396039606, "grad_norm": 0.1217813566327095, "kl": 0.1329345703125, "learning_rate": 2.280024191893823e-06, "loss": 0.1169, "reward": 0.4583333507180214, "reward_std": 0.2957112640142441, "rewards/accuracy_reward": 0.4583333507180214, "rewards/format_reward": 0.0, "step": 60 }, { "completion_length": 923.3750152587891, "epoch": 0.40264026402640263, "grad_norm": 0.1521437168121338, "kl": 0.11083984375, "learning_rate": 2.25e-06, "loss": 0.122, "reward": 0.5625000149011612, "reward_std": 0.40168890357017517, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "step": 61 }, { "completion_length": 776.1250152587891, "epoch": 0.40924092409240925, "grad_norm": 0.15655189752578735, "kl": 0.13427734375, "learning_rate": 2.2195696698753695e-06, "loss": 0.0472, "reward": 0.3333333395421505, "reward_std": 0.3680921792984009, "rewards/accuracy_reward": 0.3333333395421505, "rewards/format_reward": 0.0, "step": 62 }, { "completion_length": 927.9792022705078, "epoch": 0.4158415841584158, "grad_norm": 0.08691170066595078, "kl": 0.1297607421875, "learning_rate": 2.1887496800805174e-06, "loss": 0.0569, "reward": 0.5208333507180214, "reward_std": 0.2350771315395832, "rewards/accuracy_reward": 0.5208333507180214, "rewards/format_reward": 0.0, "step": 63 }, { "completion_length": 1035.8125, "epoch": 0.42244224422442245, "grad_norm": 0.10468967258930206, "kl": 0.16912841796875, "learning_rate": 2.157556720183616e-06, "loss": 0.009, "reward": 0.2708333395421505, "reward_std": 0.2996268570423126, "rewards/accuracy_reward": 0.2708333395421505, "rewards/format_reward": 0.0, "step": 64 }, { "completion_length": 891.8333435058594, "epoch": 0.429042904290429, "grad_norm": 0.09926916658878326, "kl": 0.11190414428710938, "learning_rate": 2.126007681722727e-06, "loss": 0.0617, "reward": 0.5000000204890966, "reward_std": 0.2861081399023533, "rewards/accuracy_reward": 0.5000000204890966, "rewards/format_reward": 0.0, "step": 65 }, { "completion_length": 756.2708587646484, "epoch": 0.43564356435643564, "grad_norm": 0.14332562685012817, "kl": 0.09716796875, "learning_rate": 2.0941196490587354e-06, "loss": 0.0435, "reward": 0.5000000298023224, "reward_std": 0.2686738669872284, "rewards/accuracy_reward": 0.5000000298023224, "rewards/format_reward": 0.0, "step": 66 }, { "completion_length": 889.8125228881836, "epoch": 0.44224422442244227, "grad_norm": 0.16792258620262146, "kl": 0.1457672119140625, "learning_rate": 2.061909890123868e-06, "loss": 0.1384, "reward": 0.5416666865348816, "reward_std": 0.3506578765809536, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 0.0, "step": 67 }, { "completion_length": 870.2292098999023, "epoch": 0.44884488448844884, "grad_norm": 0.1451350599527359, "kl": 0.1453857421875, "learning_rate": 2.0293958470708033e-06, "loss": 0.0586, "reward": 0.3750000149011612, "reward_std": 0.23899272456765175, "rewards/accuracy_reward": 0.3750000149011612, "rewards/format_reward": 0.0, "step": 68 }, { "completion_length": 931.5625305175781, "epoch": 0.45544554455445546, "grad_norm": 0.17694266140460968, "kl": 0.2371826171875, "learning_rate": 1.9965951268274372e-06, "loss": 0.1671, "reward": 0.3750000149011612, "reward_std": 0.26603008806705475, "rewards/accuracy_reward": 0.3750000149011612, "rewards/format_reward": 0.0, "step": 69 }, { "completion_length": 767.4167022705078, "epoch": 0.46204620462046203, "grad_norm": 0.13988901674747467, "kl": 0.1268310546875, "learning_rate": 1.963525491562421e-06, "loss": -0.0182, "reward": 0.5833333432674408, "reward_std": 0.2957112491130829, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "step": 70 }, { "completion_length": 945.3125305175781, "epoch": 0.46864686468646866, "grad_norm": 0.17755043506622314, "kl": 0.244384765625, "learning_rate": 1.9302048490666355e-06, "loss": 0.1788, "reward": 0.3541666828095913, "reward_std": 0.4662386551499367, "rewards/accuracy_reward": 0.3541666828095913, "rewards/format_reward": 0.0, "step": 71 }, { "completion_length": 935.2083435058594, "epoch": 0.4752475247524752, "grad_norm": 0.14559805393218994, "kl": 0.17041015625, "learning_rate": 1.8966512430558036e-06, "loss": 0.1942, "reward": 0.416666679084301, "reward_std": 0.4596792608499527, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.0, "step": 72 }, { "completion_length": 908.1458587646484, "epoch": 0.48184818481848185, "grad_norm": 0.11466598510742188, "kl": 0.2431640625, "learning_rate": 1.8628828433995015e-06, "loss": -0.0209, "reward": 0.31250001676380634, "reward_std": 0.21764283999800682, "rewards/accuracy_reward": 0.31250001676380634, "rewards/format_reward": 0.0, "step": 73 }, { "completion_length": 868.7917022705078, "epoch": 0.4884488448844885, "grad_norm": 0.11173044145107269, "kl": 0.1383056640625, "learning_rate": 1.828917936281855e-06, "loss": 0.0838, "reward": 0.35416667722165585, "reward_std": 0.1530931033194065, "rewards/accuracy_reward": 0.35416667722165585, "rewards/format_reward": 0.0, "step": 74 }, { "completion_length": 933.4583587646484, "epoch": 0.49504950495049505, "grad_norm": 0.12861062586307526, "kl": 0.17724609375, "learning_rate": 1.7947749142992453e-06, "loss": 0.0587, "reward": 0.2291666716337204, "reward_std": 0.35457348823547363, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 75 }, { "completion_length": 924.4166717529297, "epoch": 0.5016501650165016, "grad_norm": 0.09584160894155502, "kl": 0.221435546875, "learning_rate": 1.7604722665003958e-06, "loss": 0.0786, "reward": 0.16666667722165585, "reward_std": 0.18404607474803925, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 76 }, { "completion_length": 1081.0208435058594, "epoch": 0.5082508250825083, "grad_norm": 0.11683700233697891, "kl": 0.1298828125, "learning_rate": 1.7260285683742248e-06, "loss": 0.1236, "reward": 0.416666679084301, "reward_std": 0.3680921830236912, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.0, "step": 77 }, { "completion_length": 978.3333435058594, "epoch": 0.5148514851485149, "grad_norm": 0.09451648592948914, "kl": 0.1480712890625, "learning_rate": 1.6914624717908924e-06, "loss": 0.0095, "reward": 0.2708333395421505, "reward_std": 0.34674229472875595, "rewards/accuracy_reward": 0.2708333395421505, "rewards/format_reward": 0.0, "step": 78 }, { "completion_length": 1018.5417175292969, "epoch": 0.5214521452145214, "grad_norm": 0.10737847536802292, "kl": 0.217529296875, "learning_rate": 1.6567926949014804e-06, "loss": 0.0826, "reward": 0.3750000074505806, "reward_std": 0.24859581142663956, "rewards/accuracy_reward": 0.3750000074505806, "rewards/format_reward": 0.0, "step": 79 }, { "completion_length": 893.5000457763672, "epoch": 0.528052805280528, "grad_norm": 0.1193479523062706, "kl": 0.171875, "learning_rate": 1.6220380120017874e-06, "loss": 0.045, "reward": 0.43750002048909664, "reward_std": 0.35457346960902214, "rewards/accuracy_reward": 0.43750002048909664, "rewards/format_reward": 0.0, "step": 80 }, { "completion_length": 772.6041717529297, "epoch": 0.5346534653465347, "grad_norm": 0.1544453501701355, "kl": 0.15875244140625, "learning_rate": 1.5872172433657137e-06, "loss": -0.0317, "reward": 0.5416666828095913, "reward_std": 0.3332236036658287, "rewards/accuracy_reward": 0.5416666828095913, "rewards/format_reward": 0.0, "step": 81 }, { "completion_length": 817.8750305175781, "epoch": 0.5412541254125413, "grad_norm": 0.08425965160131454, "kl": 0.09405517578125, "learning_rate": 1.5523492450537518e-06, "loss": 0.0603, "reward": 0.5000000055879354, "reward_std": 0.23899271711707115, "rewards/accuracy_reward": 0.5000000055879354, "rewards/format_reward": 0.0, "step": 82 }, { "completion_length": 883.8542175292969, "epoch": 0.5478547854785478, "grad_norm": 0.14589935541152954, "kl": 0.218994140625, "learning_rate": 1.5174528987020958e-06, "loss": 0.1737, "reward": 0.3750000037252903, "reward_std": 0.4248107075691223, "rewards/accuracy_reward": 0.3750000037252903, "rewards/format_reward": 0.0, "step": 83 }, { "completion_length": 1053.2083740234375, "epoch": 0.5544554455445545, "grad_norm": 0.09967659413814545, "kl": 0.287841796875, "learning_rate": 1.4825471012979047e-06, "loss": -0.0045, "reward": 0.25000000558793545, "reward_std": 0.10206207260489464, "rewards/accuracy_reward": 0.25000000558793545, "rewards/format_reward": 0.0, "step": 84 }, { "completion_length": 1129.2083587646484, "epoch": 0.5610561056105611, "grad_norm": 0.0934222936630249, "kl": 0.2568359375, "learning_rate": 1.4476507549462489e-06, "loss": 0.0961, "reward": 0.37500001303851604, "reward_std": 0.2957112528383732, "rewards/accuracy_reward": 0.37500001303851604, "rewards/format_reward": 0.0, "step": 85 }, { "completion_length": 904.125, "epoch": 0.5676567656765676, "grad_norm": 0.13640955090522766, "kl": 0.2188720703125, "learning_rate": 1.4127827566342864e-06, "loss": 0.0729, "reward": 0.354166679084301, "reward_std": 0.3720077723264694, "rewards/accuracy_reward": 0.354166679084301, "rewards/format_reward": 0.0, "step": 86 }, { "completion_length": 931.2291870117188, "epoch": 0.5742574257425742, "grad_norm": 0.21060959994792938, "kl": 0.2344970703125, "learning_rate": 1.3779619879982127e-06, "loss": 0.1297, "reward": 0.4583333507180214, "reward_std": 0.3506578877568245, "rewards/accuracy_reward": 0.4583333507180214, "rewards/format_reward": 0.0, "step": 87 }, { "completion_length": 997.7916717529297, "epoch": 0.5808580858085809, "grad_norm": 0.14985467493534088, "kl": 0.306396484375, "learning_rate": 1.3432073050985201e-06, "loss": 0.0972, "reward": 0.25, "reward_std": 0.1369306445121765, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 88 }, { "completion_length": 931.5833587646484, "epoch": 0.5874587458745875, "grad_norm": 0.33125752210617065, "kl": 0.2950439453125, "learning_rate": 1.308537528209108e-06, "loss": 0.1418, "reward": 0.5208333507180214, "reward_std": 0.2900237627327442, "rewards/accuracy_reward": 0.5208333507180214, "rewards/format_reward": 0.0, "step": 89 }, { "completion_length": 1021.1666870117188, "epoch": 0.594059405940594, "grad_norm": 0.16106674075126648, "kl": 0.379638671875, "learning_rate": 1.2739714316257753e-06, "loss": 0.0325, "reward": 0.3333333432674408, "reward_std": 0.3776952549815178, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 90 }, { "completion_length": 988.3125305175781, "epoch": 0.6006600660066007, "grad_norm": 16.83741569519043, "kl": 0.857177734375, "learning_rate": 1.2395277334996047e-06, "loss": 0.1811, "reward": 0.27083334140479565, "reward_std": 0.37465154752135277, "rewards/accuracy_reward": 0.27083334140479565, "rewards/format_reward": 0.0, "step": 91 }, { "completion_length": 828.8125305175781, "epoch": 0.6072607260726073, "grad_norm": 0.5444411039352417, "kl": 0.21484375, "learning_rate": 1.2052250857007548e-06, "loss": 0.1801, "reward": 0.5625000223517418, "reward_std": 0.38161084055900574, "rewards/accuracy_reward": 0.5625000223517418, "rewards/format_reward": 0.0, "step": 92 }, { "completion_length": 1117.7917022705078, "epoch": 0.6138613861386139, "grad_norm": 0.33171162009239197, "kl": 0.449462890625, "learning_rate": 1.1710820637181448e-06, "loss": 0.2095, "reward": 0.2500000111758709, "reward_std": 0.26603008806705475, "rewards/accuracy_reward": 0.2500000111758709, "rewards/format_reward": 0.0, "step": 93 }, { "completion_length": 825.5208740234375, "epoch": 0.6204620462046204, "grad_norm": 0.30983299016952515, "kl": 0.7236328125, "learning_rate": 1.1371171566004986e-06, "loss": 0.0181, "reward": 0.3333333432674408, "reward_std": 0.20148035883903503, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 94 }, { "completion_length": 973.3750152587891, "epoch": 0.6270627062706271, "grad_norm": 0.7808173894882202, "kl": 0.62158203125, "learning_rate": 1.103348756944197e-06, "loss": 0.1762, "reward": 0.3750000037252903, "reward_std": 0.4422450140118599, "rewards/accuracy_reward": 0.3750000037252903, "rewards/format_reward": 0.0, "step": 95 }, { "completion_length": 991.2708587646484, "epoch": 0.6336633663366337, "grad_norm": 0.6956799030303955, "kl": 0.50244140625, "learning_rate": 1.069795150933365e-06, "loss": 0.1548, "reward": 0.2291666753590107, "reward_std": 0.2996268570423126, "rewards/accuracy_reward": 0.2291666753590107, "rewards/format_reward": 0.0, "step": 96 }, { "completion_length": 845.6041870117188, "epoch": 0.6402640264026402, "grad_norm": 0.6455105543136597, "kl": 0.8310546875, "learning_rate": 1.036474508437579e-06, "loss": 0.0902, "reward": 0.2916666716337204, "reward_std": 0.11949635669589043, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 97 }, { "completion_length": 961.2708587646484, "epoch": 0.6468646864686468, "grad_norm": 1.0175178050994873, "kl": 1.361572265625, "learning_rate": 1.003404873172563e-06, "loss": 0.2628, "reward": 0.2916666716337204, "reward_std": 0.3506578765809536, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 98 }, { "completion_length": 801.7500152587891, "epoch": 0.6534653465346535, "grad_norm": 0.9493899941444397, "kl": 1.8583984375, "learning_rate": 9.70604152929197e-07, "loss": 0.1342, "reward": 0.27083333395421505, "reward_std": 0.33713919296860695, "rewards/accuracy_reward": 0.27083333395421505, "rewards/format_reward": 0.0, "step": 99 }, { "completion_length": 1018.9583587646484, "epoch": 0.6600660066006601, "grad_norm": 0.641622006893158, "kl": 1.20654296875, "learning_rate": 9.380901098761319e-07, "loss": 0.1288, "reward": 0.354166679084301, "reward_std": 0.21764283999800682, "rewards/accuracy_reward": 0.354166679084301, "rewards/format_reward": 0.0, "step": 100 }, { "completion_length": 991.6666870117188, "epoch": 0.6666666666666666, "grad_norm": 3.699712038040161, "kl": 2.2578125, "learning_rate": 9.058803509412648e-07, "loss": 0.1524, "reward": 0.20833334140479565, "reward_std": 0.2686738781630993, "rewards/accuracy_reward": 0.20833334140479565, "rewards/format_reward": 0.0, "step": 101 }, { "completion_length": 851.1458740234375, "epoch": 0.6732673267326733, "grad_norm": 0.7072895765304565, "kl": 0.2276611328125, "learning_rate": 8.739923182772733e-07, "loss": 0.0363, "reward": 0.5000000074505806, "reward_std": 0.341054804623127, "rewards/accuracy_reward": 0.5000000074505806, "rewards/format_reward": 0.0, "step": 102 }, { "completion_length": 609.0000152587891, "epoch": 0.6798679867986799, "grad_norm": 1.4843007326126099, "kl": 0.269775390625, "learning_rate": 8.424432798163837e-07, "loss": 0.03, "reward": 0.604166679084301, "reward_std": 0.28219255805015564, "rewards/accuracy_reward": 0.604166679084301, "rewards/format_reward": 0.0, "step": 103 }, { "completion_length": 905.5625152587891, "epoch": 0.6864686468646864, "grad_norm": 1.7519278526306152, "kl": 0.4345703125, "learning_rate": 8.112503199194822e-07, "loss": 0.026, "reward": 0.20833333767950535, "reward_std": 0.3602609820663929, "rewards/accuracy_reward": 0.20833333767950535, "rewards/format_reward": 0.0, "step": 104 }, { "completion_length": 880.8541870117188, "epoch": 0.693069306930693, "grad_norm": 2.7430357933044434, "kl": 2.5283203125, "learning_rate": 7.804303301246311e-07, "loss": 0.212, "reward": 0.31250001303851604, "reward_std": 0.21764283999800682, "rewards/accuracy_reward": 0.31250001303851604, "rewards/format_reward": 0.0, "step": 105 }, { "completion_length": 1010.9792022705078, "epoch": 0.6996699669966997, "grad_norm": 1.1869699954986572, "kl": 2.6875, "learning_rate": 7.500000000000003e-07, "loss": 0.2102, "reward": 0.37500001676380634, "reward_std": 0.3131455294787884, "rewards/accuracy_reward": 0.37500001676380634, "rewards/format_reward": 0.0, "step": 106 }, { "completion_length": 1013.1875305175781, "epoch": 0.7062706270627063, "grad_norm": 1.6787883043289185, "kl": 1.580078125, "learning_rate": 7.19975808106177e-07, "loss": 0.215, "reward": 0.2500000074505806, "reward_std": 0.31314554065465927, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 107 }, { "completion_length": 818.2917022705078, "epoch": 0.7128712871287128, "grad_norm": 1.6915416717529297, "kl": 0.703125, "learning_rate": 6.903740130727312e-07, "loss": 0.1459, "reward": 0.5000000149011612, "reward_std": 0.33057980239391327, "rewards/accuracy_reward": 0.5000000149011612, "rewards/format_reward": 0.0, "step": 108 }, { "completion_length": 1075.5833740234375, "epoch": 0.7194719471947195, "grad_norm": 0.8402836918830872, "kl": 0.9658203125, "learning_rate": 6.6121064479388e-07, "loss": 0.0926, "reward": 0.2916666716337204, "reward_std": 0.19364918768405914, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 109 }, { "completion_length": 848.4375305175781, "epoch": 0.7260726072607261, "grad_norm": 4.569257736206055, "kl": 0.77294921875, "learning_rate": 6.325014957480202e-07, "loss": 0.221, "reward": 0.5416666865348816, "reward_std": 0.4422449842095375, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 0.0, "step": 110 }, { "completion_length": 891.8750381469727, "epoch": 0.7326732673267327, "grad_norm": 1.430153489112854, "kl": 1.53662109375, "learning_rate": 6.04262112445821e-07, "loss": 0.1013, "reward": 0.43750000558793545, "reward_std": 0.2350771203637123, "rewards/accuracy_reward": 0.43750000558793545, "rewards/format_reward": 0.0, "step": 111 }, { "completion_length": 755.6458435058594, "epoch": 0.7392739273927392, "grad_norm": 0.3871193826198578, "kl": 0.632415771484375, "learning_rate": 5.765077870115125e-07, "loss": 0.0524, "reward": 0.43750002048909664, "reward_std": 0.21764283627271652, "rewards/accuracy_reward": 0.43750002048909664, "rewards/format_reward": 0.0, "step": 112 }, { "completion_length": 899.6875305175781, "epoch": 0.7458745874587459, "grad_norm": 2.0980653762817383, "kl": 0.4720458984375, "learning_rate": 5.492535489019345e-07, "loss": 0.1302, "reward": 0.41666667349636555, "reward_std": 0.3332235962152481, "rewards/accuracy_reward": 0.41666667349636555, "rewards/format_reward": 0.0, "step": 113 }, { "completion_length": 769.0417022705078, "epoch": 0.7524752475247525, "grad_norm": 2.5095949172973633, "kl": 0.71484375, "learning_rate": 5.225141567678172e-07, "loss": 0.1271, "reward": 0.5416666865348816, "reward_std": 0.4500761739909649, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 0.0, "step": 114 }, { "completion_length": 1073.0416870117188, "epoch": 0.759075907590759, "grad_norm": 1.9907251596450806, "kl": 1.24365234375, "learning_rate": 4.963040904617131e-07, "loss": 0.1739, "reward": 0.3333333507180214, "reward_std": 0.350657869130373, "rewards/accuracy_reward": 0.3333333507180214, "rewards/format_reward": 0.0, "step": 115 }, { "completion_length": 935.7500305175781, "epoch": 0.7656765676567657, "grad_norm": 3.745811939239502, "kl": 3.703125, "learning_rate": 4.7063754319689976e-07, "loss": 0.2618, "reward": 0.1458333395421505, "reward_std": 0.23507710546255112, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 116 }, { "completion_length": 1031.5208587646484, "epoch": 0.7722772277227723, "grad_norm": 5.434175968170166, "kl": 3.8812255859375, "learning_rate": 4.4552841386150737e-07, "loss": 0.2507, "reward": 0.22916666977107525, "reward_std": 0.25515518710017204, "rewards/accuracy_reward": 0.22916666977107525, "rewards/format_reward": 0.0, "step": 117 }, { "completion_length": 894.9375305175781, "epoch": 0.7788778877887789, "grad_norm": 2.229215621948242, "kl": 1.484375, "learning_rate": 4.2099029949202353e-07, "loss": 0.2131, "reward": 0.5625000223517418, "reward_std": 0.39208584651350975, "rewards/accuracy_reward": 0.5625000223517418, "rewards/format_reward": 0.0, "step": 118 }, { "completion_length": 884.8333435058594, "epoch": 0.7854785478547854, "grad_norm": 3.206272602081299, "kl": 2.19384765625, "learning_rate": 3.9703648791025716e-07, "loss": 0.121, "reward": 0.2291666753590107, "reward_std": 0.2621144950389862, "rewards/accuracy_reward": 0.2291666753590107, "rewards/format_reward": 0.0, "step": 119 }, { "completion_length": 927.5208587646484, "epoch": 0.7920792079207921, "grad_norm": 1.4413719177246094, "kl": 1.4619140625, "learning_rate": 3.736799505277512e-07, "loss": 0.2241, "reward": 0.3750000074505806, "reward_std": 0.2957112640142441, "rewards/accuracy_reward": 0.3750000074505806, "rewards/format_reward": 0.0, "step": 120 }, { "completion_length": 1030.8333740234375, "epoch": 0.7986798679867987, "grad_norm": 2.288118362426758, "kl": 2.2021484375, "learning_rate": 3.5093333532153313e-07, "loss": 0.2132, "reward": 0.5000000149011612, "reward_std": 0.3332235887646675, "rewards/accuracy_reward": 0.5000000149011612, "rewards/format_reward": 0.0, "step": 121 }, { "completion_length": 795.2291870117188, "epoch": 0.8052805280528053, "grad_norm": 4.600270748138428, "kl": 0.909210205078125, "learning_rate": 3.288089599850112e-07, "loss": 0.1673, "reward": 0.6250000149011612, "reward_std": 0.23116153478622437, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 122 }, { "completion_length": 820.2708740234375, "epoch": 0.8118811881188119, "grad_norm": 1.4757143259048462, "kl": 0.82757568359375, "learning_rate": 3.073188052577282e-07, "loss": 0.0951, "reward": 0.3333333432674408, "reward_std": 0.20412414520978928, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 123 }, { "completion_length": 1093.0417022705078, "epoch": 0.8184818481848185, "grad_norm": 1.748262643814087, "kl": 2.27734375, "learning_rate": 2.86474508437579e-07, "loss": 0.2423, "reward": 0.37500001303851604, "reward_std": 0.4248107150197029, "rewards/accuracy_reward": 0.37500001303851604, "rewards/format_reward": 0.0, "step": 124 }, { "completion_length": 906.8958587646484, "epoch": 0.8250825082508251, "grad_norm": 1.2707668542861938, "kl": 2.072265625, "learning_rate": 2.6628735707900655e-07, "loss": 0.1805, "reward": 0.3750000074505806, "reward_std": 0.3332236036658287, "rewards/accuracy_reward": 0.3750000074505806, "rewards/format_reward": 0.0, "step": 125 }, { "completion_length": 1030.1041717529297, "epoch": 0.8316831683168316, "grad_norm": 1.1277378797531128, "kl": 2.2890625, "learning_rate": 2.467682828805956e-07, "loss": 0.1976, "reward": 0.3125000111758709, "reward_std": 0.28219256177544594, "rewards/accuracy_reward": 0.3125000111758709, "rewards/format_reward": 0.0, "step": 126 }, { "completion_length": 740.7291870117188, "epoch": 0.8382838283828383, "grad_norm": 1.2501320838928223, "kl": 1.093414306640625, "learning_rate": 2.2792785576536108e-07, "loss": 0.1343, "reward": 0.6666666716337204, "reward_std": 0.16661179438233376, "rewards/accuracy_reward": 0.6666666716337204, "rewards/format_reward": 0.0, "step": 127 }, { "completion_length": 924.0625305175781, "epoch": 0.8448844884488449, "grad_norm": 1.4948713779449463, "kl": 1.78173828125, "learning_rate": 2.0977627815695215e-07, "loss": 0.1317, "reward": 0.3333333507180214, "reward_std": 0.23116152361035347, "rewards/accuracy_reward": 0.3333333507180214, "rewards/format_reward": 0.0, "step": 128 }, { "completion_length": 942.5208740234375, "epoch": 0.8514851485148515, "grad_norm": 0.5847099423408508, "kl": 0.99560546875, "learning_rate": 1.9232337945485655e-07, "loss": 0.0936, "reward": 0.4375, "reward_std": 0.06846532225608826, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 129 }, { "completion_length": 917.1666870117188, "epoch": 0.858085808580858, "grad_norm": 0.5804275274276733, "kl": 0.701263427734375, "learning_rate": 1.7557861071160953e-07, "loss": 0.0218, "reward": 0.31250000558793545, "reward_std": 0.2446802258491516, "rewards/accuracy_reward": 0.31250000558793545, "rewards/format_reward": 0.0, "step": 130 }, { "completion_length": 1004.8333587646484, "epoch": 0.8646864686468647, "grad_norm": 0.8942325711250305, "kl": 0.6883544921875, "learning_rate": 1.5955103951488177e-07, "loss": 0.1236, "reward": 0.35416667722165585, "reward_std": 0.36417656019330025, "rewards/accuracy_reward": 0.35416667722165585, "rewards/format_reward": 0.0, "step": 131 }, { "completion_length": 881.7916870117188, "epoch": 0.8712871287128713, "grad_norm": 2.220564365386963, "kl": 0.78369140625, "learning_rate": 1.4424934507721927e-07, "loss": 0.1166, "reward": 0.41666667722165585, "reward_std": 0.2686738669872284, "rewards/accuracy_reward": 0.41666667722165585, "rewards/format_reward": 0.0, "step": 132 }, { "completion_length": 997.3541870117188, "epoch": 0.8778877887788779, "grad_norm": 1.1939600706100464, "kl": 0.900390625, "learning_rate": 1.2968181353609853e-07, "loss": 0.0486, "reward": 0.3125000111758709, "reward_std": 0.1801304891705513, "rewards/accuracy_reward": 0.3125000111758709, "rewards/format_reward": 0.0, "step": 133 }, { "completion_length": 728.3541793823242, "epoch": 0.8844884488448845, "grad_norm": 0.9555450677871704, "kl": 0.58154296875, "learning_rate": 1.1585633346683655e-07, "loss": 0.0358, "reward": 0.45833334885537624, "reward_std": 0.3332235924899578, "rewards/accuracy_reward": 0.45833334885537624, "rewards/format_reward": 0.0, "step": 134 }, { "completion_length": 979.4167175292969, "epoch": 0.8910891089108911, "grad_norm": 0.5510440468788147, "kl": 0.7451171875, "learning_rate": 1.0278039161078634e-07, "loss": 0.0197, "reward": 0.3125000111758709, "reward_std": 0.25515517219901085, "rewards/accuracy_reward": 0.3125000111758709, "rewards/format_reward": 0.0, "step": 135 }, { "completion_length": 975.3750152587891, "epoch": 0.8976897689768977, "grad_norm": 0.4859466552734375, "kl": 0.39111328125, "learning_rate": 9.046106882113752e-08, "loss": 0.0393, "reward": 0.4791666716337204, "reward_std": 0.2525114044547081, "rewards/accuracy_reward": 0.4791666716337204, "rewards/format_reward": 0.0, "step": 136 }, { "completion_length": 873.8333587646484, "epoch": 0.9042904290429042, "grad_norm": 0.8227368593215942, "kl": 0.5703125, "learning_rate": 7.89050362285062e-08, "loss": 0.0192, "reward": 0.4583333507180214, "reward_std": 0.3855264447629452, "rewards/accuracy_reward": 0.4583333507180214, "rewards/format_reward": 0.0, "step": 137 }, { "completion_length": 879.6667022705078, "epoch": 0.9108910891089109, "grad_norm": 2.8329885005950928, "kl": 0.44970703125, "learning_rate": 6.811855162840214e-08, "loss": 0.1331, "reward": 0.4791666753590107, "reward_std": 0.27258947491645813, "rewards/accuracy_reward": 0.4791666753590107, "rewards/format_reward": 0.0, "step": 138 }, { "completion_length": 677.9166870117188, "epoch": 0.9174917491749175, "grad_norm": 2.366013288497925, "kl": 0.3238067626953125, "learning_rate": 5.810745609252166e-08, "loss": 0.1267, "reward": 0.5208333637565374, "reward_std": 0.25515517592430115, "rewards/accuracy_reward": 0.5208333637565374, "rewards/format_reward": 0.0, "step": 139 }, { "completion_length": 1135.9791870117188, "epoch": 0.9240924092409241, "grad_norm": 0.692542314529419, "kl": 1.0966796875, "learning_rate": 4.887717080570431e-08, "loss": 0.1422, "reward": 0.06250000186264515, "reward_std": 0.1530931107699871, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 140 }, { "completion_length": 1054.6875305175781, "epoch": 0.9306930693069307, "grad_norm": 2.231672763824463, "kl": 0.7978515625, "learning_rate": 4.0432694130264294e-08, "loss": 0.166, "reward": 0.29166667722165585, "reward_std": 0.2686738669872284, "rewards/accuracy_reward": 0.29166667722165585, "rewards/format_reward": 0.0, "step": 141 }, { "completion_length": 692.4375152587891, "epoch": 0.9372937293729373, "grad_norm": 0.28133484721183777, "kl": 0.2666015625, "learning_rate": 3.277859889929147e-08, "loss": -0.0049, "reward": 0.3125000149011612, "reward_std": 0.11558076366782188, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 142 }, { "completion_length": 908.3750534057617, "epoch": 0.9438943894389439, "grad_norm": 0.6757098436355591, "kl": 0.28759765625, "learning_rate": 2.5919029940380145e-08, "loss": -0.0196, "reward": 0.3333333507180214, "reward_std": 0.30354244261980057, "rewards/accuracy_reward": 0.3333333507180214, "rewards/format_reward": 0.0, "step": 143 }, { "completion_length": 788.1666793823242, "epoch": 0.9504950495049505, "grad_norm": 2.814443349838257, "kl": 0.331787109375, "learning_rate": 1.985770183113117e-08, "loss": 0.1527, "reward": 0.41666669212281704, "reward_std": 0.3680921457707882, "rewards/accuracy_reward": 0.41666669212281704, "rewards/format_reward": 0.0, "step": 144 }, { "completion_length": 876.6041717529297, "epoch": 0.9570957095709571, "grad_norm": 1.0456154346466064, "kl": 0.763427734375, "learning_rate": 1.4597896887644457e-08, "loss": 0.086, "reward": 0.29166667349636555, "reward_std": 0.16661180555820465, "rewards/accuracy_reward": 0.29166667349636555, "rewards/format_reward": 0.0, "step": 145 }, { "completion_length": 947.5208587646484, "epoch": 0.9636963696369637, "grad_norm": 0.5970892310142517, "kl": 0.9365234375, "learning_rate": 1.0142463387085465e-08, "loss": 0.0589, "reward": 0.1458333358168602, "reward_std": 0.13301505148410797, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 146 }, { "completion_length": 971.2916870117188, "epoch": 0.9702970297029703, "grad_norm": 0.4104056656360626, "kl": 0.8409423828125, "learning_rate": 6.493814025293476e-09, "loss": 0.0431, "reward": 0.33333334140479565, "reward_std": 0.2686738818883896, "rewards/accuracy_reward": 0.33333334140479565, "rewards/format_reward": 0.0, "step": 147 }, { "completion_length": 869.8958587646484, "epoch": 0.976897689768977, "grad_norm": 3.5218846797943115, "kl": 0.5218505859375, "learning_rate": 3.6539246102637037e-09, "loss": 0.1713, "reward": 0.4791666828095913, "reward_std": 0.2996268570423126, "rewards/accuracy_reward": 0.4791666828095913, "rewards/format_reward": 0.0, "step": 148 }, { "completion_length": 775.1875152587891, "epoch": 0.9834983498349835, "grad_norm": 2.006347417831421, "kl": 0.35626220703125, "learning_rate": 1.624332992213151e-09, "loss": 0.1167, "reward": 0.5625000223517418, "reward_std": 0.28219255805015564, "rewards/accuracy_reward": 0.5625000223517418, "rewards/format_reward": 0.0, "step": 149 }, { "completion_length": 690.9166870117188, "epoch": 0.9900990099009901, "grad_norm": 3.488175630569458, "kl": 0.43896484375, "learning_rate": 4.0613823080742907e-10, "loss": 0.094, "reward": 0.6250000149011612, "reward_std": 0.3977733254432678, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 150 }, { "completion_length": 961.7500305175781, "epoch": 0.9966996699669967, "grad_norm": 2.3887975215911865, "kl": 0.55078125, "learning_rate": 0.0, "loss": 0.1768, "reward": 0.3958333507180214, "reward_std": 0.41912319883704185, "rewards/accuracy_reward": 0.3958333507180214, "rewards/format_reward": 0.0, "step": 151 }, { "epoch": 0.9966996699669967, "step": 151, "total_flos": 0.0, "train_loss": 0.09152872039099531, "train_runtime": 28966.5677, "train_samples_per_second": 0.042, "train_steps_per_second": 0.005 } ], "logging_steps": 1, "max_steps": 151, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }