Qwen2.5-Math-7B-cn-zero-class2 / trainer_state.json
watermelonhjg's picture
Upload folder using huggingface_hub
5769136 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9966996699669967,
"eval_steps": 10,
"global_step": 151,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 761.6875305175781,
"epoch": 0.006600660066006601,
"grad_norm": 0.09082216769456863,
"kl": 0.0,
"learning_rate": 1.875e-07,
"loss": -0.0159,
"reward": 0.2291666679084301,
"reward_std": 0.1705273911356926,
"rewards/accuracy_reward": 0.2291666679084301,
"rewards/format_reward": 0.0,
"step": 1
},
{
"completion_length": 897.8541717529297,
"epoch": 0.013201320132013201,
"grad_norm": 0.12771357595920563,
"kl": 0.0,
"learning_rate": 3.75e-07,
"loss": 0.0257,
"reward": 0.3750000111758709,
"reward_std": 0.2861081659793854,
"rewards/accuracy_reward": 0.3750000111758709,
"rewards/format_reward": 0.0,
"step": 2
},
{
"completion_length": 946.2291717529297,
"epoch": 0.019801980198019802,
"grad_norm": 0.202493816614151,
"kl": 0.0002696514129638672,
"learning_rate": 5.625e-07,
"loss": 0.0767,
"reward": 0.43750002048909664,
"reward_std": 0.33713919669389725,
"rewards/accuracy_reward": 0.43750002048909664,
"rewards/format_reward": 0.0,
"step": 3
},
{
"completion_length": 947.2708740234375,
"epoch": 0.026402640264026403,
"grad_norm": 0.13421419262886047,
"kl": 0.00023603439331054688,
"learning_rate": 7.5e-07,
"loss": 0.0437,
"reward": 0.3541666716337204,
"reward_std": 0.4932760149240494,
"rewards/accuracy_reward": 0.3541666716337204,
"rewards/format_reward": 0.0,
"step": 4
},
{
"completion_length": 949.7708587646484,
"epoch": 0.033003300330033,
"grad_norm": 0.15579567849636078,
"kl": 0.0001881122589111328,
"learning_rate": 9.375e-07,
"loss": 0.082,
"reward": 0.583333358168602,
"reward_std": 0.4701542407274246,
"rewards/accuracy_reward": 0.583333358168602,
"rewards/format_reward": 0.0,
"step": 5
},
{
"completion_length": 679.7500305175781,
"epoch": 0.039603960396039604,
"grad_norm": 0.19298173487186432,
"kl": 0.0002353191375732422,
"learning_rate": 1.125e-06,
"loss": -0.055,
"reward": 0.7500000149011612,
"reward_std": 0.3332235999405384,
"rewards/accuracy_reward": 0.7500000149011612,
"rewards/format_reward": 0.0,
"step": 6
},
{
"completion_length": 713.4375305175781,
"epoch": 0.0462046204620462,
"grad_norm": 0.151686891913414,
"kl": 0.00026154518127441406,
"learning_rate": 1.3125000000000001e-06,
"loss": 0.0054,
"reward": 0.416666679084301,
"reward_std": 0.377695269882679,
"rewards/accuracy_reward": 0.416666679084301,
"rewards/format_reward": 0.0,
"step": 7
},
{
"completion_length": 721.6250076293945,
"epoch": 0.052805280528052806,
"grad_norm": 0.0006056017591618001,
"kl": 0.00021958351135253906,
"learning_rate": 1.5e-06,
"loss": 0.0,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"step": 8
},
{
"completion_length": 756.0000228881836,
"epoch": 0.0594059405940594,
"grad_norm": 0.14626246690750122,
"kl": 0.0002689361572265625,
"learning_rate": 1.6875e-06,
"loss": -0.0,
"reward": 0.33333334140479565,
"reward_std": 0.23116152361035347,
"rewards/accuracy_reward": 0.33333334140479565,
"rewards/format_reward": 0.0,
"step": 9
},
{
"completion_length": 755.75,
"epoch": 0.066006600660066,
"grad_norm": 0.14428134262561798,
"kl": 0.00023508071899414062,
"learning_rate": 1.875e-06,
"loss": -0.0386,
"reward": 0.7291666865348816,
"reward_std": 0.21764282882213593,
"rewards/accuracy_reward": 0.7291666865348816,
"rewards/format_reward": 0.0,
"step": 10
},
{
"completion_length": 993.1042022705078,
"epoch": 0.07260726072607261,
"grad_norm": 0.1510692834854126,
"kl": 0.0003027915954589844,
"learning_rate": 2.0625e-06,
"loss": 0.118,
"reward": 0.5208333432674408,
"reward_std": 0.4932760149240494,
"rewards/accuracy_reward": 0.5208333432674408,
"rewards/format_reward": 0.0,
"step": 11
},
{
"completion_length": 831.3125152587891,
"epoch": 0.07920792079207921,
"grad_norm": 0.08587031811475754,
"kl": 0.00022935867309570312,
"learning_rate": 2.25e-06,
"loss": 0.0158,
"reward": 0.708333358168602,
"reward_std": 0.24859580025076866,
"rewards/accuracy_reward": 0.708333358168602,
"rewards/format_reward": 0.0,
"step": 12
},
{
"completion_length": 786.7708435058594,
"epoch": 0.0858085808580858,
"grad_norm": 0.15709738433361053,
"kl": 0.0002751350402832031,
"learning_rate": 2.4375e-06,
"loss": 0.0416,
"reward": 0.4375000149011612,
"reward_std": 0.28219255432486534,
"rewards/accuracy_reward": 0.4375000149011612,
"rewards/format_reward": 0.0,
"step": 13
},
{
"completion_length": 896.3958587646484,
"epoch": 0.0924092409240924,
"grad_norm": 0.1950385421514511,
"kl": 0.0004210472106933594,
"learning_rate": 2.6250000000000003e-06,
"loss": -0.0025,
"reward": 0.416666679084301,
"reward_std": 0.30354245752096176,
"rewards/accuracy_reward": 0.416666679084301,
"rewards/format_reward": 0.0,
"step": 14
},
{
"completion_length": 915.1458587646484,
"epoch": 0.09900990099009901,
"grad_norm": 0.1464419662952423,
"kl": 0.00046825408935546875,
"learning_rate": 2.8125e-06,
"loss": 0.0378,
"reward": 0.5208333358168602,
"reward_std": 0.3720077723264694,
"rewards/accuracy_reward": 0.5208333358168602,
"rewards/format_reward": 0.0,
"step": 15
},
{
"completion_length": 733.4792022705078,
"epoch": 0.10561056105610561,
"grad_norm": 0.195227712392807,
"kl": 0.0011754035949707031,
"learning_rate": 3e-06,
"loss": -0.026,
"reward": 0.645833358168602,
"reward_std": 0.41912318766117096,
"rewards/accuracy_reward": 0.645833358168602,
"rewards/format_reward": 0.0,
"step": 16
},
{
"completion_length": 864.2083740234375,
"epoch": 0.11221122112211221,
"grad_norm": 0.29113319516181946,
"kl": 0.0006542205810546875,
"learning_rate": 2.9995938617691924e-06,
"loss": 0.0484,
"reward": 0.5000000186264515,
"reward_std": 0.4152075983583927,
"rewards/accuracy_reward": 0.5000000186264515,
"rewards/format_reward": 0.0,
"step": 17
},
{
"completion_length": 795.2291870117188,
"epoch": 0.1188118811881188,
"grad_norm": 0.13108719885349274,
"kl": 0.0011267662048339844,
"learning_rate": 2.998375667007787e-06,
"loss": 0.0592,
"reward": 0.6666666865348816,
"reward_std": 0.3035424277186394,
"rewards/accuracy_reward": 0.6666666865348816,
"rewards/format_reward": 0.0,
"step": 18
},
{
"completion_length": 722.3542022705078,
"epoch": 0.1254125412541254,
"grad_norm": 0.17488506436347961,
"kl": 0.0010900497436523438,
"learning_rate": 2.9963460753897363e-06,
"loss": 0.0007,
"reward": 0.6458333432674408,
"reward_std": 0.29962683096528053,
"rewards/accuracy_reward": 0.6458333432674408,
"rewards/format_reward": 0.0,
"step": 19
},
{
"completion_length": 776.3750152587891,
"epoch": 0.132013201320132,
"grad_norm": 0.11859464645385742,
"kl": 0.00139617919921875,
"learning_rate": 2.9935061859747068e-06,
"loss": 0.0307,
"reward": 0.3333333432674408,
"reward_std": 0.26603007316589355,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.0,
"step": 20
},
{
"completion_length": 661.0625152587891,
"epoch": 0.13861386138613863,
"grad_norm": 0.15094302594661713,
"kl": 0.0016927719116210938,
"learning_rate": 2.989857536612915e-06,
"loss": -0.0411,
"reward": 0.770833358168602,
"reward_std": 0.33713918551802635,
"rewards/accuracy_reward": 0.770833358168602,
"rewards/format_reward": 0.0,
"step": 21
},
{
"completion_length": 848.6666870117188,
"epoch": 0.14521452145214522,
"grad_norm": 0.2171953022480011,
"kl": 0.00218963623046875,
"learning_rate": 2.9854021031123555e-06,
"loss": 0.074,
"reward": 0.6041666716337204,
"reward_std": 0.18796169012784958,
"rewards/accuracy_reward": 0.6041666716337204,
"rewards/format_reward": 0.0,
"step": 22
},
{
"completion_length": 764.2083587646484,
"epoch": 0.15181518151815182,
"grad_norm": 0.19823089241981506,
"kl": 0.004238128662109375,
"learning_rate": 2.980142298168869e-06,
"loss": 0.0469,
"reward": 0.35416667722165585,
"reward_std": 0.36417657136917114,
"rewards/accuracy_reward": 0.35416667722165585,
"rewards/format_reward": 0.0,
"step": 23
},
{
"completion_length": 1017.3333435058594,
"epoch": 0.15841584158415842,
"grad_norm": 0.1434755176305771,
"kl": 0.006870269775390625,
"learning_rate": 2.97408097005962e-06,
"loss": 0.0274,
"reward": 0.2500000074505806,
"reward_std": 0.3506578952074051,
"rewards/accuracy_reward": 0.2500000074505806,
"rewards/format_reward": 0.0,
"step": 24
},
{
"completion_length": 750.2917022705078,
"epoch": 0.16501650165016502,
"grad_norm": 0.13002969324588776,
"kl": 0.0044384002685546875,
"learning_rate": 2.9672214011007086e-06,
"loss": 0.0874,
"reward": 0.583333358168602,
"reward_std": 0.3061862215399742,
"rewards/accuracy_reward": 0.583333358168602,
"rewards/format_reward": 0.0,
"step": 25
},
{
"completion_length": 810.7916870117188,
"epoch": 0.1716171617161716,
"grad_norm": 0.23493841290473938,
"kl": 0.008636474609375,
"learning_rate": 2.959567305869736e-06,
"loss": 0.0438,
"reward": 0.4583333432674408,
"reward_std": 0.32274864614009857,
"rewards/accuracy_reward": 0.4583333432674408,
"rewards/format_reward": 0.0,
"step": 26
},
{
"completion_length": 814.3750152587891,
"epoch": 0.1782178217821782,
"grad_norm": 0.319170206785202,
"kl": 0.007965087890625,
"learning_rate": 2.951122829194296e-06,
"loss": -0.0679,
"reward": 0.5833333507180214,
"reward_std": 0.24859580025076866,
"rewards/accuracy_reward": 0.5833333507180214,
"rewards/format_reward": 0.0,
"step": 27
},
{
"completion_length": 758.1041870117188,
"epoch": 0.1848184818481848,
"grad_norm": 0.27551642060279846,
"kl": 0.007470130920410156,
"learning_rate": 2.9418925439074784e-06,
"loss": 0.0304,
"reward": 0.6875000298023224,
"reward_std": 0.3842546343803406,
"rewards/accuracy_reward": 0.6875000298023224,
"rewards/format_reward": 0.0,
"step": 28
},
{
"completion_length": 950.4375152587891,
"epoch": 0.19141914191419143,
"grad_norm": 0.10155142843723297,
"kl": 0.021270751953125,
"learning_rate": 2.9318814483715983e-06,
"loss": 0.0413,
"reward": 0.3541666679084301,
"reward_std": 0.13301505148410797,
"rewards/accuracy_reward": 0.3541666679084301,
"rewards/format_reward": 0.0,
"step": 29
},
{
"completion_length": 718.0208587646484,
"epoch": 0.19801980198019803,
"grad_norm": 0.25280967354774475,
"kl": 0.01032257080078125,
"learning_rate": 2.921094963771494e-06,
"loss": -0.0332,
"reward": 0.5416666772216558,
"reward_std": 0.2861081622540951,
"rewards/accuracy_reward": 0.5416666772216558,
"rewards/format_reward": 0.0,
"step": 30
},
{
"completion_length": 836.3333587646484,
"epoch": 0.20462046204620463,
"grad_norm": 0.42321765422821045,
"kl": 0.0301666259765625,
"learning_rate": 2.9095389311788626e-06,
"loss": -0.0053,
"reward": 0.4583333544433117,
"reward_std": 0.3506578803062439,
"rewards/accuracy_reward": 0.4583333544433117,
"rewards/format_reward": 0.0,
"step": 31
},
{
"completion_length": 704.520866394043,
"epoch": 0.21122112211221122,
"grad_norm": 0.23243634402751923,
"kl": 0.015577316284179688,
"learning_rate": 2.8972196083892137e-06,
"loss": 0.0638,
"reward": 0.6666666865348816,
"reward_std": 0.3035424277186394,
"rewards/accuracy_reward": 0.6666666865348816,
"rewards/format_reward": 0.0,
"step": 32
},
{
"completion_length": 819.6458587646484,
"epoch": 0.21782178217821782,
"grad_norm": 0.2788753807544708,
"kl": 0.017120361328125,
"learning_rate": 2.8841436665331635e-06,
"loss": 0.1618,
"reward": 0.583333358168602,
"reward_std": 0.4326418936252594,
"rewards/accuracy_reward": 0.583333358168602,
"rewards/format_reward": 0.0,
"step": 33
},
{
"completion_length": 953.5417175292969,
"epoch": 0.22442244224422442,
"grad_norm": 0.168825164437294,
"kl": 0.0284271240234375,
"learning_rate": 2.8703181864639013e-06,
"loss": 0.1058,
"reward": 0.4375000149011612,
"reward_std": 0.43655748665332794,
"rewards/accuracy_reward": 0.4375000149011612,
"rewards/format_reward": 0.0,
"step": 34
},
{
"completion_length": 712.7916870117188,
"epoch": 0.23102310231023102,
"grad_norm": 0.3284476101398468,
"kl": 0.02169036865234375,
"learning_rate": 2.855750654922781e-06,
"loss": 0.0903,
"reward": 0.708333358168602,
"reward_std": 0.3236205019056797,
"rewards/accuracy_reward": 0.708333358168602,
"rewards/format_reward": 0.0,
"step": 35
},
{
"completion_length": 975.9791870117188,
"epoch": 0.2376237623762376,
"grad_norm": 0.11752771586179733,
"kl": 0.04449462890625,
"learning_rate": 2.8404489604851183e-06,
"loss": 0.0141,
"reward": 0.3125000074505806,
"reward_std": 0.3720077723264694,
"rewards/accuracy_reward": 0.3125000074505806,
"rewards/format_reward": 0.0,
"step": 36
},
{
"completion_length": 727.8125,
"epoch": 0.24422442244224424,
"grad_norm": 0.20371825993061066,
"kl": 0.0565185546875,
"learning_rate": 2.8244213892883906e-06,
"loss": 0.0715,
"reward": 0.6666667014360428,
"reward_std": 0.350657869130373,
"rewards/accuracy_reward": 0.6666667014360428,
"rewards/format_reward": 0.0,
"step": 37
},
{
"completion_length": 971.9375305175781,
"epoch": 0.2508250825082508,
"grad_norm": 0.10167910903692245,
"kl": 0.0328369140625,
"learning_rate": 2.8076766205451433e-06,
"loss": 0.0277,
"reward": 0.39583334885537624,
"reward_std": 0.21764282882213593,
"rewards/accuracy_reward": 0.39583334885537624,
"rewards/format_reward": 0.0,
"step": 38
},
{
"completion_length": 996.3750305175781,
"epoch": 0.25742574257425743,
"grad_norm": 0.10488853603601456,
"kl": 0.072265625,
"learning_rate": 2.7902237218430485e-06,
"loss": 0.0558,
"reward": 0.3958333432674408,
"reward_std": 0.29962684214115143,
"rewards/accuracy_reward": 0.3958333432674408,
"rewards/format_reward": 0.0,
"step": 39
},
{
"completion_length": 786.3125152587891,
"epoch": 0.264026402640264,
"grad_norm": 0.11269073933362961,
"kl": 0.0630340576171875,
"learning_rate": 2.772072144234639e-06,
"loss": 0.0412,
"reward": 0.5833333432674408,
"reward_std": 0.16661179438233376,
"rewards/accuracy_reward": 0.5833333432674408,
"rewards/format_reward": 0.0,
"step": 40
},
{
"completion_length": 884.9375305175781,
"epoch": 0.2706270627062706,
"grad_norm": 0.1405339539051056,
"kl": 0.0887451171875,
"learning_rate": 2.753231717119405e-06,
"loss": 0.1139,
"reward": 0.6458333432674408,
"reward_std": 0.44616057723760605,
"rewards/accuracy_reward": 0.6458333432674408,
"rewards/format_reward": 0.0,
"step": 41
},
{
"completion_length": 861.7291870117188,
"epoch": 0.27722772277227725,
"grad_norm": 0.11663912236690521,
"kl": 0.08587646484375,
"learning_rate": 2.7337126429209934e-06,
"loss": 0.1666,
"reward": 0.5833333544433117,
"reward_std": 0.23116152733564377,
"rewards/accuracy_reward": 0.5833333544433117,
"rewards/format_reward": 0.0,
"step": 42
},
{
"completion_length": 920.8958587646484,
"epoch": 0.2838283828382838,
"grad_norm": 0.1270524263381958,
"kl": 0.1448974609375,
"learning_rate": 2.713525491562421e-06,
"loss": 0.1552,
"reward": 0.5208333432674408,
"reward_std": 0.28219256550073624,
"rewards/accuracy_reward": 0.5208333432674408,
"rewards/format_reward": 0.0,
"step": 43
},
{
"completion_length": 1177.687515258789,
"epoch": 0.29042904290429045,
"grad_norm": 0.17139725387096405,
"kl": 0.176513671875,
"learning_rate": 2.6926811947422717e-06,
"loss": 0.0787,
"reward": 0.29166666977107525,
"reward_std": 0.2957112528383732,
"rewards/accuracy_reward": 0.29166666977107525,
"rewards/format_reward": 0.0,
"step": 44
},
{
"completion_length": 951.2500305175781,
"epoch": 0.297029702970297,
"grad_norm": 0.11752501130104065,
"kl": 0.1422119140625,
"learning_rate": 2.671191040014989e-06,
"loss": 0.1136,
"reward": 0.4375000149011612,
"reward_std": 0.309229951351881,
"rewards/accuracy_reward": 0.4375000149011612,
"rewards/format_reward": 0.0,
"step": 45
},
{
"completion_length": 922.6458435058594,
"epoch": 0.30363036303630364,
"grad_norm": 0.16094517707824707,
"kl": 0.15997314453125,
"learning_rate": 2.649066664678467e-06,
"loss": 0.0865,
"reward": 0.416666679084301,
"reward_std": 0.32274864614009857,
"rewards/accuracy_reward": 0.416666679084301,
"rewards/format_reward": 0.0,
"step": 46
},
{
"completion_length": 713.6458740234375,
"epoch": 0.3102310231023102,
"grad_norm": 0.17246519029140472,
"kl": 0.08984375,
"learning_rate": 2.626320049472249e-06,
"loss": 0.1438,
"reward": 0.5000000260770321,
"reward_std": 0.3332235999405384,
"rewards/accuracy_reward": 0.5000000260770321,
"rewards/format_reward": 0.0,
"step": 47
},
{
"completion_length": 924.0208435058594,
"epoch": 0.31683168316831684,
"grad_norm": 0.13788333535194397,
"kl": 0.11041259765625,
"learning_rate": 2.6029635120897432e-06,
"loss": 0.1128,
"reward": 0.3125000111758709,
"reward_std": 0.40168891102075577,
"rewards/accuracy_reward": 0.3125000111758709,
"rewards/format_reward": 0.0,
"step": 48
},
{
"completion_length": 1077.2292175292969,
"epoch": 0.3234323432343234,
"grad_norm": 0.12129193544387817,
"kl": 0.18906784057617188,
"learning_rate": 2.5790097005079765e-06,
"loss": 0.0767,
"reward": 0.3125000037252903,
"reward_std": 0.2350771278142929,
"rewards/accuracy_reward": 0.3125000037252903,
"rewards/format_reward": 0.0,
"step": 49
},
{
"completion_length": 866.0416870117188,
"epoch": 0.33003300330033003,
"grad_norm": 0.12490338832139969,
"kl": 0.1341552734375,
"learning_rate": 2.5544715861384928e-06,
"loss": 0.1316,
"reward": 0.41666667722165585,
"reward_std": 0.2957112640142441,
"rewards/accuracy_reward": 0.41666667722165585,
"rewards/format_reward": 0.0,
"step": 50
},
{
"completion_length": 851.5208587646484,
"epoch": 0.33663366336633666,
"grad_norm": 0.1823125034570694,
"kl": 0.0992431640625,
"learning_rate": 2.529362456803101e-06,
"loss": 0.0959,
"reward": 0.4791666828095913,
"reward_std": 0.43655748665332794,
"rewards/accuracy_reward": 0.4791666828095913,
"rewards/format_reward": 0.0,
"step": 51
},
{
"completion_length": 1083.4583435058594,
"epoch": 0.3432343234323432,
"grad_norm": 0.12086265534162521,
"kl": 0.25439453125,
"learning_rate": 2.5036959095382875e-06,
"loss": 0.1457,
"reward": 0.16666667349636555,
"reward_std": 0.24859580397605896,
"rewards/accuracy_reward": 0.16666667349636555,
"rewards/format_reward": 0.0,
"step": 52
},
{
"completion_length": 663.4583587646484,
"epoch": 0.34983498349834985,
"grad_norm": 0.1067744717001915,
"kl": 0.0916290283203125,
"learning_rate": 2.477485843232183e-06,
"loss": 0.1077,
"reward": 0.6666666716337204,
"reward_std": 0.11949635669589043,
"rewards/accuracy_reward": 0.6666666716337204,
"rewards/format_reward": 0.0,
"step": 53
},
{
"completion_length": 1239.0834045410156,
"epoch": 0.3564356435643564,
"grad_norm": 0.1427147090435028,
"kl": 0.22607421875,
"learning_rate": 2.4507464510980654e-06,
"loss": 0.1453,
"reward": 0.22916667722165585,
"reward_std": 0.23507710918784142,
"rewards/accuracy_reward": 0.22916667722165585,
"rewards/format_reward": 0.0,
"step": 54
},
{
"completion_length": 835.8750152587891,
"epoch": 0.36303630363036304,
"grad_norm": 0.171806201338768,
"kl": 0.1243896484375,
"learning_rate": 2.4234922129884873e-06,
"loss": 0.1779,
"reward": 0.5625000055879354,
"reward_std": 0.31970490142703056,
"rewards/accuracy_reward": 0.5625000055879354,
"rewards/format_reward": 0.0,
"step": 55
},
{
"completion_length": 1060.2916870117188,
"epoch": 0.3696369636963696,
"grad_norm": 0.11252501606941223,
"kl": 0.179443359375,
"learning_rate": 2.3957378875541795e-06,
"loss": 0.1761,
"reward": 0.3333333432674408,
"reward_std": 0.30354245379567146,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.0,
"step": 56
},
{
"completion_length": 1031.6458435058594,
"epoch": 0.37623762376237624,
"grad_norm": 0.08250569552183151,
"kl": 0.1865234375,
"learning_rate": 2.36749850425198e-06,
"loss": 0.0708,
"reward": 0.33333334140479565,
"reward_std": 0.31314554437994957,
"rewards/accuracy_reward": 0.33333334140479565,
"rewards/format_reward": 0.0,
"step": 57
},
{
"completion_length": 948.4583587646484,
"epoch": 0.38283828382838286,
"grad_norm": 0.07917948067188263,
"kl": 0.1787109375,
"learning_rate": 2.3387893552061204e-06,
"loss": 0.0915,
"reward": 0.35416666977107525,
"reward_std": 0.1705274023115635,
"rewards/accuracy_reward": 0.35416666977107525,
"rewards/format_reward": 0.0,
"step": 58
},
{
"completion_length": 972.2083587646484,
"epoch": 0.38943894389438943,
"grad_norm": 0.1615315079689026,
"kl": 0.1478271484375,
"learning_rate": 2.3096259869272697e-06,
"loss": 0.0861,
"reward": 0.4375000223517418,
"reward_std": 0.40168892592191696,
"rewards/accuracy_reward": 0.4375000223517418,
"rewards/format_reward": 0.0,
"step": 59
},
{
"completion_length": 922.2708587646484,
"epoch": 0.39603960396039606,
"grad_norm": 0.1217813566327095,
"kl": 0.1329345703125,
"learning_rate": 2.280024191893823e-06,
"loss": 0.1169,
"reward": 0.4583333507180214,
"reward_std": 0.2957112640142441,
"rewards/accuracy_reward": 0.4583333507180214,
"rewards/format_reward": 0.0,
"step": 60
},
{
"completion_length": 923.3750152587891,
"epoch": 0.40264026402640263,
"grad_norm": 0.1521437168121338,
"kl": 0.11083984375,
"learning_rate": 2.25e-06,
"loss": 0.122,
"reward": 0.5625000149011612,
"reward_std": 0.40168890357017517,
"rewards/accuracy_reward": 0.5625000149011612,
"rewards/format_reward": 0.0,
"step": 61
},
{
"completion_length": 776.1250152587891,
"epoch": 0.40924092409240925,
"grad_norm": 0.15655189752578735,
"kl": 0.13427734375,
"learning_rate": 2.2195696698753695e-06,
"loss": 0.0472,
"reward": 0.3333333395421505,
"reward_std": 0.3680921792984009,
"rewards/accuracy_reward": 0.3333333395421505,
"rewards/format_reward": 0.0,
"step": 62
},
{
"completion_length": 927.9792022705078,
"epoch": 0.4158415841584158,
"grad_norm": 0.08691170066595078,
"kl": 0.1297607421875,
"learning_rate": 2.1887496800805174e-06,
"loss": 0.0569,
"reward": 0.5208333507180214,
"reward_std": 0.2350771315395832,
"rewards/accuracy_reward": 0.5208333507180214,
"rewards/format_reward": 0.0,
"step": 63
},
{
"completion_length": 1035.8125,
"epoch": 0.42244224422442245,
"grad_norm": 0.10468967258930206,
"kl": 0.16912841796875,
"learning_rate": 2.157556720183616e-06,
"loss": 0.009,
"reward": 0.2708333395421505,
"reward_std": 0.2996268570423126,
"rewards/accuracy_reward": 0.2708333395421505,
"rewards/format_reward": 0.0,
"step": 64
},
{
"completion_length": 891.8333435058594,
"epoch": 0.429042904290429,
"grad_norm": 0.09926916658878326,
"kl": 0.11190414428710938,
"learning_rate": 2.126007681722727e-06,
"loss": 0.0617,
"reward": 0.5000000204890966,
"reward_std": 0.2861081399023533,
"rewards/accuracy_reward": 0.5000000204890966,
"rewards/format_reward": 0.0,
"step": 65
},
{
"completion_length": 756.2708587646484,
"epoch": 0.43564356435643564,
"grad_norm": 0.14332562685012817,
"kl": 0.09716796875,
"learning_rate": 2.0941196490587354e-06,
"loss": 0.0435,
"reward": 0.5000000298023224,
"reward_std": 0.2686738669872284,
"rewards/accuracy_reward": 0.5000000298023224,
"rewards/format_reward": 0.0,
"step": 66
},
{
"completion_length": 889.8125228881836,
"epoch": 0.44224422442244227,
"grad_norm": 0.16792258620262146,
"kl": 0.1457672119140625,
"learning_rate": 2.061909890123868e-06,
"loss": 0.1384,
"reward": 0.5416666865348816,
"reward_std": 0.3506578765809536,
"rewards/accuracy_reward": 0.5416666865348816,
"rewards/format_reward": 0.0,
"step": 67
},
{
"completion_length": 870.2292098999023,
"epoch": 0.44884488448844884,
"grad_norm": 0.1451350599527359,
"kl": 0.1453857421875,
"learning_rate": 2.0293958470708033e-06,
"loss": 0.0586,
"reward": 0.3750000149011612,
"reward_std": 0.23899272456765175,
"rewards/accuracy_reward": 0.3750000149011612,
"rewards/format_reward": 0.0,
"step": 68
},
{
"completion_length": 931.5625305175781,
"epoch": 0.45544554455445546,
"grad_norm": 0.17694266140460968,
"kl": 0.2371826171875,
"learning_rate": 1.9965951268274372e-06,
"loss": 0.1671,
"reward": 0.3750000149011612,
"reward_std": 0.26603008806705475,
"rewards/accuracy_reward": 0.3750000149011612,
"rewards/format_reward": 0.0,
"step": 69
},
{
"completion_length": 767.4167022705078,
"epoch": 0.46204620462046203,
"grad_norm": 0.13988901674747467,
"kl": 0.1268310546875,
"learning_rate": 1.963525491562421e-06,
"loss": -0.0182,
"reward": 0.5833333432674408,
"reward_std": 0.2957112491130829,
"rewards/accuracy_reward": 0.5833333432674408,
"rewards/format_reward": 0.0,
"step": 70
},
{
"completion_length": 945.3125305175781,
"epoch": 0.46864686468646866,
"grad_norm": 0.17755043506622314,
"kl": 0.244384765625,
"learning_rate": 1.9302048490666355e-06,
"loss": 0.1788,
"reward": 0.3541666828095913,
"reward_std": 0.4662386551499367,
"rewards/accuracy_reward": 0.3541666828095913,
"rewards/format_reward": 0.0,
"step": 71
},
{
"completion_length": 935.2083435058594,
"epoch": 0.4752475247524752,
"grad_norm": 0.14559805393218994,
"kl": 0.17041015625,
"learning_rate": 1.8966512430558036e-06,
"loss": 0.1942,
"reward": 0.416666679084301,
"reward_std": 0.4596792608499527,
"rewards/accuracy_reward": 0.416666679084301,
"rewards/format_reward": 0.0,
"step": 72
},
{
"completion_length": 908.1458587646484,
"epoch": 0.48184818481848185,
"grad_norm": 0.11466598510742188,
"kl": 0.2431640625,
"learning_rate": 1.8628828433995015e-06,
"loss": -0.0209,
"reward": 0.31250001676380634,
"reward_std": 0.21764283999800682,
"rewards/accuracy_reward": 0.31250001676380634,
"rewards/format_reward": 0.0,
"step": 73
},
{
"completion_length": 868.7917022705078,
"epoch": 0.4884488448844885,
"grad_norm": 0.11173044145107269,
"kl": 0.1383056640625,
"learning_rate": 1.828917936281855e-06,
"loss": 0.0838,
"reward": 0.35416667722165585,
"reward_std": 0.1530931033194065,
"rewards/accuracy_reward": 0.35416667722165585,
"rewards/format_reward": 0.0,
"step": 74
},
{
"completion_length": 933.4583587646484,
"epoch": 0.49504950495049505,
"grad_norm": 0.12861062586307526,
"kl": 0.17724609375,
"learning_rate": 1.7947749142992453e-06,
"loss": 0.0587,
"reward": 0.2291666716337204,
"reward_std": 0.35457348823547363,
"rewards/accuracy_reward": 0.2291666716337204,
"rewards/format_reward": 0.0,
"step": 75
},
{
"completion_length": 924.4166717529297,
"epoch": 0.5016501650165016,
"grad_norm": 0.09584160894155502,
"kl": 0.221435546875,
"learning_rate": 1.7604722665003958e-06,
"loss": 0.0786,
"reward": 0.16666667722165585,
"reward_std": 0.18404607474803925,
"rewards/accuracy_reward": 0.16666667722165585,
"rewards/format_reward": 0.0,
"step": 76
},
{
"completion_length": 1081.0208435058594,
"epoch": 0.5082508250825083,
"grad_norm": 0.11683700233697891,
"kl": 0.1298828125,
"learning_rate": 1.7260285683742248e-06,
"loss": 0.1236,
"reward": 0.416666679084301,
"reward_std": 0.3680921830236912,
"rewards/accuracy_reward": 0.416666679084301,
"rewards/format_reward": 0.0,
"step": 77
},
{
"completion_length": 978.3333435058594,
"epoch": 0.5148514851485149,
"grad_norm": 0.09451648592948914,
"kl": 0.1480712890625,
"learning_rate": 1.6914624717908924e-06,
"loss": 0.0095,
"reward": 0.2708333395421505,
"reward_std": 0.34674229472875595,
"rewards/accuracy_reward": 0.2708333395421505,
"rewards/format_reward": 0.0,
"step": 78
},
{
"completion_length": 1018.5417175292969,
"epoch": 0.5214521452145214,
"grad_norm": 0.10737847536802292,
"kl": 0.217529296875,
"learning_rate": 1.6567926949014804e-06,
"loss": 0.0826,
"reward": 0.3750000074505806,
"reward_std": 0.24859581142663956,
"rewards/accuracy_reward": 0.3750000074505806,
"rewards/format_reward": 0.0,
"step": 79
},
{
"completion_length": 893.5000457763672,
"epoch": 0.528052805280528,
"grad_norm": 0.1193479523062706,
"kl": 0.171875,
"learning_rate": 1.6220380120017874e-06,
"loss": 0.045,
"reward": 0.43750002048909664,
"reward_std": 0.35457346960902214,
"rewards/accuracy_reward": 0.43750002048909664,
"rewards/format_reward": 0.0,
"step": 80
},
{
"completion_length": 772.6041717529297,
"epoch": 0.5346534653465347,
"grad_norm": 0.1544453501701355,
"kl": 0.15875244140625,
"learning_rate": 1.5872172433657137e-06,
"loss": -0.0317,
"reward": 0.5416666828095913,
"reward_std": 0.3332236036658287,
"rewards/accuracy_reward": 0.5416666828095913,
"rewards/format_reward": 0.0,
"step": 81
},
{
"completion_length": 817.8750305175781,
"epoch": 0.5412541254125413,
"grad_norm": 0.08425965160131454,
"kl": 0.09405517578125,
"learning_rate": 1.5523492450537518e-06,
"loss": 0.0603,
"reward": 0.5000000055879354,
"reward_std": 0.23899271711707115,
"rewards/accuracy_reward": 0.5000000055879354,
"rewards/format_reward": 0.0,
"step": 82
},
{
"completion_length": 883.8542175292969,
"epoch": 0.5478547854785478,
"grad_norm": 0.14589935541152954,
"kl": 0.218994140625,
"learning_rate": 1.5174528987020958e-06,
"loss": 0.1737,
"reward": 0.3750000037252903,
"reward_std": 0.4248107075691223,
"rewards/accuracy_reward": 0.3750000037252903,
"rewards/format_reward": 0.0,
"step": 83
},
{
"completion_length": 1053.2083740234375,
"epoch": 0.5544554455445545,
"grad_norm": 0.09967659413814545,
"kl": 0.287841796875,
"learning_rate": 1.4825471012979047e-06,
"loss": -0.0045,
"reward": 0.25000000558793545,
"reward_std": 0.10206207260489464,
"rewards/accuracy_reward": 0.25000000558793545,
"rewards/format_reward": 0.0,
"step": 84
},
{
"completion_length": 1129.2083587646484,
"epoch": 0.5610561056105611,
"grad_norm": 0.0934222936630249,
"kl": 0.2568359375,
"learning_rate": 1.4476507549462489e-06,
"loss": 0.0961,
"reward": 0.37500001303851604,
"reward_std": 0.2957112528383732,
"rewards/accuracy_reward": 0.37500001303851604,
"rewards/format_reward": 0.0,
"step": 85
},
{
"completion_length": 904.125,
"epoch": 0.5676567656765676,
"grad_norm": 0.13640955090522766,
"kl": 0.2188720703125,
"learning_rate": 1.4127827566342864e-06,
"loss": 0.0729,
"reward": 0.354166679084301,
"reward_std": 0.3720077723264694,
"rewards/accuracy_reward": 0.354166679084301,
"rewards/format_reward": 0.0,
"step": 86
},
{
"completion_length": 931.2291870117188,
"epoch": 0.5742574257425742,
"grad_norm": 0.21060959994792938,
"kl": 0.2344970703125,
"learning_rate": 1.3779619879982127e-06,
"loss": 0.1297,
"reward": 0.4583333507180214,
"reward_std": 0.3506578877568245,
"rewards/accuracy_reward": 0.4583333507180214,
"rewards/format_reward": 0.0,
"step": 87
},
{
"completion_length": 997.7916717529297,
"epoch": 0.5808580858085809,
"grad_norm": 0.14985467493534088,
"kl": 0.306396484375,
"learning_rate": 1.3432073050985201e-06,
"loss": 0.0972,
"reward": 0.25,
"reward_std": 0.1369306445121765,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"step": 88
},
{
"completion_length": 931.5833587646484,
"epoch": 0.5874587458745875,
"grad_norm": 0.33125752210617065,
"kl": 0.2950439453125,
"learning_rate": 1.308537528209108e-06,
"loss": 0.1418,
"reward": 0.5208333507180214,
"reward_std": 0.2900237627327442,
"rewards/accuracy_reward": 0.5208333507180214,
"rewards/format_reward": 0.0,
"step": 89
},
{
"completion_length": 1021.1666870117188,
"epoch": 0.594059405940594,
"grad_norm": 0.16106674075126648,
"kl": 0.379638671875,
"learning_rate": 1.2739714316257753e-06,
"loss": 0.0325,
"reward": 0.3333333432674408,
"reward_std": 0.3776952549815178,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.0,
"step": 90
},
{
"completion_length": 988.3125305175781,
"epoch": 0.6006600660066007,
"grad_norm": 16.83741569519043,
"kl": 0.857177734375,
"learning_rate": 1.2395277334996047e-06,
"loss": 0.1811,
"reward": 0.27083334140479565,
"reward_std": 0.37465154752135277,
"rewards/accuracy_reward": 0.27083334140479565,
"rewards/format_reward": 0.0,
"step": 91
},
{
"completion_length": 828.8125305175781,
"epoch": 0.6072607260726073,
"grad_norm": 0.5444411039352417,
"kl": 0.21484375,
"learning_rate": 1.2052250857007548e-06,
"loss": 0.1801,
"reward": 0.5625000223517418,
"reward_std": 0.38161084055900574,
"rewards/accuracy_reward": 0.5625000223517418,
"rewards/format_reward": 0.0,
"step": 92
},
{
"completion_length": 1117.7917022705078,
"epoch": 0.6138613861386139,
"grad_norm": 0.33171162009239197,
"kl": 0.449462890625,
"learning_rate": 1.1710820637181448e-06,
"loss": 0.2095,
"reward": 0.2500000111758709,
"reward_std": 0.26603008806705475,
"rewards/accuracy_reward": 0.2500000111758709,
"rewards/format_reward": 0.0,
"step": 93
},
{
"completion_length": 825.5208740234375,
"epoch": 0.6204620462046204,
"grad_norm": 0.30983299016952515,
"kl": 0.7236328125,
"learning_rate": 1.1371171566004986e-06,
"loss": 0.0181,
"reward": 0.3333333432674408,
"reward_std": 0.20148035883903503,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.0,
"step": 94
},
{
"completion_length": 973.3750152587891,
"epoch": 0.6270627062706271,
"grad_norm": 0.7808173894882202,
"kl": 0.62158203125,
"learning_rate": 1.103348756944197e-06,
"loss": 0.1762,
"reward": 0.3750000037252903,
"reward_std": 0.4422450140118599,
"rewards/accuracy_reward": 0.3750000037252903,
"rewards/format_reward": 0.0,
"step": 95
},
{
"completion_length": 991.2708587646484,
"epoch": 0.6336633663366337,
"grad_norm": 0.6956799030303955,
"kl": 0.50244140625,
"learning_rate": 1.069795150933365e-06,
"loss": 0.1548,
"reward": 0.2291666753590107,
"reward_std": 0.2996268570423126,
"rewards/accuracy_reward": 0.2291666753590107,
"rewards/format_reward": 0.0,
"step": 96
},
{
"completion_length": 845.6041870117188,
"epoch": 0.6402640264026402,
"grad_norm": 0.6455105543136597,
"kl": 0.8310546875,
"learning_rate": 1.036474508437579e-06,
"loss": 0.0902,
"reward": 0.2916666716337204,
"reward_std": 0.11949635669589043,
"rewards/accuracy_reward": 0.2916666716337204,
"rewards/format_reward": 0.0,
"step": 97
},
{
"completion_length": 961.2708587646484,
"epoch": 0.6468646864686468,
"grad_norm": 1.0175178050994873,
"kl": 1.361572265625,
"learning_rate": 1.003404873172563e-06,
"loss": 0.2628,
"reward": 0.2916666716337204,
"reward_std": 0.3506578765809536,
"rewards/accuracy_reward": 0.2916666716337204,
"rewards/format_reward": 0.0,
"step": 98
},
{
"completion_length": 801.7500152587891,
"epoch": 0.6534653465346535,
"grad_norm": 0.9493899941444397,
"kl": 1.8583984375,
"learning_rate": 9.70604152929197e-07,
"loss": 0.1342,
"reward": 0.27083333395421505,
"reward_std": 0.33713919296860695,
"rewards/accuracy_reward": 0.27083333395421505,
"rewards/format_reward": 0.0,
"step": 99
},
{
"completion_length": 1018.9583587646484,
"epoch": 0.6600660066006601,
"grad_norm": 0.641622006893158,
"kl": 1.20654296875,
"learning_rate": 9.380901098761319e-07,
"loss": 0.1288,
"reward": 0.354166679084301,
"reward_std": 0.21764283999800682,
"rewards/accuracy_reward": 0.354166679084301,
"rewards/format_reward": 0.0,
"step": 100
},
{
"completion_length": 991.6666870117188,
"epoch": 0.6666666666666666,
"grad_norm": 3.699712038040161,
"kl": 2.2578125,
"learning_rate": 9.058803509412648e-07,
"loss": 0.1524,
"reward": 0.20833334140479565,
"reward_std": 0.2686738781630993,
"rewards/accuracy_reward": 0.20833334140479565,
"rewards/format_reward": 0.0,
"step": 101
},
{
"completion_length": 851.1458740234375,
"epoch": 0.6732673267326733,
"grad_norm": 0.7072895765304565,
"kl": 0.2276611328125,
"learning_rate": 8.739923182772733e-07,
"loss": 0.0363,
"reward": 0.5000000074505806,
"reward_std": 0.341054804623127,
"rewards/accuracy_reward": 0.5000000074505806,
"rewards/format_reward": 0.0,
"step": 102
},
{
"completion_length": 609.0000152587891,
"epoch": 0.6798679867986799,
"grad_norm": 1.4843007326126099,
"kl": 0.269775390625,
"learning_rate": 8.424432798163837e-07,
"loss": 0.03,
"reward": 0.604166679084301,
"reward_std": 0.28219255805015564,
"rewards/accuracy_reward": 0.604166679084301,
"rewards/format_reward": 0.0,
"step": 103
},
{
"completion_length": 905.5625152587891,
"epoch": 0.6864686468646864,
"grad_norm": 1.7519278526306152,
"kl": 0.4345703125,
"learning_rate": 8.112503199194822e-07,
"loss": 0.026,
"reward": 0.20833333767950535,
"reward_std": 0.3602609820663929,
"rewards/accuracy_reward": 0.20833333767950535,
"rewards/format_reward": 0.0,
"step": 104
},
{
"completion_length": 880.8541870117188,
"epoch": 0.693069306930693,
"grad_norm": 2.7430357933044434,
"kl": 2.5283203125,
"learning_rate": 7.804303301246311e-07,
"loss": 0.212,
"reward": 0.31250001303851604,
"reward_std": 0.21764283999800682,
"rewards/accuracy_reward": 0.31250001303851604,
"rewards/format_reward": 0.0,
"step": 105
},
{
"completion_length": 1010.9792022705078,
"epoch": 0.6996699669966997,
"grad_norm": 1.1869699954986572,
"kl": 2.6875,
"learning_rate": 7.500000000000003e-07,
"loss": 0.2102,
"reward": 0.37500001676380634,
"reward_std": 0.3131455294787884,
"rewards/accuracy_reward": 0.37500001676380634,
"rewards/format_reward": 0.0,
"step": 106
},
{
"completion_length": 1013.1875305175781,
"epoch": 0.7062706270627063,
"grad_norm": 1.6787883043289185,
"kl": 1.580078125,
"learning_rate": 7.19975808106177e-07,
"loss": 0.215,
"reward": 0.2500000074505806,
"reward_std": 0.31314554065465927,
"rewards/accuracy_reward": 0.2500000074505806,
"rewards/format_reward": 0.0,
"step": 107
},
{
"completion_length": 818.2917022705078,
"epoch": 0.7128712871287128,
"grad_norm": 1.6915416717529297,
"kl": 0.703125,
"learning_rate": 6.903740130727312e-07,
"loss": 0.1459,
"reward": 0.5000000149011612,
"reward_std": 0.33057980239391327,
"rewards/accuracy_reward": 0.5000000149011612,
"rewards/format_reward": 0.0,
"step": 108
},
{
"completion_length": 1075.5833740234375,
"epoch": 0.7194719471947195,
"grad_norm": 0.8402836918830872,
"kl": 0.9658203125,
"learning_rate": 6.6121064479388e-07,
"loss": 0.0926,
"reward": 0.2916666716337204,
"reward_std": 0.19364918768405914,
"rewards/accuracy_reward": 0.2916666716337204,
"rewards/format_reward": 0.0,
"step": 109
},
{
"completion_length": 848.4375305175781,
"epoch": 0.7260726072607261,
"grad_norm": 4.569257736206055,
"kl": 0.77294921875,
"learning_rate": 6.325014957480202e-07,
"loss": 0.221,
"reward": 0.5416666865348816,
"reward_std": 0.4422449842095375,
"rewards/accuracy_reward": 0.5416666865348816,
"rewards/format_reward": 0.0,
"step": 110
},
{
"completion_length": 891.8750381469727,
"epoch": 0.7326732673267327,
"grad_norm": 1.430153489112854,
"kl": 1.53662109375,
"learning_rate": 6.04262112445821e-07,
"loss": 0.1013,
"reward": 0.43750000558793545,
"reward_std": 0.2350771203637123,
"rewards/accuracy_reward": 0.43750000558793545,
"rewards/format_reward": 0.0,
"step": 111
},
{
"completion_length": 755.6458435058594,
"epoch": 0.7392739273927392,
"grad_norm": 0.3871193826198578,
"kl": 0.632415771484375,
"learning_rate": 5.765077870115125e-07,
"loss": 0.0524,
"reward": 0.43750002048909664,
"reward_std": 0.21764283627271652,
"rewards/accuracy_reward": 0.43750002048909664,
"rewards/format_reward": 0.0,
"step": 112
},
{
"completion_length": 899.6875305175781,
"epoch": 0.7458745874587459,
"grad_norm": 2.0980653762817383,
"kl": 0.4720458984375,
"learning_rate": 5.492535489019345e-07,
"loss": 0.1302,
"reward": 0.41666667349636555,
"reward_std": 0.3332235962152481,
"rewards/accuracy_reward": 0.41666667349636555,
"rewards/format_reward": 0.0,
"step": 113
},
{
"completion_length": 769.0417022705078,
"epoch": 0.7524752475247525,
"grad_norm": 2.5095949172973633,
"kl": 0.71484375,
"learning_rate": 5.225141567678172e-07,
"loss": 0.1271,
"reward": 0.5416666865348816,
"reward_std": 0.4500761739909649,
"rewards/accuracy_reward": 0.5416666865348816,
"rewards/format_reward": 0.0,
"step": 114
},
{
"completion_length": 1073.0416870117188,
"epoch": 0.759075907590759,
"grad_norm": 1.9907251596450806,
"kl": 1.24365234375,
"learning_rate": 4.963040904617131e-07,
"loss": 0.1739,
"reward": 0.3333333507180214,
"reward_std": 0.350657869130373,
"rewards/accuracy_reward": 0.3333333507180214,
"rewards/format_reward": 0.0,
"step": 115
},
{
"completion_length": 935.7500305175781,
"epoch": 0.7656765676567657,
"grad_norm": 3.745811939239502,
"kl": 3.703125,
"learning_rate": 4.7063754319689976e-07,
"loss": 0.2618,
"reward": 0.1458333395421505,
"reward_std": 0.23507710546255112,
"rewards/accuracy_reward": 0.1458333395421505,
"rewards/format_reward": 0.0,
"step": 116
},
{
"completion_length": 1031.5208587646484,
"epoch": 0.7722772277227723,
"grad_norm": 5.434175968170166,
"kl": 3.8812255859375,
"learning_rate": 4.4552841386150737e-07,
"loss": 0.2507,
"reward": 0.22916666977107525,
"reward_std": 0.25515518710017204,
"rewards/accuracy_reward": 0.22916666977107525,
"rewards/format_reward": 0.0,
"step": 117
},
{
"completion_length": 894.9375305175781,
"epoch": 0.7788778877887789,
"grad_norm": 2.229215621948242,
"kl": 1.484375,
"learning_rate": 4.2099029949202353e-07,
"loss": 0.2131,
"reward": 0.5625000223517418,
"reward_std": 0.39208584651350975,
"rewards/accuracy_reward": 0.5625000223517418,
"rewards/format_reward": 0.0,
"step": 118
},
{
"completion_length": 884.8333435058594,
"epoch": 0.7854785478547854,
"grad_norm": 3.206272602081299,
"kl": 2.19384765625,
"learning_rate": 3.9703648791025716e-07,
"loss": 0.121,
"reward": 0.2291666753590107,
"reward_std": 0.2621144950389862,
"rewards/accuracy_reward": 0.2291666753590107,
"rewards/format_reward": 0.0,
"step": 119
},
{
"completion_length": 927.5208587646484,
"epoch": 0.7920792079207921,
"grad_norm": 1.4413719177246094,
"kl": 1.4619140625,
"learning_rate": 3.736799505277512e-07,
"loss": 0.2241,
"reward": 0.3750000074505806,
"reward_std": 0.2957112640142441,
"rewards/accuracy_reward": 0.3750000074505806,
"rewards/format_reward": 0.0,
"step": 120
},
{
"completion_length": 1030.8333740234375,
"epoch": 0.7986798679867987,
"grad_norm": 2.288118362426758,
"kl": 2.2021484375,
"learning_rate": 3.5093333532153313e-07,
"loss": 0.2132,
"reward": 0.5000000149011612,
"reward_std": 0.3332235887646675,
"rewards/accuracy_reward": 0.5000000149011612,
"rewards/format_reward": 0.0,
"step": 121
},
{
"completion_length": 795.2291870117188,
"epoch": 0.8052805280528053,
"grad_norm": 4.600270748138428,
"kl": 0.909210205078125,
"learning_rate": 3.288089599850112e-07,
"loss": 0.1673,
"reward": 0.6250000149011612,
"reward_std": 0.23116153478622437,
"rewards/accuracy_reward": 0.6250000149011612,
"rewards/format_reward": 0.0,
"step": 122
},
{
"completion_length": 820.2708740234375,
"epoch": 0.8118811881188119,
"grad_norm": 1.4757143259048462,
"kl": 0.82757568359375,
"learning_rate": 3.073188052577282e-07,
"loss": 0.0951,
"reward": 0.3333333432674408,
"reward_std": 0.20412414520978928,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.0,
"step": 123
},
{
"completion_length": 1093.0417022705078,
"epoch": 0.8184818481848185,
"grad_norm": 1.748262643814087,
"kl": 2.27734375,
"learning_rate": 2.86474508437579e-07,
"loss": 0.2423,
"reward": 0.37500001303851604,
"reward_std": 0.4248107150197029,
"rewards/accuracy_reward": 0.37500001303851604,
"rewards/format_reward": 0.0,
"step": 124
},
{
"completion_length": 906.8958587646484,
"epoch": 0.8250825082508251,
"grad_norm": 1.2707668542861938,
"kl": 2.072265625,
"learning_rate": 2.6628735707900655e-07,
"loss": 0.1805,
"reward": 0.3750000074505806,
"reward_std": 0.3332236036658287,
"rewards/accuracy_reward": 0.3750000074505806,
"rewards/format_reward": 0.0,
"step": 125
},
{
"completion_length": 1030.1041717529297,
"epoch": 0.8316831683168316,
"grad_norm": 1.1277378797531128,
"kl": 2.2890625,
"learning_rate": 2.467682828805956e-07,
"loss": 0.1976,
"reward": 0.3125000111758709,
"reward_std": 0.28219256177544594,
"rewards/accuracy_reward": 0.3125000111758709,
"rewards/format_reward": 0.0,
"step": 126
},
{
"completion_length": 740.7291870117188,
"epoch": 0.8382838283828383,
"grad_norm": 1.2501320838928223,
"kl": 1.093414306640625,
"learning_rate": 2.2792785576536108e-07,
"loss": 0.1343,
"reward": 0.6666666716337204,
"reward_std": 0.16661179438233376,
"rewards/accuracy_reward": 0.6666666716337204,
"rewards/format_reward": 0.0,
"step": 127
},
{
"completion_length": 924.0625305175781,
"epoch": 0.8448844884488449,
"grad_norm": 1.4948713779449463,
"kl": 1.78173828125,
"learning_rate": 2.0977627815695215e-07,
"loss": 0.1317,
"reward": 0.3333333507180214,
"reward_std": 0.23116152361035347,
"rewards/accuracy_reward": 0.3333333507180214,
"rewards/format_reward": 0.0,
"step": 128
},
{
"completion_length": 942.5208740234375,
"epoch": 0.8514851485148515,
"grad_norm": 0.5847099423408508,
"kl": 0.99560546875,
"learning_rate": 1.9232337945485655e-07,
"loss": 0.0936,
"reward": 0.4375,
"reward_std": 0.06846532225608826,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.0,
"step": 129
},
{
"completion_length": 917.1666870117188,
"epoch": 0.858085808580858,
"grad_norm": 0.5804275274276733,
"kl": 0.701263427734375,
"learning_rate": 1.7557861071160953e-07,
"loss": 0.0218,
"reward": 0.31250000558793545,
"reward_std": 0.2446802258491516,
"rewards/accuracy_reward": 0.31250000558793545,
"rewards/format_reward": 0.0,
"step": 130
},
{
"completion_length": 1004.8333587646484,
"epoch": 0.8646864686468647,
"grad_norm": 0.8942325711250305,
"kl": 0.6883544921875,
"learning_rate": 1.5955103951488177e-07,
"loss": 0.1236,
"reward": 0.35416667722165585,
"reward_std": 0.36417656019330025,
"rewards/accuracy_reward": 0.35416667722165585,
"rewards/format_reward": 0.0,
"step": 131
},
{
"completion_length": 881.7916870117188,
"epoch": 0.8712871287128713,
"grad_norm": 2.220564365386963,
"kl": 0.78369140625,
"learning_rate": 1.4424934507721927e-07,
"loss": 0.1166,
"reward": 0.41666667722165585,
"reward_std": 0.2686738669872284,
"rewards/accuracy_reward": 0.41666667722165585,
"rewards/format_reward": 0.0,
"step": 132
},
{
"completion_length": 997.3541870117188,
"epoch": 0.8778877887788779,
"grad_norm": 1.1939600706100464,
"kl": 0.900390625,
"learning_rate": 1.2968181353609853e-07,
"loss": 0.0486,
"reward": 0.3125000111758709,
"reward_std": 0.1801304891705513,
"rewards/accuracy_reward": 0.3125000111758709,
"rewards/format_reward": 0.0,
"step": 133
},
{
"completion_length": 728.3541793823242,
"epoch": 0.8844884488448845,
"grad_norm": 0.9555450677871704,
"kl": 0.58154296875,
"learning_rate": 1.1585633346683655e-07,
"loss": 0.0358,
"reward": 0.45833334885537624,
"reward_std": 0.3332235924899578,
"rewards/accuracy_reward": 0.45833334885537624,
"rewards/format_reward": 0.0,
"step": 134
},
{
"completion_length": 979.4167175292969,
"epoch": 0.8910891089108911,
"grad_norm": 0.5510440468788147,
"kl": 0.7451171875,
"learning_rate": 1.0278039161078634e-07,
"loss": 0.0197,
"reward": 0.3125000111758709,
"reward_std": 0.25515517219901085,
"rewards/accuracy_reward": 0.3125000111758709,
"rewards/format_reward": 0.0,
"step": 135
},
{
"completion_length": 975.3750152587891,
"epoch": 0.8976897689768977,
"grad_norm": 0.4859466552734375,
"kl": 0.39111328125,
"learning_rate": 9.046106882113752e-08,
"loss": 0.0393,
"reward": 0.4791666716337204,
"reward_std": 0.2525114044547081,
"rewards/accuracy_reward": 0.4791666716337204,
"rewards/format_reward": 0.0,
"step": 136
},
{
"completion_length": 873.8333587646484,
"epoch": 0.9042904290429042,
"grad_norm": 0.8227368593215942,
"kl": 0.5703125,
"learning_rate": 7.89050362285062e-08,
"loss": 0.0192,
"reward": 0.4583333507180214,
"reward_std": 0.3855264447629452,
"rewards/accuracy_reward": 0.4583333507180214,
"rewards/format_reward": 0.0,
"step": 137
},
{
"completion_length": 879.6667022705078,
"epoch": 0.9108910891089109,
"grad_norm": 2.8329885005950928,
"kl": 0.44970703125,
"learning_rate": 6.811855162840214e-08,
"loss": 0.1331,
"reward": 0.4791666753590107,
"reward_std": 0.27258947491645813,
"rewards/accuracy_reward": 0.4791666753590107,
"rewards/format_reward": 0.0,
"step": 138
},
{
"completion_length": 677.9166870117188,
"epoch": 0.9174917491749175,
"grad_norm": 2.366013288497925,
"kl": 0.3238067626953125,
"learning_rate": 5.810745609252166e-08,
"loss": 0.1267,
"reward": 0.5208333637565374,
"reward_std": 0.25515517592430115,
"rewards/accuracy_reward": 0.5208333637565374,
"rewards/format_reward": 0.0,
"step": 139
},
{
"completion_length": 1135.9791870117188,
"epoch": 0.9240924092409241,
"grad_norm": 0.692542314529419,
"kl": 1.0966796875,
"learning_rate": 4.887717080570431e-08,
"loss": 0.1422,
"reward": 0.06250000186264515,
"reward_std": 0.1530931107699871,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/format_reward": 0.0,
"step": 140
},
{
"completion_length": 1054.6875305175781,
"epoch": 0.9306930693069307,
"grad_norm": 2.231672763824463,
"kl": 0.7978515625,
"learning_rate": 4.0432694130264294e-08,
"loss": 0.166,
"reward": 0.29166667722165585,
"reward_std": 0.2686738669872284,
"rewards/accuracy_reward": 0.29166667722165585,
"rewards/format_reward": 0.0,
"step": 141
},
{
"completion_length": 692.4375152587891,
"epoch": 0.9372937293729373,
"grad_norm": 0.28133484721183777,
"kl": 0.2666015625,
"learning_rate": 3.277859889929147e-08,
"loss": -0.0049,
"reward": 0.3125000149011612,
"reward_std": 0.11558076366782188,
"rewards/accuracy_reward": 0.3125000149011612,
"rewards/format_reward": 0.0,
"step": 142
},
{
"completion_length": 908.3750534057617,
"epoch": 0.9438943894389439,
"grad_norm": 0.6757098436355591,
"kl": 0.28759765625,
"learning_rate": 2.5919029940380145e-08,
"loss": -0.0196,
"reward": 0.3333333507180214,
"reward_std": 0.30354244261980057,
"rewards/accuracy_reward": 0.3333333507180214,
"rewards/format_reward": 0.0,
"step": 143
},
{
"completion_length": 788.1666793823242,
"epoch": 0.9504950495049505,
"grad_norm": 2.814443349838257,
"kl": 0.331787109375,
"learning_rate": 1.985770183113117e-08,
"loss": 0.1527,
"reward": 0.41666669212281704,
"reward_std": 0.3680921457707882,
"rewards/accuracy_reward": 0.41666669212281704,
"rewards/format_reward": 0.0,
"step": 144
},
{
"completion_length": 876.6041717529297,
"epoch": 0.9570957095709571,
"grad_norm": 1.0456154346466064,
"kl": 0.763427734375,
"learning_rate": 1.4597896887644457e-08,
"loss": 0.086,
"reward": 0.29166667349636555,
"reward_std": 0.16661180555820465,
"rewards/accuracy_reward": 0.29166667349636555,
"rewards/format_reward": 0.0,
"step": 145
},
{
"completion_length": 947.5208587646484,
"epoch": 0.9636963696369637,
"grad_norm": 0.5970892310142517,
"kl": 0.9365234375,
"learning_rate": 1.0142463387085465e-08,
"loss": 0.0589,
"reward": 0.1458333358168602,
"reward_std": 0.13301505148410797,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/format_reward": 0.0,
"step": 146
},
{
"completion_length": 971.2916870117188,
"epoch": 0.9702970297029703,
"grad_norm": 0.4104056656360626,
"kl": 0.8409423828125,
"learning_rate": 6.493814025293476e-09,
"loss": 0.0431,
"reward": 0.33333334140479565,
"reward_std": 0.2686738818883896,
"rewards/accuracy_reward": 0.33333334140479565,
"rewards/format_reward": 0.0,
"step": 147
},
{
"completion_length": 869.8958587646484,
"epoch": 0.976897689768977,
"grad_norm": 3.5218846797943115,
"kl": 0.5218505859375,
"learning_rate": 3.6539246102637037e-09,
"loss": 0.1713,
"reward": 0.4791666828095913,
"reward_std": 0.2996268570423126,
"rewards/accuracy_reward": 0.4791666828095913,
"rewards/format_reward": 0.0,
"step": 148
},
{
"completion_length": 775.1875152587891,
"epoch": 0.9834983498349835,
"grad_norm": 2.006347417831421,
"kl": 0.35626220703125,
"learning_rate": 1.624332992213151e-09,
"loss": 0.1167,
"reward": 0.5625000223517418,
"reward_std": 0.28219255805015564,
"rewards/accuracy_reward": 0.5625000223517418,
"rewards/format_reward": 0.0,
"step": 149
},
{
"completion_length": 690.9166870117188,
"epoch": 0.9900990099009901,
"grad_norm": 3.488175630569458,
"kl": 0.43896484375,
"learning_rate": 4.0613823080742907e-10,
"loss": 0.094,
"reward": 0.6250000149011612,
"reward_std": 0.3977733254432678,
"rewards/accuracy_reward": 0.6250000149011612,
"rewards/format_reward": 0.0,
"step": 150
},
{
"completion_length": 961.7500305175781,
"epoch": 0.9966996699669967,
"grad_norm": 2.3887975215911865,
"kl": 0.55078125,
"learning_rate": 0.0,
"loss": 0.1768,
"reward": 0.3958333507180214,
"reward_std": 0.41912319883704185,
"rewards/accuracy_reward": 0.3958333507180214,
"rewards/format_reward": 0.0,
"step": 151
},
{
"epoch": 0.9966996699669967,
"step": 151,
"total_flos": 0.0,
"train_loss": 0.09152872039099531,
"train_runtime": 28966.5677,
"train_samples_per_second": 0.042,
"train_steps_per_second": 0.005
}
],
"logging_steps": 1,
"max_steps": 151,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}