| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9966996699669967, | |
| "eval_steps": 10, | |
| "global_step": 151, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 761.6875305175781, | |
| "epoch": 0.006600660066006601, | |
| "grad_norm": 0.09082216769456863, | |
| "kl": 0.0, | |
| "learning_rate": 1.875e-07, | |
| "loss": -0.0159, | |
| "reward": 0.2291666679084301, | |
| "reward_std": 0.1705273911356926, | |
| "rewards/accuracy_reward": 0.2291666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 897.8541717529297, | |
| "epoch": 0.013201320132013201, | |
| "grad_norm": 0.12771357595920563, | |
| "kl": 0.0, | |
| "learning_rate": 3.75e-07, | |
| "loss": 0.0257, | |
| "reward": 0.3750000111758709, | |
| "reward_std": 0.2861081659793854, | |
| "rewards/accuracy_reward": 0.3750000111758709, | |
| "rewards/format_reward": 0.0, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 946.2291717529297, | |
| "epoch": 0.019801980198019802, | |
| "grad_norm": 0.202493816614151, | |
| "kl": 0.0002696514129638672, | |
| "learning_rate": 5.625e-07, | |
| "loss": 0.0767, | |
| "reward": 0.43750002048909664, | |
| "reward_std": 0.33713919669389725, | |
| "rewards/accuracy_reward": 0.43750002048909664, | |
| "rewards/format_reward": 0.0, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 947.2708740234375, | |
| "epoch": 0.026402640264026403, | |
| "grad_norm": 0.13421419262886047, | |
| "kl": 0.00023603439331054688, | |
| "learning_rate": 7.5e-07, | |
| "loss": 0.0437, | |
| "reward": 0.3541666716337204, | |
| "reward_std": 0.4932760149240494, | |
| "rewards/accuracy_reward": 0.3541666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 949.7708587646484, | |
| "epoch": 0.033003300330033, | |
| "grad_norm": 0.15579567849636078, | |
| "kl": 0.0001881122589111328, | |
| "learning_rate": 9.375e-07, | |
| "loss": 0.082, | |
| "reward": 0.583333358168602, | |
| "reward_std": 0.4701542407274246, | |
| "rewards/accuracy_reward": 0.583333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 679.7500305175781, | |
| "epoch": 0.039603960396039604, | |
| "grad_norm": 0.19298173487186432, | |
| "kl": 0.0002353191375732422, | |
| "learning_rate": 1.125e-06, | |
| "loss": -0.055, | |
| "reward": 0.7500000149011612, | |
| "reward_std": 0.3332235999405384, | |
| "rewards/accuracy_reward": 0.7500000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 713.4375305175781, | |
| "epoch": 0.0462046204620462, | |
| "grad_norm": 0.151686891913414, | |
| "kl": 0.00026154518127441406, | |
| "learning_rate": 1.3125000000000001e-06, | |
| "loss": 0.0054, | |
| "reward": 0.416666679084301, | |
| "reward_std": 0.377695269882679, | |
| "rewards/accuracy_reward": 0.416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 721.6250076293945, | |
| "epoch": 0.052805280528052806, | |
| "grad_norm": 0.0006056017591618001, | |
| "kl": 0.00021958351135253906, | |
| "learning_rate": 1.5e-06, | |
| "loss": 0.0, | |
| "reward": 0.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 0.0, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 756.0000228881836, | |
| "epoch": 0.0594059405940594, | |
| "grad_norm": 0.14626246690750122, | |
| "kl": 0.0002689361572265625, | |
| "learning_rate": 1.6875e-06, | |
| "loss": -0.0, | |
| "reward": 0.33333334140479565, | |
| "reward_std": 0.23116152361035347, | |
| "rewards/accuracy_reward": 0.33333334140479565, | |
| "rewards/format_reward": 0.0, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 755.75, | |
| "epoch": 0.066006600660066, | |
| "grad_norm": 0.14428134262561798, | |
| "kl": 0.00023508071899414062, | |
| "learning_rate": 1.875e-06, | |
| "loss": -0.0386, | |
| "reward": 0.7291666865348816, | |
| "reward_std": 0.21764282882213593, | |
| "rewards/accuracy_reward": 0.7291666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 993.1042022705078, | |
| "epoch": 0.07260726072607261, | |
| "grad_norm": 0.1510692834854126, | |
| "kl": 0.0003027915954589844, | |
| "learning_rate": 2.0625e-06, | |
| "loss": 0.118, | |
| "reward": 0.5208333432674408, | |
| "reward_std": 0.4932760149240494, | |
| "rewards/accuracy_reward": 0.5208333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 831.3125152587891, | |
| "epoch": 0.07920792079207921, | |
| "grad_norm": 0.08587031811475754, | |
| "kl": 0.00022935867309570312, | |
| "learning_rate": 2.25e-06, | |
| "loss": 0.0158, | |
| "reward": 0.708333358168602, | |
| "reward_std": 0.24859580025076866, | |
| "rewards/accuracy_reward": 0.708333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 786.7708435058594, | |
| "epoch": 0.0858085808580858, | |
| "grad_norm": 0.15709738433361053, | |
| "kl": 0.0002751350402832031, | |
| "learning_rate": 2.4375e-06, | |
| "loss": 0.0416, | |
| "reward": 0.4375000149011612, | |
| "reward_std": 0.28219255432486534, | |
| "rewards/accuracy_reward": 0.4375000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 896.3958587646484, | |
| "epoch": 0.0924092409240924, | |
| "grad_norm": 0.1950385421514511, | |
| "kl": 0.0004210472106933594, | |
| "learning_rate": 2.6250000000000003e-06, | |
| "loss": -0.0025, | |
| "reward": 0.416666679084301, | |
| "reward_std": 0.30354245752096176, | |
| "rewards/accuracy_reward": 0.416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 915.1458587646484, | |
| "epoch": 0.09900990099009901, | |
| "grad_norm": 0.1464419662952423, | |
| "kl": 0.00046825408935546875, | |
| "learning_rate": 2.8125e-06, | |
| "loss": 0.0378, | |
| "reward": 0.5208333358168602, | |
| "reward_std": 0.3720077723264694, | |
| "rewards/accuracy_reward": 0.5208333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 733.4792022705078, | |
| "epoch": 0.10561056105610561, | |
| "grad_norm": 0.195227712392807, | |
| "kl": 0.0011754035949707031, | |
| "learning_rate": 3e-06, | |
| "loss": -0.026, | |
| "reward": 0.645833358168602, | |
| "reward_std": 0.41912318766117096, | |
| "rewards/accuracy_reward": 0.645833358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 864.2083740234375, | |
| "epoch": 0.11221122112211221, | |
| "grad_norm": 0.29113319516181946, | |
| "kl": 0.0006542205810546875, | |
| "learning_rate": 2.9995938617691924e-06, | |
| "loss": 0.0484, | |
| "reward": 0.5000000186264515, | |
| "reward_std": 0.4152075983583927, | |
| "rewards/accuracy_reward": 0.5000000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 795.2291870117188, | |
| "epoch": 0.1188118811881188, | |
| "grad_norm": 0.13108719885349274, | |
| "kl": 0.0011267662048339844, | |
| "learning_rate": 2.998375667007787e-06, | |
| "loss": 0.0592, | |
| "reward": 0.6666666865348816, | |
| "reward_std": 0.3035424277186394, | |
| "rewards/accuracy_reward": 0.6666666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 722.3542022705078, | |
| "epoch": 0.1254125412541254, | |
| "grad_norm": 0.17488506436347961, | |
| "kl": 0.0010900497436523438, | |
| "learning_rate": 2.9963460753897363e-06, | |
| "loss": 0.0007, | |
| "reward": 0.6458333432674408, | |
| "reward_std": 0.29962683096528053, | |
| "rewards/accuracy_reward": 0.6458333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 776.3750152587891, | |
| "epoch": 0.132013201320132, | |
| "grad_norm": 0.11859464645385742, | |
| "kl": 0.00139617919921875, | |
| "learning_rate": 2.9935061859747068e-06, | |
| "loss": 0.0307, | |
| "reward": 0.3333333432674408, | |
| "reward_std": 0.26603007316589355, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 661.0625152587891, | |
| "epoch": 0.13861386138613863, | |
| "grad_norm": 0.15094302594661713, | |
| "kl": 0.0016927719116210938, | |
| "learning_rate": 2.989857536612915e-06, | |
| "loss": -0.0411, | |
| "reward": 0.770833358168602, | |
| "reward_std": 0.33713918551802635, | |
| "rewards/accuracy_reward": 0.770833358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 848.6666870117188, | |
| "epoch": 0.14521452145214522, | |
| "grad_norm": 0.2171953022480011, | |
| "kl": 0.00218963623046875, | |
| "learning_rate": 2.9854021031123555e-06, | |
| "loss": 0.074, | |
| "reward": 0.6041666716337204, | |
| "reward_std": 0.18796169012784958, | |
| "rewards/accuracy_reward": 0.6041666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 764.2083587646484, | |
| "epoch": 0.15181518151815182, | |
| "grad_norm": 0.19823089241981506, | |
| "kl": 0.004238128662109375, | |
| "learning_rate": 2.980142298168869e-06, | |
| "loss": 0.0469, | |
| "reward": 0.35416667722165585, | |
| "reward_std": 0.36417657136917114, | |
| "rewards/accuracy_reward": 0.35416667722165585, | |
| "rewards/format_reward": 0.0, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 1017.3333435058594, | |
| "epoch": 0.15841584158415842, | |
| "grad_norm": 0.1434755176305771, | |
| "kl": 0.006870269775390625, | |
| "learning_rate": 2.97408097005962e-06, | |
| "loss": 0.0274, | |
| "reward": 0.2500000074505806, | |
| "reward_std": 0.3506578952074051, | |
| "rewards/accuracy_reward": 0.2500000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 750.2917022705078, | |
| "epoch": 0.16501650165016502, | |
| "grad_norm": 0.13002969324588776, | |
| "kl": 0.0044384002685546875, | |
| "learning_rate": 2.9672214011007086e-06, | |
| "loss": 0.0874, | |
| "reward": 0.583333358168602, | |
| "reward_std": 0.3061862215399742, | |
| "rewards/accuracy_reward": 0.583333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 810.7916870117188, | |
| "epoch": 0.1716171617161716, | |
| "grad_norm": 0.23493841290473938, | |
| "kl": 0.008636474609375, | |
| "learning_rate": 2.959567305869736e-06, | |
| "loss": 0.0438, | |
| "reward": 0.4583333432674408, | |
| "reward_std": 0.32274864614009857, | |
| "rewards/accuracy_reward": 0.4583333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 814.3750152587891, | |
| "epoch": 0.1782178217821782, | |
| "grad_norm": 0.319170206785202, | |
| "kl": 0.007965087890625, | |
| "learning_rate": 2.951122829194296e-06, | |
| "loss": -0.0679, | |
| "reward": 0.5833333507180214, | |
| "reward_std": 0.24859580025076866, | |
| "rewards/accuracy_reward": 0.5833333507180214, | |
| "rewards/format_reward": 0.0, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 758.1041870117188, | |
| "epoch": 0.1848184818481848, | |
| "grad_norm": 0.27551642060279846, | |
| "kl": 0.007470130920410156, | |
| "learning_rate": 2.9418925439074784e-06, | |
| "loss": 0.0304, | |
| "reward": 0.6875000298023224, | |
| "reward_std": 0.3842546343803406, | |
| "rewards/accuracy_reward": 0.6875000298023224, | |
| "rewards/format_reward": 0.0, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 950.4375152587891, | |
| "epoch": 0.19141914191419143, | |
| "grad_norm": 0.10155142843723297, | |
| "kl": 0.021270751953125, | |
| "learning_rate": 2.9318814483715983e-06, | |
| "loss": 0.0413, | |
| "reward": 0.3541666679084301, | |
| "reward_std": 0.13301505148410797, | |
| "rewards/accuracy_reward": 0.3541666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 718.0208587646484, | |
| "epoch": 0.19801980198019803, | |
| "grad_norm": 0.25280967354774475, | |
| "kl": 0.01032257080078125, | |
| "learning_rate": 2.921094963771494e-06, | |
| "loss": -0.0332, | |
| "reward": 0.5416666772216558, | |
| "reward_std": 0.2861081622540951, | |
| "rewards/accuracy_reward": 0.5416666772216558, | |
| "rewards/format_reward": 0.0, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 836.3333587646484, | |
| "epoch": 0.20462046204620463, | |
| "grad_norm": 0.42321765422821045, | |
| "kl": 0.0301666259765625, | |
| "learning_rate": 2.9095389311788626e-06, | |
| "loss": -0.0053, | |
| "reward": 0.4583333544433117, | |
| "reward_std": 0.3506578803062439, | |
| "rewards/accuracy_reward": 0.4583333544433117, | |
| "rewards/format_reward": 0.0, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 704.520866394043, | |
| "epoch": 0.21122112211221122, | |
| "grad_norm": 0.23243634402751923, | |
| "kl": 0.015577316284179688, | |
| "learning_rate": 2.8972196083892137e-06, | |
| "loss": 0.0638, | |
| "reward": 0.6666666865348816, | |
| "reward_std": 0.3035424277186394, | |
| "rewards/accuracy_reward": 0.6666666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 819.6458587646484, | |
| "epoch": 0.21782178217821782, | |
| "grad_norm": 0.2788753807544708, | |
| "kl": 0.017120361328125, | |
| "learning_rate": 2.8841436665331635e-06, | |
| "loss": 0.1618, | |
| "reward": 0.583333358168602, | |
| "reward_std": 0.4326418936252594, | |
| "rewards/accuracy_reward": 0.583333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 953.5417175292969, | |
| "epoch": 0.22442244224422442, | |
| "grad_norm": 0.168825164437294, | |
| "kl": 0.0284271240234375, | |
| "learning_rate": 2.8703181864639013e-06, | |
| "loss": 0.1058, | |
| "reward": 0.4375000149011612, | |
| "reward_std": 0.43655748665332794, | |
| "rewards/accuracy_reward": 0.4375000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 712.7916870117188, | |
| "epoch": 0.23102310231023102, | |
| "grad_norm": 0.3284476101398468, | |
| "kl": 0.02169036865234375, | |
| "learning_rate": 2.855750654922781e-06, | |
| "loss": 0.0903, | |
| "reward": 0.708333358168602, | |
| "reward_std": 0.3236205019056797, | |
| "rewards/accuracy_reward": 0.708333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 975.9791870117188, | |
| "epoch": 0.2376237623762376, | |
| "grad_norm": 0.11752771586179733, | |
| "kl": 0.04449462890625, | |
| "learning_rate": 2.8404489604851183e-06, | |
| "loss": 0.0141, | |
| "reward": 0.3125000074505806, | |
| "reward_std": 0.3720077723264694, | |
| "rewards/accuracy_reward": 0.3125000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 727.8125, | |
| "epoch": 0.24422442244224424, | |
| "grad_norm": 0.20371825993061066, | |
| "kl": 0.0565185546875, | |
| "learning_rate": 2.8244213892883906e-06, | |
| "loss": 0.0715, | |
| "reward": 0.6666667014360428, | |
| "reward_std": 0.350657869130373, | |
| "rewards/accuracy_reward": 0.6666667014360428, | |
| "rewards/format_reward": 0.0, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 971.9375305175781, | |
| "epoch": 0.2508250825082508, | |
| "grad_norm": 0.10167910903692245, | |
| "kl": 0.0328369140625, | |
| "learning_rate": 2.8076766205451433e-06, | |
| "loss": 0.0277, | |
| "reward": 0.39583334885537624, | |
| "reward_std": 0.21764282882213593, | |
| "rewards/accuracy_reward": 0.39583334885537624, | |
| "rewards/format_reward": 0.0, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 996.3750305175781, | |
| "epoch": 0.25742574257425743, | |
| "grad_norm": 0.10488853603601456, | |
| "kl": 0.072265625, | |
| "learning_rate": 2.7902237218430485e-06, | |
| "loss": 0.0558, | |
| "reward": 0.3958333432674408, | |
| "reward_std": 0.29962684214115143, | |
| "rewards/accuracy_reward": 0.3958333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 786.3125152587891, | |
| "epoch": 0.264026402640264, | |
| "grad_norm": 0.11269073933362961, | |
| "kl": 0.0630340576171875, | |
| "learning_rate": 2.772072144234639e-06, | |
| "loss": 0.0412, | |
| "reward": 0.5833333432674408, | |
| "reward_std": 0.16661179438233376, | |
| "rewards/accuracy_reward": 0.5833333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 884.9375305175781, | |
| "epoch": 0.2706270627062706, | |
| "grad_norm": 0.1405339539051056, | |
| "kl": 0.0887451171875, | |
| "learning_rate": 2.753231717119405e-06, | |
| "loss": 0.1139, | |
| "reward": 0.6458333432674408, | |
| "reward_std": 0.44616057723760605, | |
| "rewards/accuracy_reward": 0.6458333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 861.7291870117188, | |
| "epoch": 0.27722772277227725, | |
| "grad_norm": 0.11663912236690521, | |
| "kl": 0.08587646484375, | |
| "learning_rate": 2.7337126429209934e-06, | |
| "loss": 0.1666, | |
| "reward": 0.5833333544433117, | |
| "reward_std": 0.23116152733564377, | |
| "rewards/accuracy_reward": 0.5833333544433117, | |
| "rewards/format_reward": 0.0, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 920.8958587646484, | |
| "epoch": 0.2838283828382838, | |
| "grad_norm": 0.1270524263381958, | |
| "kl": 0.1448974609375, | |
| "learning_rate": 2.713525491562421e-06, | |
| "loss": 0.1552, | |
| "reward": 0.5208333432674408, | |
| "reward_std": 0.28219256550073624, | |
| "rewards/accuracy_reward": 0.5208333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 1177.687515258789, | |
| "epoch": 0.29042904290429045, | |
| "grad_norm": 0.17139725387096405, | |
| "kl": 0.176513671875, | |
| "learning_rate": 2.6926811947422717e-06, | |
| "loss": 0.0787, | |
| "reward": 0.29166666977107525, | |
| "reward_std": 0.2957112528383732, | |
| "rewards/accuracy_reward": 0.29166666977107525, | |
| "rewards/format_reward": 0.0, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 951.2500305175781, | |
| "epoch": 0.297029702970297, | |
| "grad_norm": 0.11752501130104065, | |
| "kl": 0.1422119140625, | |
| "learning_rate": 2.671191040014989e-06, | |
| "loss": 0.1136, | |
| "reward": 0.4375000149011612, | |
| "reward_std": 0.309229951351881, | |
| "rewards/accuracy_reward": 0.4375000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 922.6458435058594, | |
| "epoch": 0.30363036303630364, | |
| "grad_norm": 0.16094517707824707, | |
| "kl": 0.15997314453125, | |
| "learning_rate": 2.649066664678467e-06, | |
| "loss": 0.0865, | |
| "reward": 0.416666679084301, | |
| "reward_std": 0.32274864614009857, | |
| "rewards/accuracy_reward": 0.416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 713.6458740234375, | |
| "epoch": 0.3102310231023102, | |
| "grad_norm": 0.17246519029140472, | |
| "kl": 0.08984375, | |
| "learning_rate": 2.626320049472249e-06, | |
| "loss": 0.1438, | |
| "reward": 0.5000000260770321, | |
| "reward_std": 0.3332235999405384, | |
| "rewards/accuracy_reward": 0.5000000260770321, | |
| "rewards/format_reward": 0.0, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 924.0208435058594, | |
| "epoch": 0.31683168316831684, | |
| "grad_norm": 0.13788333535194397, | |
| "kl": 0.11041259765625, | |
| "learning_rate": 2.6029635120897432e-06, | |
| "loss": 0.1128, | |
| "reward": 0.3125000111758709, | |
| "reward_std": 0.40168891102075577, | |
| "rewards/accuracy_reward": 0.3125000111758709, | |
| "rewards/format_reward": 0.0, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 1077.2292175292969, | |
| "epoch": 0.3234323432343234, | |
| "grad_norm": 0.12129193544387817, | |
| "kl": 0.18906784057617188, | |
| "learning_rate": 2.5790097005079765e-06, | |
| "loss": 0.0767, | |
| "reward": 0.3125000037252903, | |
| "reward_std": 0.2350771278142929, | |
| "rewards/accuracy_reward": 0.3125000037252903, | |
| "rewards/format_reward": 0.0, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 866.0416870117188, | |
| "epoch": 0.33003300330033003, | |
| "grad_norm": 0.12490338832139969, | |
| "kl": 0.1341552734375, | |
| "learning_rate": 2.5544715861384928e-06, | |
| "loss": 0.1316, | |
| "reward": 0.41666667722165585, | |
| "reward_std": 0.2957112640142441, | |
| "rewards/accuracy_reward": 0.41666667722165585, | |
| "rewards/format_reward": 0.0, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 851.5208587646484, | |
| "epoch": 0.33663366336633666, | |
| "grad_norm": 0.1823125034570694, | |
| "kl": 0.0992431640625, | |
| "learning_rate": 2.529362456803101e-06, | |
| "loss": 0.0959, | |
| "reward": 0.4791666828095913, | |
| "reward_std": 0.43655748665332794, | |
| "rewards/accuracy_reward": 0.4791666828095913, | |
| "rewards/format_reward": 0.0, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 1083.4583435058594, | |
| "epoch": 0.3432343234323432, | |
| "grad_norm": 0.12086265534162521, | |
| "kl": 0.25439453125, | |
| "learning_rate": 2.5036959095382875e-06, | |
| "loss": 0.1457, | |
| "reward": 0.16666667349636555, | |
| "reward_std": 0.24859580397605896, | |
| "rewards/accuracy_reward": 0.16666667349636555, | |
| "rewards/format_reward": 0.0, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 663.4583587646484, | |
| "epoch": 0.34983498349834985, | |
| "grad_norm": 0.1067744717001915, | |
| "kl": 0.0916290283203125, | |
| "learning_rate": 2.477485843232183e-06, | |
| "loss": 0.1077, | |
| "reward": 0.6666666716337204, | |
| "reward_std": 0.11949635669589043, | |
| "rewards/accuracy_reward": 0.6666666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 1239.0834045410156, | |
| "epoch": 0.3564356435643564, | |
| "grad_norm": 0.1427147090435028, | |
| "kl": 0.22607421875, | |
| "learning_rate": 2.4507464510980654e-06, | |
| "loss": 0.1453, | |
| "reward": 0.22916667722165585, | |
| "reward_std": 0.23507710918784142, | |
| "rewards/accuracy_reward": 0.22916667722165585, | |
| "rewards/format_reward": 0.0, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 835.8750152587891, | |
| "epoch": 0.36303630363036304, | |
| "grad_norm": 0.171806201338768, | |
| "kl": 0.1243896484375, | |
| "learning_rate": 2.4234922129884873e-06, | |
| "loss": 0.1779, | |
| "reward": 0.5625000055879354, | |
| "reward_std": 0.31970490142703056, | |
| "rewards/accuracy_reward": 0.5625000055879354, | |
| "rewards/format_reward": 0.0, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 1060.2916870117188, | |
| "epoch": 0.3696369636963696, | |
| "grad_norm": 0.11252501606941223, | |
| "kl": 0.179443359375, | |
| "learning_rate": 2.3957378875541795e-06, | |
| "loss": 0.1761, | |
| "reward": 0.3333333432674408, | |
| "reward_std": 0.30354245379567146, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 1031.6458435058594, | |
| "epoch": 0.37623762376237624, | |
| "grad_norm": 0.08250569552183151, | |
| "kl": 0.1865234375, | |
| "learning_rate": 2.36749850425198e-06, | |
| "loss": 0.0708, | |
| "reward": 0.33333334140479565, | |
| "reward_std": 0.31314554437994957, | |
| "rewards/accuracy_reward": 0.33333334140479565, | |
| "rewards/format_reward": 0.0, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 948.4583587646484, | |
| "epoch": 0.38283828382838286, | |
| "grad_norm": 0.07917948067188263, | |
| "kl": 0.1787109375, | |
| "learning_rate": 2.3387893552061204e-06, | |
| "loss": 0.0915, | |
| "reward": 0.35416666977107525, | |
| "reward_std": 0.1705274023115635, | |
| "rewards/accuracy_reward": 0.35416666977107525, | |
| "rewards/format_reward": 0.0, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 972.2083587646484, | |
| "epoch": 0.38943894389438943, | |
| "grad_norm": 0.1615315079689026, | |
| "kl": 0.1478271484375, | |
| "learning_rate": 2.3096259869272697e-06, | |
| "loss": 0.0861, | |
| "reward": 0.4375000223517418, | |
| "reward_std": 0.40168892592191696, | |
| "rewards/accuracy_reward": 0.4375000223517418, | |
| "rewards/format_reward": 0.0, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 922.2708587646484, | |
| "epoch": 0.39603960396039606, | |
| "grad_norm": 0.1217813566327095, | |
| "kl": 0.1329345703125, | |
| "learning_rate": 2.280024191893823e-06, | |
| "loss": 0.1169, | |
| "reward": 0.4583333507180214, | |
| "reward_std": 0.2957112640142441, | |
| "rewards/accuracy_reward": 0.4583333507180214, | |
| "rewards/format_reward": 0.0, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 923.3750152587891, | |
| "epoch": 0.40264026402640263, | |
| "grad_norm": 0.1521437168121338, | |
| "kl": 0.11083984375, | |
| "learning_rate": 2.25e-06, | |
| "loss": 0.122, | |
| "reward": 0.5625000149011612, | |
| "reward_std": 0.40168890357017517, | |
| "rewards/accuracy_reward": 0.5625000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 776.1250152587891, | |
| "epoch": 0.40924092409240925, | |
| "grad_norm": 0.15655189752578735, | |
| "kl": 0.13427734375, | |
| "learning_rate": 2.2195696698753695e-06, | |
| "loss": 0.0472, | |
| "reward": 0.3333333395421505, | |
| "reward_std": 0.3680921792984009, | |
| "rewards/accuracy_reward": 0.3333333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 927.9792022705078, | |
| "epoch": 0.4158415841584158, | |
| "grad_norm": 0.08691170066595078, | |
| "kl": 0.1297607421875, | |
| "learning_rate": 2.1887496800805174e-06, | |
| "loss": 0.0569, | |
| "reward": 0.5208333507180214, | |
| "reward_std": 0.2350771315395832, | |
| "rewards/accuracy_reward": 0.5208333507180214, | |
| "rewards/format_reward": 0.0, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 1035.8125, | |
| "epoch": 0.42244224422442245, | |
| "grad_norm": 0.10468967258930206, | |
| "kl": 0.16912841796875, | |
| "learning_rate": 2.157556720183616e-06, | |
| "loss": 0.009, | |
| "reward": 0.2708333395421505, | |
| "reward_std": 0.2996268570423126, | |
| "rewards/accuracy_reward": 0.2708333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 891.8333435058594, | |
| "epoch": 0.429042904290429, | |
| "grad_norm": 0.09926916658878326, | |
| "kl": 0.11190414428710938, | |
| "learning_rate": 2.126007681722727e-06, | |
| "loss": 0.0617, | |
| "reward": 0.5000000204890966, | |
| "reward_std": 0.2861081399023533, | |
| "rewards/accuracy_reward": 0.5000000204890966, | |
| "rewards/format_reward": 0.0, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 756.2708587646484, | |
| "epoch": 0.43564356435643564, | |
| "grad_norm": 0.14332562685012817, | |
| "kl": 0.09716796875, | |
| "learning_rate": 2.0941196490587354e-06, | |
| "loss": 0.0435, | |
| "reward": 0.5000000298023224, | |
| "reward_std": 0.2686738669872284, | |
| "rewards/accuracy_reward": 0.5000000298023224, | |
| "rewards/format_reward": 0.0, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 889.8125228881836, | |
| "epoch": 0.44224422442244227, | |
| "grad_norm": 0.16792258620262146, | |
| "kl": 0.1457672119140625, | |
| "learning_rate": 2.061909890123868e-06, | |
| "loss": 0.1384, | |
| "reward": 0.5416666865348816, | |
| "reward_std": 0.3506578765809536, | |
| "rewards/accuracy_reward": 0.5416666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 870.2292098999023, | |
| "epoch": 0.44884488448844884, | |
| "grad_norm": 0.1451350599527359, | |
| "kl": 0.1453857421875, | |
| "learning_rate": 2.0293958470708033e-06, | |
| "loss": 0.0586, | |
| "reward": 0.3750000149011612, | |
| "reward_std": 0.23899272456765175, | |
| "rewards/accuracy_reward": 0.3750000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 931.5625305175781, | |
| "epoch": 0.45544554455445546, | |
| "grad_norm": 0.17694266140460968, | |
| "kl": 0.2371826171875, | |
| "learning_rate": 1.9965951268274372e-06, | |
| "loss": 0.1671, | |
| "reward": 0.3750000149011612, | |
| "reward_std": 0.26603008806705475, | |
| "rewards/accuracy_reward": 0.3750000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 767.4167022705078, | |
| "epoch": 0.46204620462046203, | |
| "grad_norm": 0.13988901674747467, | |
| "kl": 0.1268310546875, | |
| "learning_rate": 1.963525491562421e-06, | |
| "loss": -0.0182, | |
| "reward": 0.5833333432674408, | |
| "reward_std": 0.2957112491130829, | |
| "rewards/accuracy_reward": 0.5833333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 945.3125305175781, | |
| "epoch": 0.46864686468646866, | |
| "grad_norm": 0.17755043506622314, | |
| "kl": 0.244384765625, | |
| "learning_rate": 1.9302048490666355e-06, | |
| "loss": 0.1788, | |
| "reward": 0.3541666828095913, | |
| "reward_std": 0.4662386551499367, | |
| "rewards/accuracy_reward": 0.3541666828095913, | |
| "rewards/format_reward": 0.0, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 935.2083435058594, | |
| "epoch": 0.4752475247524752, | |
| "grad_norm": 0.14559805393218994, | |
| "kl": 0.17041015625, | |
| "learning_rate": 1.8966512430558036e-06, | |
| "loss": 0.1942, | |
| "reward": 0.416666679084301, | |
| "reward_std": 0.4596792608499527, | |
| "rewards/accuracy_reward": 0.416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 908.1458587646484, | |
| "epoch": 0.48184818481848185, | |
| "grad_norm": 0.11466598510742188, | |
| "kl": 0.2431640625, | |
| "learning_rate": 1.8628828433995015e-06, | |
| "loss": -0.0209, | |
| "reward": 0.31250001676380634, | |
| "reward_std": 0.21764283999800682, | |
| "rewards/accuracy_reward": 0.31250001676380634, | |
| "rewards/format_reward": 0.0, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 868.7917022705078, | |
| "epoch": 0.4884488448844885, | |
| "grad_norm": 0.11173044145107269, | |
| "kl": 0.1383056640625, | |
| "learning_rate": 1.828917936281855e-06, | |
| "loss": 0.0838, | |
| "reward": 0.35416667722165585, | |
| "reward_std": 0.1530931033194065, | |
| "rewards/accuracy_reward": 0.35416667722165585, | |
| "rewards/format_reward": 0.0, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 933.4583587646484, | |
| "epoch": 0.49504950495049505, | |
| "grad_norm": 0.12861062586307526, | |
| "kl": 0.17724609375, | |
| "learning_rate": 1.7947749142992453e-06, | |
| "loss": 0.0587, | |
| "reward": 0.2291666716337204, | |
| "reward_std": 0.35457348823547363, | |
| "rewards/accuracy_reward": 0.2291666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 924.4166717529297, | |
| "epoch": 0.5016501650165016, | |
| "grad_norm": 0.09584160894155502, | |
| "kl": 0.221435546875, | |
| "learning_rate": 1.7604722665003958e-06, | |
| "loss": 0.0786, | |
| "reward": 0.16666667722165585, | |
| "reward_std": 0.18404607474803925, | |
| "rewards/accuracy_reward": 0.16666667722165585, | |
| "rewards/format_reward": 0.0, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 1081.0208435058594, | |
| "epoch": 0.5082508250825083, | |
| "grad_norm": 0.11683700233697891, | |
| "kl": 0.1298828125, | |
| "learning_rate": 1.7260285683742248e-06, | |
| "loss": 0.1236, | |
| "reward": 0.416666679084301, | |
| "reward_std": 0.3680921830236912, | |
| "rewards/accuracy_reward": 0.416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 978.3333435058594, | |
| "epoch": 0.5148514851485149, | |
| "grad_norm": 0.09451648592948914, | |
| "kl": 0.1480712890625, | |
| "learning_rate": 1.6914624717908924e-06, | |
| "loss": 0.0095, | |
| "reward": 0.2708333395421505, | |
| "reward_std": 0.34674229472875595, | |
| "rewards/accuracy_reward": 0.2708333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 1018.5417175292969, | |
| "epoch": 0.5214521452145214, | |
| "grad_norm": 0.10737847536802292, | |
| "kl": 0.217529296875, | |
| "learning_rate": 1.6567926949014804e-06, | |
| "loss": 0.0826, | |
| "reward": 0.3750000074505806, | |
| "reward_std": 0.24859581142663956, | |
| "rewards/accuracy_reward": 0.3750000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 893.5000457763672, | |
| "epoch": 0.528052805280528, | |
| "grad_norm": 0.1193479523062706, | |
| "kl": 0.171875, | |
| "learning_rate": 1.6220380120017874e-06, | |
| "loss": 0.045, | |
| "reward": 0.43750002048909664, | |
| "reward_std": 0.35457346960902214, | |
| "rewards/accuracy_reward": 0.43750002048909664, | |
| "rewards/format_reward": 0.0, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 772.6041717529297, | |
| "epoch": 0.5346534653465347, | |
| "grad_norm": 0.1544453501701355, | |
| "kl": 0.15875244140625, | |
| "learning_rate": 1.5872172433657137e-06, | |
| "loss": -0.0317, | |
| "reward": 0.5416666828095913, | |
| "reward_std": 0.3332236036658287, | |
| "rewards/accuracy_reward": 0.5416666828095913, | |
| "rewards/format_reward": 0.0, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 817.8750305175781, | |
| "epoch": 0.5412541254125413, | |
| "grad_norm": 0.08425965160131454, | |
| "kl": 0.09405517578125, | |
| "learning_rate": 1.5523492450537518e-06, | |
| "loss": 0.0603, | |
| "reward": 0.5000000055879354, | |
| "reward_std": 0.23899271711707115, | |
| "rewards/accuracy_reward": 0.5000000055879354, | |
| "rewards/format_reward": 0.0, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 883.8542175292969, | |
| "epoch": 0.5478547854785478, | |
| "grad_norm": 0.14589935541152954, | |
| "kl": 0.218994140625, | |
| "learning_rate": 1.5174528987020958e-06, | |
| "loss": 0.1737, | |
| "reward": 0.3750000037252903, | |
| "reward_std": 0.4248107075691223, | |
| "rewards/accuracy_reward": 0.3750000037252903, | |
| "rewards/format_reward": 0.0, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 1053.2083740234375, | |
| "epoch": 0.5544554455445545, | |
| "grad_norm": 0.09967659413814545, | |
| "kl": 0.287841796875, | |
| "learning_rate": 1.4825471012979047e-06, | |
| "loss": -0.0045, | |
| "reward": 0.25000000558793545, | |
| "reward_std": 0.10206207260489464, | |
| "rewards/accuracy_reward": 0.25000000558793545, | |
| "rewards/format_reward": 0.0, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 1129.2083587646484, | |
| "epoch": 0.5610561056105611, | |
| "grad_norm": 0.0934222936630249, | |
| "kl": 0.2568359375, | |
| "learning_rate": 1.4476507549462489e-06, | |
| "loss": 0.0961, | |
| "reward": 0.37500001303851604, | |
| "reward_std": 0.2957112528383732, | |
| "rewards/accuracy_reward": 0.37500001303851604, | |
| "rewards/format_reward": 0.0, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 904.125, | |
| "epoch": 0.5676567656765676, | |
| "grad_norm": 0.13640955090522766, | |
| "kl": 0.2188720703125, | |
| "learning_rate": 1.4127827566342864e-06, | |
| "loss": 0.0729, | |
| "reward": 0.354166679084301, | |
| "reward_std": 0.3720077723264694, | |
| "rewards/accuracy_reward": 0.354166679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 931.2291870117188, | |
| "epoch": 0.5742574257425742, | |
| "grad_norm": 0.21060959994792938, | |
| "kl": 0.2344970703125, | |
| "learning_rate": 1.3779619879982127e-06, | |
| "loss": 0.1297, | |
| "reward": 0.4583333507180214, | |
| "reward_std": 0.3506578877568245, | |
| "rewards/accuracy_reward": 0.4583333507180214, | |
| "rewards/format_reward": 0.0, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 997.7916717529297, | |
| "epoch": 0.5808580858085809, | |
| "grad_norm": 0.14985467493534088, | |
| "kl": 0.306396484375, | |
| "learning_rate": 1.3432073050985201e-06, | |
| "loss": 0.0972, | |
| "reward": 0.25, | |
| "reward_std": 0.1369306445121765, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 0.0, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 931.5833587646484, | |
| "epoch": 0.5874587458745875, | |
| "grad_norm": 0.33125752210617065, | |
| "kl": 0.2950439453125, | |
| "learning_rate": 1.308537528209108e-06, | |
| "loss": 0.1418, | |
| "reward": 0.5208333507180214, | |
| "reward_std": 0.2900237627327442, | |
| "rewards/accuracy_reward": 0.5208333507180214, | |
| "rewards/format_reward": 0.0, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 1021.1666870117188, | |
| "epoch": 0.594059405940594, | |
| "grad_norm": 0.16106674075126648, | |
| "kl": 0.379638671875, | |
| "learning_rate": 1.2739714316257753e-06, | |
| "loss": 0.0325, | |
| "reward": 0.3333333432674408, | |
| "reward_std": 0.3776952549815178, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 988.3125305175781, | |
| "epoch": 0.6006600660066007, | |
| "grad_norm": 16.83741569519043, | |
| "kl": 0.857177734375, | |
| "learning_rate": 1.2395277334996047e-06, | |
| "loss": 0.1811, | |
| "reward": 0.27083334140479565, | |
| "reward_std": 0.37465154752135277, | |
| "rewards/accuracy_reward": 0.27083334140479565, | |
| "rewards/format_reward": 0.0, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 828.8125305175781, | |
| "epoch": 0.6072607260726073, | |
| "grad_norm": 0.5444411039352417, | |
| "kl": 0.21484375, | |
| "learning_rate": 1.2052250857007548e-06, | |
| "loss": 0.1801, | |
| "reward": 0.5625000223517418, | |
| "reward_std": 0.38161084055900574, | |
| "rewards/accuracy_reward": 0.5625000223517418, | |
| "rewards/format_reward": 0.0, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 1117.7917022705078, | |
| "epoch": 0.6138613861386139, | |
| "grad_norm": 0.33171162009239197, | |
| "kl": 0.449462890625, | |
| "learning_rate": 1.1710820637181448e-06, | |
| "loss": 0.2095, | |
| "reward": 0.2500000111758709, | |
| "reward_std": 0.26603008806705475, | |
| "rewards/accuracy_reward": 0.2500000111758709, | |
| "rewards/format_reward": 0.0, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 825.5208740234375, | |
| "epoch": 0.6204620462046204, | |
| "grad_norm": 0.30983299016952515, | |
| "kl": 0.7236328125, | |
| "learning_rate": 1.1371171566004986e-06, | |
| "loss": 0.0181, | |
| "reward": 0.3333333432674408, | |
| "reward_std": 0.20148035883903503, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 973.3750152587891, | |
| "epoch": 0.6270627062706271, | |
| "grad_norm": 0.7808173894882202, | |
| "kl": 0.62158203125, | |
| "learning_rate": 1.103348756944197e-06, | |
| "loss": 0.1762, | |
| "reward": 0.3750000037252903, | |
| "reward_std": 0.4422450140118599, | |
| "rewards/accuracy_reward": 0.3750000037252903, | |
| "rewards/format_reward": 0.0, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 991.2708587646484, | |
| "epoch": 0.6336633663366337, | |
| "grad_norm": 0.6956799030303955, | |
| "kl": 0.50244140625, | |
| "learning_rate": 1.069795150933365e-06, | |
| "loss": 0.1548, | |
| "reward": 0.2291666753590107, | |
| "reward_std": 0.2996268570423126, | |
| "rewards/accuracy_reward": 0.2291666753590107, | |
| "rewards/format_reward": 0.0, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 845.6041870117188, | |
| "epoch": 0.6402640264026402, | |
| "grad_norm": 0.6455105543136597, | |
| "kl": 0.8310546875, | |
| "learning_rate": 1.036474508437579e-06, | |
| "loss": 0.0902, | |
| "reward": 0.2916666716337204, | |
| "reward_std": 0.11949635669589043, | |
| "rewards/accuracy_reward": 0.2916666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 961.2708587646484, | |
| "epoch": 0.6468646864686468, | |
| "grad_norm": 1.0175178050994873, | |
| "kl": 1.361572265625, | |
| "learning_rate": 1.003404873172563e-06, | |
| "loss": 0.2628, | |
| "reward": 0.2916666716337204, | |
| "reward_std": 0.3506578765809536, | |
| "rewards/accuracy_reward": 0.2916666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 801.7500152587891, | |
| "epoch": 0.6534653465346535, | |
| "grad_norm": 0.9493899941444397, | |
| "kl": 1.8583984375, | |
| "learning_rate": 9.70604152929197e-07, | |
| "loss": 0.1342, | |
| "reward": 0.27083333395421505, | |
| "reward_std": 0.33713919296860695, | |
| "rewards/accuracy_reward": 0.27083333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 1018.9583587646484, | |
| "epoch": 0.6600660066006601, | |
| "grad_norm": 0.641622006893158, | |
| "kl": 1.20654296875, | |
| "learning_rate": 9.380901098761319e-07, | |
| "loss": 0.1288, | |
| "reward": 0.354166679084301, | |
| "reward_std": 0.21764283999800682, | |
| "rewards/accuracy_reward": 0.354166679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 991.6666870117188, | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 3.699712038040161, | |
| "kl": 2.2578125, | |
| "learning_rate": 9.058803509412648e-07, | |
| "loss": 0.1524, | |
| "reward": 0.20833334140479565, | |
| "reward_std": 0.2686738781630993, | |
| "rewards/accuracy_reward": 0.20833334140479565, | |
| "rewards/format_reward": 0.0, | |
| "step": 101 | |
| }, | |
| { | |
| "completion_length": 851.1458740234375, | |
| "epoch": 0.6732673267326733, | |
| "grad_norm": 0.7072895765304565, | |
| "kl": 0.2276611328125, | |
| "learning_rate": 8.739923182772733e-07, | |
| "loss": 0.0363, | |
| "reward": 0.5000000074505806, | |
| "reward_std": 0.341054804623127, | |
| "rewards/accuracy_reward": 0.5000000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 102 | |
| }, | |
| { | |
| "completion_length": 609.0000152587891, | |
| "epoch": 0.6798679867986799, | |
| "grad_norm": 1.4843007326126099, | |
| "kl": 0.269775390625, | |
| "learning_rate": 8.424432798163837e-07, | |
| "loss": 0.03, | |
| "reward": 0.604166679084301, | |
| "reward_std": 0.28219255805015564, | |
| "rewards/accuracy_reward": 0.604166679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 103 | |
| }, | |
| { | |
| "completion_length": 905.5625152587891, | |
| "epoch": 0.6864686468646864, | |
| "grad_norm": 1.7519278526306152, | |
| "kl": 0.4345703125, | |
| "learning_rate": 8.112503199194822e-07, | |
| "loss": 0.026, | |
| "reward": 0.20833333767950535, | |
| "reward_std": 0.3602609820663929, | |
| "rewards/accuracy_reward": 0.20833333767950535, | |
| "rewards/format_reward": 0.0, | |
| "step": 104 | |
| }, | |
| { | |
| "completion_length": 880.8541870117188, | |
| "epoch": 0.693069306930693, | |
| "grad_norm": 2.7430357933044434, | |
| "kl": 2.5283203125, | |
| "learning_rate": 7.804303301246311e-07, | |
| "loss": 0.212, | |
| "reward": 0.31250001303851604, | |
| "reward_std": 0.21764283999800682, | |
| "rewards/accuracy_reward": 0.31250001303851604, | |
| "rewards/format_reward": 0.0, | |
| "step": 105 | |
| }, | |
| { | |
| "completion_length": 1010.9792022705078, | |
| "epoch": 0.6996699669966997, | |
| "grad_norm": 1.1869699954986572, | |
| "kl": 2.6875, | |
| "learning_rate": 7.500000000000003e-07, | |
| "loss": 0.2102, | |
| "reward": 0.37500001676380634, | |
| "reward_std": 0.3131455294787884, | |
| "rewards/accuracy_reward": 0.37500001676380634, | |
| "rewards/format_reward": 0.0, | |
| "step": 106 | |
| }, | |
| { | |
| "completion_length": 1013.1875305175781, | |
| "epoch": 0.7062706270627063, | |
| "grad_norm": 1.6787883043289185, | |
| "kl": 1.580078125, | |
| "learning_rate": 7.19975808106177e-07, | |
| "loss": 0.215, | |
| "reward": 0.2500000074505806, | |
| "reward_std": 0.31314554065465927, | |
| "rewards/accuracy_reward": 0.2500000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 107 | |
| }, | |
| { | |
| "completion_length": 818.2917022705078, | |
| "epoch": 0.7128712871287128, | |
| "grad_norm": 1.6915416717529297, | |
| "kl": 0.703125, | |
| "learning_rate": 6.903740130727312e-07, | |
| "loss": 0.1459, | |
| "reward": 0.5000000149011612, | |
| "reward_std": 0.33057980239391327, | |
| "rewards/accuracy_reward": 0.5000000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 108 | |
| }, | |
| { | |
| "completion_length": 1075.5833740234375, | |
| "epoch": 0.7194719471947195, | |
| "grad_norm": 0.8402836918830872, | |
| "kl": 0.9658203125, | |
| "learning_rate": 6.6121064479388e-07, | |
| "loss": 0.0926, | |
| "reward": 0.2916666716337204, | |
| "reward_std": 0.19364918768405914, | |
| "rewards/accuracy_reward": 0.2916666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 109 | |
| }, | |
| { | |
| "completion_length": 848.4375305175781, | |
| "epoch": 0.7260726072607261, | |
| "grad_norm": 4.569257736206055, | |
| "kl": 0.77294921875, | |
| "learning_rate": 6.325014957480202e-07, | |
| "loss": 0.221, | |
| "reward": 0.5416666865348816, | |
| "reward_std": 0.4422449842095375, | |
| "rewards/accuracy_reward": 0.5416666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 891.8750381469727, | |
| "epoch": 0.7326732673267327, | |
| "grad_norm": 1.430153489112854, | |
| "kl": 1.53662109375, | |
| "learning_rate": 6.04262112445821e-07, | |
| "loss": 0.1013, | |
| "reward": 0.43750000558793545, | |
| "reward_std": 0.2350771203637123, | |
| "rewards/accuracy_reward": 0.43750000558793545, | |
| "rewards/format_reward": 0.0, | |
| "step": 111 | |
| }, | |
| { | |
| "completion_length": 755.6458435058594, | |
| "epoch": 0.7392739273927392, | |
| "grad_norm": 0.3871193826198578, | |
| "kl": 0.632415771484375, | |
| "learning_rate": 5.765077870115125e-07, | |
| "loss": 0.0524, | |
| "reward": 0.43750002048909664, | |
| "reward_std": 0.21764283627271652, | |
| "rewards/accuracy_reward": 0.43750002048909664, | |
| "rewards/format_reward": 0.0, | |
| "step": 112 | |
| }, | |
| { | |
| "completion_length": 899.6875305175781, | |
| "epoch": 0.7458745874587459, | |
| "grad_norm": 2.0980653762817383, | |
| "kl": 0.4720458984375, | |
| "learning_rate": 5.492535489019345e-07, | |
| "loss": 0.1302, | |
| "reward": 0.41666667349636555, | |
| "reward_std": 0.3332235962152481, | |
| "rewards/accuracy_reward": 0.41666667349636555, | |
| "rewards/format_reward": 0.0, | |
| "step": 113 | |
| }, | |
| { | |
| "completion_length": 769.0417022705078, | |
| "epoch": 0.7524752475247525, | |
| "grad_norm": 2.5095949172973633, | |
| "kl": 0.71484375, | |
| "learning_rate": 5.225141567678172e-07, | |
| "loss": 0.1271, | |
| "reward": 0.5416666865348816, | |
| "reward_std": 0.4500761739909649, | |
| "rewards/accuracy_reward": 0.5416666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 114 | |
| }, | |
| { | |
| "completion_length": 1073.0416870117188, | |
| "epoch": 0.759075907590759, | |
| "grad_norm": 1.9907251596450806, | |
| "kl": 1.24365234375, | |
| "learning_rate": 4.963040904617131e-07, | |
| "loss": 0.1739, | |
| "reward": 0.3333333507180214, | |
| "reward_std": 0.350657869130373, | |
| "rewards/accuracy_reward": 0.3333333507180214, | |
| "rewards/format_reward": 0.0, | |
| "step": 115 | |
| }, | |
| { | |
| "completion_length": 935.7500305175781, | |
| "epoch": 0.7656765676567657, | |
| "grad_norm": 3.745811939239502, | |
| "kl": 3.703125, | |
| "learning_rate": 4.7063754319689976e-07, | |
| "loss": 0.2618, | |
| "reward": 0.1458333395421505, | |
| "reward_std": 0.23507710546255112, | |
| "rewards/accuracy_reward": 0.1458333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 116 | |
| }, | |
| { | |
| "completion_length": 1031.5208587646484, | |
| "epoch": 0.7722772277227723, | |
| "grad_norm": 5.434175968170166, | |
| "kl": 3.8812255859375, | |
| "learning_rate": 4.4552841386150737e-07, | |
| "loss": 0.2507, | |
| "reward": 0.22916666977107525, | |
| "reward_std": 0.25515518710017204, | |
| "rewards/accuracy_reward": 0.22916666977107525, | |
| "rewards/format_reward": 0.0, | |
| "step": 117 | |
| }, | |
| { | |
| "completion_length": 894.9375305175781, | |
| "epoch": 0.7788778877887789, | |
| "grad_norm": 2.229215621948242, | |
| "kl": 1.484375, | |
| "learning_rate": 4.2099029949202353e-07, | |
| "loss": 0.2131, | |
| "reward": 0.5625000223517418, | |
| "reward_std": 0.39208584651350975, | |
| "rewards/accuracy_reward": 0.5625000223517418, | |
| "rewards/format_reward": 0.0, | |
| "step": 118 | |
| }, | |
| { | |
| "completion_length": 884.8333435058594, | |
| "epoch": 0.7854785478547854, | |
| "grad_norm": 3.206272602081299, | |
| "kl": 2.19384765625, | |
| "learning_rate": 3.9703648791025716e-07, | |
| "loss": 0.121, | |
| "reward": 0.2291666753590107, | |
| "reward_std": 0.2621144950389862, | |
| "rewards/accuracy_reward": 0.2291666753590107, | |
| "rewards/format_reward": 0.0, | |
| "step": 119 | |
| }, | |
| { | |
| "completion_length": 927.5208587646484, | |
| "epoch": 0.7920792079207921, | |
| "grad_norm": 1.4413719177246094, | |
| "kl": 1.4619140625, | |
| "learning_rate": 3.736799505277512e-07, | |
| "loss": 0.2241, | |
| "reward": 0.3750000074505806, | |
| "reward_std": 0.2957112640142441, | |
| "rewards/accuracy_reward": 0.3750000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 1030.8333740234375, | |
| "epoch": 0.7986798679867987, | |
| "grad_norm": 2.288118362426758, | |
| "kl": 2.2021484375, | |
| "learning_rate": 3.5093333532153313e-07, | |
| "loss": 0.2132, | |
| "reward": 0.5000000149011612, | |
| "reward_std": 0.3332235887646675, | |
| "rewards/accuracy_reward": 0.5000000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 121 | |
| }, | |
| { | |
| "completion_length": 795.2291870117188, | |
| "epoch": 0.8052805280528053, | |
| "grad_norm": 4.600270748138428, | |
| "kl": 0.909210205078125, | |
| "learning_rate": 3.288089599850112e-07, | |
| "loss": 0.1673, | |
| "reward": 0.6250000149011612, | |
| "reward_std": 0.23116153478622437, | |
| "rewards/accuracy_reward": 0.6250000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 122 | |
| }, | |
| { | |
| "completion_length": 820.2708740234375, | |
| "epoch": 0.8118811881188119, | |
| "grad_norm": 1.4757143259048462, | |
| "kl": 0.82757568359375, | |
| "learning_rate": 3.073188052577282e-07, | |
| "loss": 0.0951, | |
| "reward": 0.3333333432674408, | |
| "reward_std": 0.20412414520978928, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 123 | |
| }, | |
| { | |
| "completion_length": 1093.0417022705078, | |
| "epoch": 0.8184818481848185, | |
| "grad_norm": 1.748262643814087, | |
| "kl": 2.27734375, | |
| "learning_rate": 2.86474508437579e-07, | |
| "loss": 0.2423, | |
| "reward": 0.37500001303851604, | |
| "reward_std": 0.4248107150197029, | |
| "rewards/accuracy_reward": 0.37500001303851604, | |
| "rewards/format_reward": 0.0, | |
| "step": 124 | |
| }, | |
| { | |
| "completion_length": 906.8958587646484, | |
| "epoch": 0.8250825082508251, | |
| "grad_norm": 1.2707668542861938, | |
| "kl": 2.072265625, | |
| "learning_rate": 2.6628735707900655e-07, | |
| "loss": 0.1805, | |
| "reward": 0.3750000074505806, | |
| "reward_std": 0.3332236036658287, | |
| "rewards/accuracy_reward": 0.3750000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 125 | |
| }, | |
| { | |
| "completion_length": 1030.1041717529297, | |
| "epoch": 0.8316831683168316, | |
| "grad_norm": 1.1277378797531128, | |
| "kl": 2.2890625, | |
| "learning_rate": 2.467682828805956e-07, | |
| "loss": 0.1976, | |
| "reward": 0.3125000111758709, | |
| "reward_std": 0.28219256177544594, | |
| "rewards/accuracy_reward": 0.3125000111758709, | |
| "rewards/format_reward": 0.0, | |
| "step": 126 | |
| }, | |
| { | |
| "completion_length": 740.7291870117188, | |
| "epoch": 0.8382838283828383, | |
| "grad_norm": 1.2501320838928223, | |
| "kl": 1.093414306640625, | |
| "learning_rate": 2.2792785576536108e-07, | |
| "loss": 0.1343, | |
| "reward": 0.6666666716337204, | |
| "reward_std": 0.16661179438233376, | |
| "rewards/accuracy_reward": 0.6666666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 127 | |
| }, | |
| { | |
| "completion_length": 924.0625305175781, | |
| "epoch": 0.8448844884488449, | |
| "grad_norm": 1.4948713779449463, | |
| "kl": 1.78173828125, | |
| "learning_rate": 2.0977627815695215e-07, | |
| "loss": 0.1317, | |
| "reward": 0.3333333507180214, | |
| "reward_std": 0.23116152361035347, | |
| "rewards/accuracy_reward": 0.3333333507180214, | |
| "rewards/format_reward": 0.0, | |
| "step": 128 | |
| }, | |
| { | |
| "completion_length": 942.5208740234375, | |
| "epoch": 0.8514851485148515, | |
| "grad_norm": 0.5847099423408508, | |
| "kl": 0.99560546875, | |
| "learning_rate": 1.9232337945485655e-07, | |
| "loss": 0.0936, | |
| "reward": 0.4375, | |
| "reward_std": 0.06846532225608826, | |
| "rewards/accuracy_reward": 0.4375, | |
| "rewards/format_reward": 0.0, | |
| "step": 129 | |
| }, | |
| { | |
| "completion_length": 917.1666870117188, | |
| "epoch": 0.858085808580858, | |
| "grad_norm": 0.5804275274276733, | |
| "kl": 0.701263427734375, | |
| "learning_rate": 1.7557861071160953e-07, | |
| "loss": 0.0218, | |
| "reward": 0.31250000558793545, | |
| "reward_std": 0.2446802258491516, | |
| "rewards/accuracy_reward": 0.31250000558793545, | |
| "rewards/format_reward": 0.0, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 1004.8333587646484, | |
| "epoch": 0.8646864686468647, | |
| "grad_norm": 0.8942325711250305, | |
| "kl": 0.6883544921875, | |
| "learning_rate": 1.5955103951488177e-07, | |
| "loss": 0.1236, | |
| "reward": 0.35416667722165585, | |
| "reward_std": 0.36417656019330025, | |
| "rewards/accuracy_reward": 0.35416667722165585, | |
| "rewards/format_reward": 0.0, | |
| "step": 131 | |
| }, | |
| { | |
| "completion_length": 881.7916870117188, | |
| "epoch": 0.8712871287128713, | |
| "grad_norm": 2.220564365386963, | |
| "kl": 0.78369140625, | |
| "learning_rate": 1.4424934507721927e-07, | |
| "loss": 0.1166, | |
| "reward": 0.41666667722165585, | |
| "reward_std": 0.2686738669872284, | |
| "rewards/accuracy_reward": 0.41666667722165585, | |
| "rewards/format_reward": 0.0, | |
| "step": 132 | |
| }, | |
| { | |
| "completion_length": 997.3541870117188, | |
| "epoch": 0.8778877887788779, | |
| "grad_norm": 1.1939600706100464, | |
| "kl": 0.900390625, | |
| "learning_rate": 1.2968181353609853e-07, | |
| "loss": 0.0486, | |
| "reward": 0.3125000111758709, | |
| "reward_std": 0.1801304891705513, | |
| "rewards/accuracy_reward": 0.3125000111758709, | |
| "rewards/format_reward": 0.0, | |
| "step": 133 | |
| }, | |
| { | |
| "completion_length": 728.3541793823242, | |
| "epoch": 0.8844884488448845, | |
| "grad_norm": 0.9555450677871704, | |
| "kl": 0.58154296875, | |
| "learning_rate": 1.1585633346683655e-07, | |
| "loss": 0.0358, | |
| "reward": 0.45833334885537624, | |
| "reward_std": 0.3332235924899578, | |
| "rewards/accuracy_reward": 0.45833334885537624, | |
| "rewards/format_reward": 0.0, | |
| "step": 134 | |
| }, | |
| { | |
| "completion_length": 979.4167175292969, | |
| "epoch": 0.8910891089108911, | |
| "grad_norm": 0.5510440468788147, | |
| "kl": 0.7451171875, | |
| "learning_rate": 1.0278039161078634e-07, | |
| "loss": 0.0197, | |
| "reward": 0.3125000111758709, | |
| "reward_std": 0.25515517219901085, | |
| "rewards/accuracy_reward": 0.3125000111758709, | |
| "rewards/format_reward": 0.0, | |
| "step": 135 | |
| }, | |
| { | |
| "completion_length": 975.3750152587891, | |
| "epoch": 0.8976897689768977, | |
| "grad_norm": 0.4859466552734375, | |
| "kl": 0.39111328125, | |
| "learning_rate": 9.046106882113752e-08, | |
| "loss": 0.0393, | |
| "reward": 0.4791666716337204, | |
| "reward_std": 0.2525114044547081, | |
| "rewards/accuracy_reward": 0.4791666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 136 | |
| }, | |
| { | |
| "completion_length": 873.8333587646484, | |
| "epoch": 0.9042904290429042, | |
| "grad_norm": 0.8227368593215942, | |
| "kl": 0.5703125, | |
| "learning_rate": 7.89050362285062e-08, | |
| "loss": 0.0192, | |
| "reward": 0.4583333507180214, | |
| "reward_std": 0.3855264447629452, | |
| "rewards/accuracy_reward": 0.4583333507180214, | |
| "rewards/format_reward": 0.0, | |
| "step": 137 | |
| }, | |
| { | |
| "completion_length": 879.6667022705078, | |
| "epoch": 0.9108910891089109, | |
| "grad_norm": 2.8329885005950928, | |
| "kl": 0.44970703125, | |
| "learning_rate": 6.811855162840214e-08, | |
| "loss": 0.1331, | |
| "reward": 0.4791666753590107, | |
| "reward_std": 0.27258947491645813, | |
| "rewards/accuracy_reward": 0.4791666753590107, | |
| "rewards/format_reward": 0.0, | |
| "step": 138 | |
| }, | |
| { | |
| "completion_length": 677.9166870117188, | |
| "epoch": 0.9174917491749175, | |
| "grad_norm": 2.366013288497925, | |
| "kl": 0.3238067626953125, | |
| "learning_rate": 5.810745609252166e-08, | |
| "loss": 0.1267, | |
| "reward": 0.5208333637565374, | |
| "reward_std": 0.25515517592430115, | |
| "rewards/accuracy_reward": 0.5208333637565374, | |
| "rewards/format_reward": 0.0, | |
| "step": 139 | |
| }, | |
| { | |
| "completion_length": 1135.9791870117188, | |
| "epoch": 0.9240924092409241, | |
| "grad_norm": 0.692542314529419, | |
| "kl": 1.0966796875, | |
| "learning_rate": 4.887717080570431e-08, | |
| "loss": 0.1422, | |
| "reward": 0.06250000186264515, | |
| "reward_std": 0.1530931107699871, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 1054.6875305175781, | |
| "epoch": 0.9306930693069307, | |
| "grad_norm": 2.231672763824463, | |
| "kl": 0.7978515625, | |
| "learning_rate": 4.0432694130264294e-08, | |
| "loss": 0.166, | |
| "reward": 0.29166667722165585, | |
| "reward_std": 0.2686738669872284, | |
| "rewards/accuracy_reward": 0.29166667722165585, | |
| "rewards/format_reward": 0.0, | |
| "step": 141 | |
| }, | |
| { | |
| "completion_length": 692.4375152587891, | |
| "epoch": 0.9372937293729373, | |
| "grad_norm": 0.28133484721183777, | |
| "kl": 0.2666015625, | |
| "learning_rate": 3.277859889929147e-08, | |
| "loss": -0.0049, | |
| "reward": 0.3125000149011612, | |
| "reward_std": 0.11558076366782188, | |
| "rewards/accuracy_reward": 0.3125000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 142 | |
| }, | |
| { | |
| "completion_length": 908.3750534057617, | |
| "epoch": 0.9438943894389439, | |
| "grad_norm": 0.6757098436355591, | |
| "kl": 0.28759765625, | |
| "learning_rate": 2.5919029940380145e-08, | |
| "loss": -0.0196, | |
| "reward": 0.3333333507180214, | |
| "reward_std": 0.30354244261980057, | |
| "rewards/accuracy_reward": 0.3333333507180214, | |
| "rewards/format_reward": 0.0, | |
| "step": 143 | |
| }, | |
| { | |
| "completion_length": 788.1666793823242, | |
| "epoch": 0.9504950495049505, | |
| "grad_norm": 2.814443349838257, | |
| "kl": 0.331787109375, | |
| "learning_rate": 1.985770183113117e-08, | |
| "loss": 0.1527, | |
| "reward": 0.41666669212281704, | |
| "reward_std": 0.3680921457707882, | |
| "rewards/accuracy_reward": 0.41666669212281704, | |
| "rewards/format_reward": 0.0, | |
| "step": 144 | |
| }, | |
| { | |
| "completion_length": 876.6041717529297, | |
| "epoch": 0.9570957095709571, | |
| "grad_norm": 1.0456154346466064, | |
| "kl": 0.763427734375, | |
| "learning_rate": 1.4597896887644457e-08, | |
| "loss": 0.086, | |
| "reward": 0.29166667349636555, | |
| "reward_std": 0.16661180555820465, | |
| "rewards/accuracy_reward": 0.29166667349636555, | |
| "rewards/format_reward": 0.0, | |
| "step": 145 | |
| }, | |
| { | |
| "completion_length": 947.5208587646484, | |
| "epoch": 0.9636963696369637, | |
| "grad_norm": 0.5970892310142517, | |
| "kl": 0.9365234375, | |
| "learning_rate": 1.0142463387085465e-08, | |
| "loss": 0.0589, | |
| "reward": 0.1458333358168602, | |
| "reward_std": 0.13301505148410797, | |
| "rewards/accuracy_reward": 0.1458333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 146 | |
| }, | |
| { | |
| "completion_length": 971.2916870117188, | |
| "epoch": 0.9702970297029703, | |
| "grad_norm": 0.4104056656360626, | |
| "kl": 0.8409423828125, | |
| "learning_rate": 6.493814025293476e-09, | |
| "loss": 0.0431, | |
| "reward": 0.33333334140479565, | |
| "reward_std": 0.2686738818883896, | |
| "rewards/accuracy_reward": 0.33333334140479565, | |
| "rewards/format_reward": 0.0, | |
| "step": 147 | |
| }, | |
| { | |
| "completion_length": 869.8958587646484, | |
| "epoch": 0.976897689768977, | |
| "grad_norm": 3.5218846797943115, | |
| "kl": 0.5218505859375, | |
| "learning_rate": 3.6539246102637037e-09, | |
| "loss": 0.1713, | |
| "reward": 0.4791666828095913, | |
| "reward_std": 0.2996268570423126, | |
| "rewards/accuracy_reward": 0.4791666828095913, | |
| "rewards/format_reward": 0.0, | |
| "step": 148 | |
| }, | |
| { | |
| "completion_length": 775.1875152587891, | |
| "epoch": 0.9834983498349835, | |
| "grad_norm": 2.006347417831421, | |
| "kl": 0.35626220703125, | |
| "learning_rate": 1.624332992213151e-09, | |
| "loss": 0.1167, | |
| "reward": 0.5625000223517418, | |
| "reward_std": 0.28219255805015564, | |
| "rewards/accuracy_reward": 0.5625000223517418, | |
| "rewards/format_reward": 0.0, | |
| "step": 149 | |
| }, | |
| { | |
| "completion_length": 690.9166870117188, | |
| "epoch": 0.9900990099009901, | |
| "grad_norm": 3.488175630569458, | |
| "kl": 0.43896484375, | |
| "learning_rate": 4.0613823080742907e-10, | |
| "loss": 0.094, | |
| "reward": 0.6250000149011612, | |
| "reward_std": 0.3977733254432678, | |
| "rewards/accuracy_reward": 0.6250000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 961.7500305175781, | |
| "epoch": 0.9966996699669967, | |
| "grad_norm": 2.3887975215911865, | |
| "kl": 0.55078125, | |
| "learning_rate": 0.0, | |
| "loss": 0.1768, | |
| "reward": 0.3958333507180214, | |
| "reward_std": 0.41912319883704185, | |
| "rewards/accuracy_reward": 0.3958333507180214, | |
| "rewards/format_reward": 0.0, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.9966996699669967, | |
| "step": 151, | |
| "total_flos": 0.0, | |
| "train_loss": 0.09152872039099531, | |
| "train_runtime": 28966.5677, | |
| "train_samples_per_second": 0.042, | |
| "train_steps_per_second": 0.005 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 151, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 3, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |