{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7619047619047619, "eval_steps": 10, "global_step": 240, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 1826.446533203125, "epoch": 0.0031746031746031746, "grad_norm": 0.34752090657228324, "kl": 0.0, "learning_rate": 1e-07, "loss": -0.0327, "reward": 1.2598215341567993, "reward_std": 0.510444164276123, "rewards/": 6.299107551574707, "rewards/math_compute_score": 0.0, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 1842.607177734375, "epoch": 0.006349206349206349, "grad_norm": 0.37130067128404515, "kl": 0.0, "learning_rate": 2e-07, "loss": 0.0544, "reward": 1.4971821308135986, "reward_std": 0.7506331205368042, "rewards/": 5.914481163024902, "rewards/math_compute_score": 0.392857164144516, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 2011.33935546875, "epoch": 0.009523809523809525, "grad_norm": 0.3200176373062262, "kl": 0.0002689361572265625, "learning_rate": 3e-07, "loss": 0.0077, "reward": 0.7667689919471741, "reward_std": 0.6793785691261292, "rewards/": 5.548130989074707, "rewards/math_compute_score": -0.4285714626312256, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 1912.08935546875, "epoch": 0.012698412698412698, "grad_norm": 0.33904036931712517, "kl": 0.000278472900390625, "learning_rate": 4e-07, "loss": 0.0411, "reward": 1.14453125, "reward_std": 0.7682722806930542, "rewards/": 5.865513801574707, "rewards/math_compute_score": -0.0357142873108387, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 1943.5357666015625, "epoch": 0.015873015873015872, "grad_norm": 0.3316962176535279, "kl": 0.0002994537353515625, "learning_rate": 5e-07, "loss": 0.0439, "reward": 0.9941790103912354, "reward_std": 0.9224013090133667, "rewards/": 5.899466514587402, "rewards/math_compute_score": -0.2321428656578064, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 1823.5357666015625, "epoch": 0.01904761904761905, "grad_norm": 0.39521708946388423, "kl": 0.0003261566162109375, "learning_rate": 6e-07, "loss": 0.0678, "reward": 1.3013323545455933, "reward_std": 0.7257120013237, "rewards/": 6.363804817199707, "rewards/math_compute_score": 0.0357142873108387, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 1816.696533203125, "epoch": 0.022222222222222223, "grad_norm": 0.3977474793000202, "kl": 0.0002841949462890625, "learning_rate": 7e-07, "loss": 0.0524, "reward": 0.7803781032562256, "reward_std": 0.8279339075088501, "rewards/": 4.901890754699707, "rewards/math_compute_score": -0.25, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 1811.21435546875, "epoch": 0.025396825396825397, "grad_norm": 0.3883548683310925, "kl": 0.000301361083984375, "learning_rate": 8e-07, "loss": 0.0334, "reward": 1.5578125715255737, "reward_std": 0.5970480442047119, "rewards/": 6.503348350524902, "rewards/math_compute_score": 0.3214285969734192, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 1728.857177734375, "epoch": 0.02857142857142857, "grad_norm": 0.37089633220316565, "kl": 0.00034332275390625, "learning_rate": 9e-07, "loss": 0.087, "reward": 1.2353515625, "reward_std": 0.7409225702285767, "rewards/": 5.891043663024902, "rewards/math_compute_score": 0.0714285746216774, "step": 9 }, { "epoch": 0.031746031746031744, "grad_norm": 0.31106869753963556, "learning_rate": 1e-06, "loss": 0.0123, "step": 10 }, { "epoch": 0.031746031746031744, "eval_clip_ratio": 0.0, "eval_completion_length": 1893.8313802083333, "eval_kl": 0.0003102620442708333, "eval_loss": 0.026542577892541885, "eval_reward": 1.1524926622708638, "eval_reward_std": 0.7845939000447592, "eval_rewards/": 6.024367809295654, "eval_rewards/math_compute_score": -0.06547619650761287, "eval_runtime": 142.6603, "eval_samples_per_second": 0.147, "eval_steps_per_second": 0.007, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 1901.52685546875, "epoch": 0.03492063492063492, "grad_norm": 0.4187176065388408, "kl": 0.000331878662109375, "learning_rate": 9.99973476170006e-07, "loss": 0.052, "reward": 1.0824219584465027, "reward_std": 0.5579104721546173, "rewards/": 6.197824001312256, "rewards/math_compute_score": -0.1964285857975483, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 1792.232177734375, "epoch": 0.0380952380952381, "grad_norm": 0.3897812351982433, "kl": 0.000331878662109375, "learning_rate": 9.998939074940787e-07, "loss": 0.0559, "reward": 1.3557896614074707, "reward_std": 0.6514952778816223, "rewards/": 5.921805381774902, "rewards/math_compute_score": 0.2142857313156128, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 1953.571533203125, "epoch": 0.04126984126984127, "grad_norm": 0.3386069686056558, "kl": 0.000308990478515625, "learning_rate": 9.997613024140818e-07, "loss": 0.0464, "reward": 1.1771763563156128, "reward_std": 0.8344842195510864, "rewards/": 6.528738975524902, "rewards/math_compute_score": -0.1607142984867096, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 1966.3037109375, "epoch": 0.044444444444444446, "grad_norm": 0.3436190087829617, "kl": 0.0003261566162109375, "learning_rate": 9.995756749987941e-07, "loss": 0.0058, "reward": 1.0176271200180054, "reward_std": 0.6552860736846924, "rewards/": 6.373849391937256, "rewards/math_compute_score": -0.3214285969734192, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 1922.58935546875, "epoch": 0.047619047619047616, "grad_norm": 0.3312111219541393, "kl": 0.0002460479736328125, "learning_rate": 9.993370449424152e-07, "loss": 0.0395, "reward": 1.332235336303711, "reward_std": 0.7883400321006775, "rewards/": 6.089747905731201, "rewards/math_compute_score": 0.1428571492433548, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 1920.607177734375, "epoch": 0.050793650793650794, "grad_norm": 0.3660345032806764, "kl": 0.0003185272216796875, "learning_rate": 9.990454375624776e-07, "loss": 0.0429, "reward": 0.9837054014205933, "reward_std": 0.56076979637146, "rewards/": 6.489955425262451, "rewards/math_compute_score": -0.392857164144516, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 1781.5537109375, "epoch": 0.05396825396825397, "grad_norm": 0.3159968435422686, "kl": 0.00029754638671875, "learning_rate": 9.987008837971594e-07, "loss": 0.0258, "reward": 1.4613840579986572, "reward_std": 0.5660971403121948, "rewards/": 5.878348350524902, "rewards/math_compute_score": 0.3571428656578064, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 1722.08935546875, "epoch": 0.05714285714285714, "grad_norm": 0.3769499591892152, "kl": 0.0002956390380859375, "learning_rate": 9.98303420202003e-07, "loss": 0.0515, "reward": 1.5921318531036377, "reward_std": 0.657545268535614, "rewards/": 6.103516101837158, "rewards/math_compute_score": 0.4642857313156128, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 1859.8751220703125, "epoch": 0.06031746031746032, "grad_norm": 0.37701505516549555, "kl": 0.000335693359375, "learning_rate": 9.978530889460349e-07, "loss": 0.0532, "reward": 1.2727400064468384, "reward_std": 0.6805964708328247, "rewards/": 6.435128688812256, "rewards/math_compute_score": -0.01785714365541935, "step": 19 }, { "epoch": 0.06349206349206349, "grad_norm": 0.3560929502812583, "learning_rate": 9.973499378072946e-07, "loss": 0.0505, "step": 20 }, { "epoch": 0.06349206349206349, "eval_clip_ratio": 0.0, "eval_completion_length": 1902.3988444010417, "eval_kl": 0.0003344217936197917, "eval_loss": 0.03622707724571228, "eval_reward": 1.2230852444966633, "eval_reward_std": 0.7650324503580729, "eval_rewards/": 6.020188331604004, "eval_rewards/math_compute_score": 0.023809528599182766, "eval_runtime": 140.9236, "eval_samples_per_second": 0.149, "eval_steps_per_second": 0.007, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 1718.2589721679688, "epoch": 0.06666666666666667, "grad_norm": 0.3375758039734621, "kl": 0.000278472900390625, "learning_rate": 9.967940201677625e-07, "loss": 0.0309, "reward": 1.532582402229309, "reward_std": 0.5138258934020996, "rewards/": 6.23434042930603, "rewards/math_compute_score": 0.3571428805589676, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 1812.821533203125, "epoch": 0.06984126984126984, "grad_norm": 0.3990303343369039, "kl": 0.0004177093505859375, "learning_rate": 9.96185395007699e-07, "loss": 0.0801, "reward": 1.3509488105773926, "reward_std": 0.9690964818000793, "rewards/": 6.040457725524902, "rewards/math_compute_score": 0.1785714328289032, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 1906.821533203125, "epoch": 0.07301587301587302, "grad_norm": 0.3549321131443626, "kl": 0.00028228759765625, "learning_rate": 9.95524126899385e-07, "loss": -0.0105, "reward": 1.0774554014205933, "reward_std": 0.8010032176971436, "rewards/": 6.244420051574707, "rewards/math_compute_score": -0.2142857313156128, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 1859.821533203125, "epoch": 0.0761904761904762, "grad_norm": 0.400791590884524, "kl": 0.0003490447998046875, "learning_rate": 9.94810286000272e-07, "loss": 0.0269, "reward": 1.2509558200836182, "reward_std": 0.6348705291748047, "rewards/": 5.683350086212158, "rewards/math_compute_score": 0.1428571492433548, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 1910.6787109375, "epoch": 0.07936507936507936, "grad_norm": 0.32248521687421966, "kl": 0.0003147125244140625, "learning_rate": 9.940439480455385e-07, "loss": 0.0147, "reward": 1.06690514087677, "reward_std": 0.6229907870292664, "rewards/": 6.048810958862305, "rewards/math_compute_score": -0.1785714328289032, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 1673.5357666015625, "epoch": 0.08253968253968254, "grad_norm": 0.4098565397161361, "kl": 0.0003509521484375, "learning_rate": 9.932251943400553e-07, "loss": 0.0784, "reward": 1.8423550128936768, "reward_std": 0.5337907075881958, "rewards/": 6.497488975524902, "rewards/math_compute_score": 0.6785714626312256, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 1848.6429443359375, "epoch": 0.08571428571428572, "grad_norm": 0.3413878426395489, "kl": 0.000400543212890625, "learning_rate": 9.923541117497585e-07, "loss": -0.0184, "reward": 1.2410855293273926, "reward_std": 0.5728386044502258, "rewards/": 6.205427169799805, "rewards/math_compute_score": 0.0, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 1833.607177734375, "epoch": 0.08888888888888889, "grad_norm": 0.398653320294794, "kl": 0.0004863739013671875, "learning_rate": 9.914307926924344e-07, "loss": 0.0011, "reward": 1.2547712326049805, "reward_std": 0.5897310376167297, "rewards/": 6.416713237762451, "rewards/math_compute_score": -0.0357142873108387, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 1974.982177734375, "epoch": 0.09206349206349207, "grad_norm": 0.34060151455859505, "kl": 0.00042724609375, "learning_rate": 9.904553351279136e-07, "loss": 0.0252, "reward": 0.9052909016609192, "reward_std": 0.7842908501625061, "rewards/": 5.812169075012207, "rewards/math_compute_score": -0.3214285969734192, "step": 29 }, { "epoch": 0.09523809523809523, "grad_norm": 0.3555389006533955, "learning_rate": 9.894278425476788e-07, "loss": 0.0422, "step": 30 }, { "epoch": 0.09523809523809523, "eval_clip_ratio": 0.0, "eval_completion_length": 1847.3909505208333, "eval_kl": 0.0004094441731770833, "eval_loss": 0.027497123926877975, "eval_reward": 1.240986665089925, "eval_reward_std": 0.79307621717453, "eval_rewards/": 5.8715996742248535, "eval_rewards/math_compute_score": 0.0833333432674408, "eval_runtime": 139.8981, "eval_samples_per_second": 0.15, "eval_steps_per_second": 0.007, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 1868.71435546875, "epoch": 0.09841269841269841, "grad_norm": 0.3260634697668482, "kl": 0.0004024505615234375, "learning_rate": 9.88348423963884e-07, "loss": 0.0114, "reward": 1.2039064168930054, "reward_std": 0.64045649766922, "rewards/": 6.305245876312256, "rewards/math_compute_score": -0.0714285783469677, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 1790.857177734375, "epoch": 0.10158730158730159, "grad_norm": 0.3567249854105024, "kl": 0.00042724609375, "learning_rate": 9.872171938977893e-07, "loss": 0.0165, "reward": 1.3144984245300293, "reward_std": 0.8585119843482971, "rewards/": 5.858206748962402, "rewards/math_compute_score": 0.1785714328289032, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 1813.7857666015625, "epoch": 0.10476190476190476, "grad_norm": 0.380215941310899, "kl": 0.0005035400390625, "learning_rate": 9.860342723676104e-07, "loss": 0.0179, "reward": 1.340485692024231, "reward_std": 0.7554614543914795, "rewards/": 5.773856163024902, "rewards/math_compute_score": 0.2321428656578064, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 1642.2501220703125, "epoch": 0.10793650793650794, "grad_norm": 0.3669725090275098, "kl": 0.0003681182861328125, "learning_rate": 9.847997848757854e-07, "loss": 0.0739, "reward": 1.350502371788025, "reward_std": 0.4604892134666443, "rewards/": 5.681082725524902, "rewards/math_compute_score": 0.267857164144516, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 1810.3751220703125, "epoch": 0.1111111111111111, "grad_norm": 0.3898398578219401, "kl": 0.00051116943359375, "learning_rate": 9.835138623956602e-07, "loss": -0.0012, "reward": 1.1720424890518188, "reward_std": 0.6071317195892334, "rewards/": 5.717355251312256, "rewards/math_compute_score": 0.0357142873108387, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 1676.46435546875, "epoch": 0.11428571428571428, "grad_norm": 0.3415556235291945, "kl": 0.000461578369140625, "learning_rate": 9.821766413575914e-07, "loss": 0.0237, "reward": 1.5515068769454956, "reward_std": 0.41333118081092834, "rewards/": 6.043248176574707, "rewards/math_compute_score": 0.4285714626312256, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 1821.0357666015625, "epoch": 0.11746031746031746, "grad_norm": 0.3841969882035549, "kl": 0.00049591064453125, "learning_rate": 9.80788263634473e-07, "loss": 0.0204, "reward": 1.4578125476837158, "reward_std": 0.6289081573486328, "rewards/": 6.574777126312256, "rewards/math_compute_score": 0.1785714328289032, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 1789.196533203125, "epoch": 0.12063492063492064, "grad_norm": 0.4014410966440469, "kl": 0.000530242919921875, "learning_rate": 9.793488765266838e-07, "loss": 0.07, "reward": 1.5906460285186768, "reward_std": 0.8234072327613831, "rewards/": 6.0960869789123535, "rewards/math_compute_score": 0.4642857313156128, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 1829.0537109375, "epoch": 0.12380952380952381, "grad_norm": 0.34941213890761874, "kl": 0.00049591064453125, "learning_rate": 9.778586327464597e-07, "loss": 0.0278, "reward": 1.5846540927886963, "reward_std": 0.6426271200180054, "rewards/": 6.637556076049805, "rewards/math_compute_score": 0.3214285969734192, "step": 39 }, { "epoch": 0.12698412698412698, "grad_norm": 0.38140427899057316, "learning_rate": 9.763176904016913e-07, "loss": 0.0294, "step": 40 }, { "epoch": 0.12698412698412698, "eval_clip_ratio": 0.0, "eval_completion_length": 1791.4822184244792, "eval_kl": 0.0005544026692708334, "eval_loss": 0.05520148575305939, "eval_reward": 1.4818546374638875, "eval_reward_std": 0.7397708296775818, "eval_rewards/": 6.504511038462321, "eval_rewards/math_compute_score": 0.2261904776096344, "eval_runtime": 138.6839, "eval_samples_per_second": 0.151, "eval_steps_per_second": 0.007, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 1758.419677734375, "epoch": 0.13015873015873017, "grad_norm": 0.38719549506679857, "kl": 0.0005283355712890625, "learning_rate": 9.747262129791495e-07, "loss": 0.0051, "reward": 1.2441372275352478, "reward_std": 0.5995893478393555, "rewards/": 6.0064003467559814, "rewards/math_compute_score": 0.0535714253783226, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 1710.3035888671875, "epoch": 0.13333333333333333, "grad_norm": 0.3698050435119778, "kl": 0.000522613525390625, "learning_rate": 9.730843693271413e-07, "loss": 0.0453, "reward": 1.4925503730773926, "reward_std": 0.6253259181976318, "rewards/": 5.819894313812256, "rewards/math_compute_score": 0.4107142984867096, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 1744.3929443359375, "epoch": 0.1365079365079365, "grad_norm": 0.3735801890902873, "kl": 0.0006866455078125, "learning_rate": 9.713923336375936e-07, "loss": 0.0102, "reward": 1.4331055879592896, "reward_std": 0.829824686050415, "rewards/": 5.951241970062256, "rewards/math_compute_score": 0.3035714328289032, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 1797.232177734375, "epoch": 0.13968253968253969, "grad_norm": 0.35251183170280326, "kl": 0.00058746337890625, "learning_rate": 9.696502854275748e-07, "loss": 0.0273, "reward": 1.3867467641830444, "reward_std": 0.6368395686149597, "rewards/": 6.219447612762451, "rewards/math_compute_score": 0.1785714328289032, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 1759.08935546875, "epoch": 0.14285714285714285, "grad_norm": 0.36571909135002856, "kl": 0.000553131103515625, "learning_rate": 9.678584095202469e-07, "loss": 0.0341, "reward": 0.931584894657135, "reward_std": 0.5959246754646301, "rewards/": 5.729352951049805, "rewards/math_compute_score": -0.267857164144516, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 1736.357177734375, "epoch": 0.14603174603174604, "grad_norm": 0.33831757661050815, "kl": 0.000553131103515625, "learning_rate": 9.660168960252575e-07, "loss": 0.0437, "reward": 1.6665178537368774, "reward_std": 0.4598635137081146, "rewards/": 6.046875476837158, "rewards/math_compute_score": 0.5714285969734192, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 1807.8037109375, "epoch": 0.1492063492063492, "grad_norm": 0.36813538905077847, "kl": 0.000637054443359375, "learning_rate": 9.641259403185704e-07, "loss": 0.031, "reward": 0.9589914083480835, "reward_std": 0.5812153816223145, "rewards/": 5.080671310424805, "rewards/math_compute_score": -0.0714285746216774, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 1591.982177734375, "epoch": 0.1523809523809524, "grad_norm": 0.3231025820418364, "kl": 0.000698089599609375, "learning_rate": 9.621857430217365e-07, "loss": 0.0424, "reward": 1.8917970657348633, "reward_std": 0.3157893121242523, "rewards/": 6.744699001312256, "rewards/math_compute_score": 0.6785714626312256, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 1881.982177734375, "epoch": 0.15555555555555556, "grad_norm": 0.3466507877034998, "kl": 0.0007781982421875, "learning_rate": 9.601965099806084e-07, "loss": 0.0405, "reward": 1.4976422786712646, "reward_std": 0.7847145199775696, "rewards/": 6.416783332824707, "rewards/math_compute_score": 0.267857164144516, "step": 49 }, { "epoch": 0.15873015873015872, "grad_norm": 0.3331388102754494, "learning_rate": 9.581584522435023e-07, "loss": 0.0388, "step": 50 }, { "epoch": 0.15873015873015872, "eval_clip_ratio": 0.0, "eval_completion_length": 1816.2877604166667, "eval_kl": 0.0007731119791666666, "eval_loss": 0.04274662211537361, "eval_reward": 1.551771879196167, "eval_reward_std": 0.641852875550588, "eval_rewards/": 6.258859157562256, "eval_rewards/math_compute_score": 0.3750000149011612, "eval_runtime": 138.9269, "eval_samples_per_second": 0.151, "eval_steps_per_second": 0.007, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 1660.357177734375, "epoch": 0.1619047619047619, "grad_norm": 0.41262546854699034, "kl": 0.0008182525634765625, "learning_rate": 9.56071786038806e-07, "loss": 0.0192, "reward": 1.6869142055511475, "reward_std": 0.39825020730495453, "rewards/": 6.577427625656128, "rewards/math_compute_score": 0.4642857313156128, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 1935.571533203125, "epoch": 0.16507936507936508, "grad_norm": 0.34500490912598375, "kl": 0.000762939453125, "learning_rate": 9.53936732752038e-07, "loss": 0.0028, "reward": 1.2544364929199219, "reward_std": 0.6236512660980225, "rewards/": 6.557896614074707, "rewards/math_compute_score": -0.0714285746216774, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 1827.482177734375, "epoch": 0.16825396825396827, "grad_norm": 0.34348526502845717, "kl": 0.000732421875, "learning_rate": 9.517535189023601e-07, "loss": -0.0054, "reward": 1.2513673305511475, "reward_std": 0.8205690979957581, "rewards/": 5.8282647132873535, "rewards/math_compute_score": 0.1071428656578064, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 1753.4285888671875, "epoch": 0.17142857142857143, "grad_norm": 0.36933753325659874, "kl": 0.00099945068359375, "learning_rate": 9.495223761185441e-07, "loss": 0.0034, "reward": 1.4741246700286865, "reward_std": 0.6187431216239929, "rewards/": 6.513480186462402, "rewards/math_compute_score": 0.2142857313156128, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 1771.8751220703125, "epoch": 0.1746031746031746, "grad_norm": 0.34576119098823954, "kl": 0.0009918212890625, "learning_rate": 9.472435411143977e-07, "loss": 0.0191, "reward": 1.5563616752624512, "reward_std": 0.6519899368286133, "rewards/": 6.781808376312256, "rewards/math_compute_score": 0.25, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 1711.1785888671875, "epoch": 0.17777777777777778, "grad_norm": 0.3525345385926051, "kl": 0.000843048095703125, "learning_rate": 9.449172556636497e-07, "loss": -0.0206, "reward": 1.422028660774231, "reward_std": 0.8101202249526978, "rewards/": 6.110142707824707, "rewards/math_compute_score": 0.25, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 1774.6251220703125, "epoch": 0.18095238095238095, "grad_norm": 0.40494567349730853, "kl": 0.000972747802734375, "learning_rate": 9.425437665742997e-07, "loss": 0.0519, "reward": 1.3338658809661865, "reward_std": 0.7232537865638733, "rewards/": 6.240757942199707, "rewards/math_compute_score": 0.1071428656578064, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 1828.071533203125, "epoch": 0.18412698412698414, "grad_norm": 0.33450583156617786, "kl": 0.000881195068359375, "learning_rate": 9.401233256624316e-07, "loss": 0.0249, "reward": 1.4562500715255737, "reward_std": 0.6799939274787903, "rewards/": 6.566964626312256, "rewards/math_compute_score": 0.1785714328289032, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 1660.696533203125, "epoch": 0.1873015873015873, "grad_norm": 0.4070810399603553, "kl": 0.0011444091796875, "learning_rate": 9.376561897254987e-07, "loss": -0.0269, "reward": 1.5412318706512451, "reward_std": 0.5850991606712341, "rewards/": 6.277588367462158, "rewards/math_compute_score": 0.3571428656578064, "step": 59 }, { "epoch": 0.19047619047619047, "grad_norm": 0.37255871829826, "learning_rate": 9.351426205150776e-07, "loss": 0.038, "step": 60 }, { "epoch": 0.19047619047619047, "eval_clip_ratio": 0.0, "eval_completion_length": 1717.0694986979167, "eval_kl": 0.0009918212890625, "eval_loss": 0.03207956254482269, "eval_reward": 1.5469355980555217, "eval_reward_std": 0.6412561237812042, "eval_rewards/": 6.091820240020752, "eval_rewards/math_compute_score": 0.4107142984867096, "eval_runtime": 136.3487, "eval_samples_per_second": 0.154, "eval_steps_per_second": 0.007, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 1612.7322387695312, "epoch": 0.19365079365079366, "grad_norm": 0.3484954653008926, "kl": 0.001018524169921875, "learning_rate": 9.32582884709098e-07, "loss": 0.0535, "reward": 1.569977879524231, "reward_std": 0.5602003335952759, "rewards/": 6.2784600257873535, "rewards/math_compute_score": 0.39285717345774174, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 1474.1429443359375, "epoch": 0.19682539682539682, "grad_norm": 0.456250830578578, "kl": 0.0013275146484375, "learning_rate": 9.299772538835491e-07, "loss": -0.0324, "reward": 1.570549726486206, "reward_std": 0.3879093527793884, "rewards/": 6.138463020324707, "rewards/math_compute_score": 0.4285714626312256, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 1645.482177734375, "epoch": 0.2, "grad_norm": 0.3717148388714456, "kl": 0.00115203857421875, "learning_rate": 9.273260044836673e-07, "loss": 0.0842, "reward": 1.741573691368103, "reward_std": 0.49386849999427795, "rewards/": 6.779297351837158, "rewards/math_compute_score": 0.4821428656578064, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 1534.857177734375, "epoch": 0.20317460317460317, "grad_norm": 0.3745652306218341, "kl": 0.00112152099609375, "learning_rate": 9.246294177946062e-07, "loss": 0.0523, "reward": 1.82996666431427, "reward_std": 0.42854541540145874, "rewards/": 6.721261501312256, "rewards/math_compute_score": 0.6071428656578064, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 1709.9285888671875, "epoch": 0.20634920634920634, "grad_norm": 0.4185811440041477, "kl": 0.00122833251953125, "learning_rate": 9.218877799115927e-07, "loss": 0.0595, "reward": 1.7054688930511475, "reward_std": 0.5991591215133667, "rewards/": 6.813058376312256, "rewards/math_compute_score": 0.4285714626312256, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 1851.0357666015625, "epoch": 0.20952380952380953, "grad_norm": 0.34158790586957477, "kl": 0.0010986328125, "learning_rate": 9.191013817095761e-07, "loss": 0.0013, "reward": 1.466183066368103, "reward_std": 0.5523228049278259, "rewards/": 7.045201301574707, "rewards/math_compute_score": 0.0714285746216774, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 1783.4287109375, "epoch": 0.2126984126984127, "grad_norm": 0.40084205903248576, "kl": 0.0013427734375, "learning_rate": 9.162705188123646e-07, "loss": 0.0218, "reward": 1.456040859222412, "reward_std": 0.5050444602966309, "rewards/": 6.994489669799805, "rewards/math_compute_score": 0.0714285746216774, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 1766.2501220703125, "epoch": 0.21587301587301588, "grad_norm": 0.38102579204059783, "kl": 0.0012969970703125, "learning_rate": 9.133954915612634e-07, "loss": 0.087, "reward": 1.6629464626312256, "reward_std": 0.7155088782310486, "rewards/": 6.957589626312256, "rewards/math_compute_score": 0.3392857313156128, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 1658.96435546875, "epoch": 0.21904761904761905, "grad_norm": 0.36394590935604426, "kl": 0.00115966796875, "learning_rate": 9.104766049832087e-07, "loss": 0.0179, "reward": 1.6870676279067993, "reward_std": 0.4471067190170288, "rewards/": 6.506766319274902, "rewards/math_compute_score": 0.4821428656578064, "step": 69 }, { "epoch": 0.2222222222222222, "grad_norm": 0.3374654129906019, "learning_rate": 9.075141687584056e-07, "loss": 0.0178, "step": 70 }, { "epoch": 0.2222222222222222, "eval_clip_ratio": 0.0, "eval_completion_length": 1738.0059814453125, "eval_kl": 0.00133514404296875, "eval_loss": 0.05646166205406189, "eval_reward": 1.5958195527394612, "eval_reward_std": 0.6562197208404541, "eval_rewards/": 6.621954282124837, "eval_rewards/math_compute_score": 0.3392857213815053, "eval_runtime": 136.062, "eval_samples_per_second": 0.154, "eval_steps_per_second": 0.007, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 1752.3304443359375, "epoch": 0.2253968253968254, "grad_norm": 0.38489393931988863, "kl": 0.001201629638671875, "learning_rate": 9.045084971874737e-07, "loss": -0.0611, "reward": 1.3448200225830078, "reward_std": 0.5091241598129272, "rewards/": 6.402670860290527, "rewards/math_compute_score": 0.0803571492433548, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 1880.2501220703125, "epoch": 0.22857142857142856, "grad_norm": 0.39120717997716, "kl": 0.00131988525390625, "learning_rate": 9.014599091580998e-07, "loss": 0.048, "reward": 1.76941978931427, "reward_std": 0.5547811388969421, "rewards/": 7.2042412757873535, "rewards/math_compute_score": 0.4107142984867096, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 1588.08935546875, "epoch": 0.23174603174603176, "grad_norm": 0.3304077909442911, "kl": 0.00144195556640625, "learning_rate": 8.983687281112064e-07, "loss": 0.0299, "reward": 2.067634105682373, "reward_std": 0.4065239131450653, "rewards/": 7.195312976837158, "rewards/math_compute_score": 0.785714328289032, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 1834.7857666015625, "epoch": 0.23492063492063492, "grad_norm": 0.31261547252144517, "kl": 0.001434326171875, "learning_rate": 8.952352820066358e-07, "loss": 0.0338, "reward": 1.8736608028411865, "reward_std": 0.4773353934288025, "rewards/": 6.939732551574707, "rewards/math_compute_score": 0.6071428656578064, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 1668.071533203125, "epoch": 0.23809523809523808, "grad_norm": 0.37467501438016265, "kl": 0.0016326904296875, "learning_rate": 8.920599032883552e-07, "loss": -0.0187, "reward": 1.3380582332611084, "reward_std": 0.505685031414032, "rewards/": 5.690290451049805, "rewards/math_compute_score": 0.25, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 1687.107177734375, "epoch": 0.24126984126984127, "grad_norm": 0.3823169614619227, "kl": 0.00157928466796875, "learning_rate": 8.888429288491855e-07, "loss": 0.0274, "reward": 1.6549667119979858, "reward_std": 0.3212043046951294, "rewards/": 6.560547351837158, "rewards/math_compute_score": 0.4285714626312256, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 1807.1787109375, "epoch": 0.24444444444444444, "grad_norm": 0.35610484187919744, "kl": 0.0015869140625, "learning_rate": 8.855846999950595e-07, "loss": 0.0261, "reward": 1.2699779272079468, "reward_std": 0.5984498858451843, "rewards/": 6.492745876312256, "rewards/math_compute_score": -0.0357142873108387, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 1851.9107666015625, "epoch": 0.24761904761904763, "grad_norm": 0.2883614180110202, "kl": 0.00138092041015625, "learning_rate": 8.822855624088097e-07, "loss": -0.0185, "reward": 1.5325753688812256, "reward_std": 0.6921989321708679, "rewards/": 6.734305381774902, "rewards/math_compute_score": 0.2321428656578064, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 1517.8751220703125, "epoch": 0.2507936507936508, "grad_norm": 0.3716530172552612, "kl": 0.00165557861328125, "learning_rate": 8.789458661134942e-07, "loss": 0.049, "reward": 1.7504465579986572, "reward_std": 0.39499327540397644, "rewards/": 6.466517925262451, "rewards/math_compute_score": 0.5714285969734192, "step": 79 }, { "epoch": 0.25396825396825395, "grad_norm": 0.3590906991873432, "learning_rate": 8.755659654352599e-07, "loss": 0.0028, "step": 80 }, { "epoch": 0.25396825396825395, "eval_clip_ratio": 0.0, "eval_completion_length": 1681.073486328125, "eval_kl": 0.017834981282552082, "eval_loss": 0.015299047343432903, "eval_reward": 1.720870574315389, "eval_reward_std": 0.5228437781333923, "eval_rewards/": 6.556733926137288, "eval_rewards/math_compute_score": 0.5119047897557417, "eval_runtime": 135.3946, "eval_samples_per_second": 0.155, "eval_steps_per_second": 0.007, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 1618.1160888671875, "epoch": 0.2571428571428571, "grad_norm": 0.3174887107646142, "kl": 0.00168609619140625, "learning_rate": 8.721462189657509e-07, "loss": 0.0154, "reward": 1.8046876192092896, "reward_std": 0.45894815027713776, "rewards/": 6.273437738418579, "rewards/math_compute_score": 0.6875000298023224, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 1785.607177734375, "epoch": 0.26031746031746034, "grad_norm": 0.34079321817593966, "kl": 0.0016326904296875, "learning_rate": 8.686869895240631e-07, "loss": -0.01, "reward": 1.5418108701705933, "reward_std": 0.6146999597549438, "rewards/": 6.923340320587158, "rewards/math_compute_score": 0.196428582072258, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 1692.9285888671875, "epoch": 0.2634920634920635, "grad_norm": 0.3374435642100782, "kl": 0.00174713134765625, "learning_rate": 8.651886441182508e-07, "loss": 0.027, "reward": 1.9006696939468384, "reward_std": 0.45941323041915894, "rewards/": 7.217634201049805, "rewards/math_compute_score": 0.5714285969734192, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 1753.946533203125, "epoch": 0.26666666666666666, "grad_norm": 0.31783244878975037, "kl": 0.001495361328125, "learning_rate": 8.616515539063894e-07, "loss": 0.003, "reward": 1.582680106163025, "reward_std": 0.7250080704689026, "rewards/": 6.556257247924805, "rewards/math_compute_score": 0.3392857313156128, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 1419.107177734375, "epoch": 0.2698412698412698, "grad_norm": 0.42796739682838303, "kl": 0.0025634765625, "learning_rate": 8.580760941571966e-07, "loss": 0.0542, "reward": 1.8185827732086182, "reward_std": 0.24573805928230286, "rewards/": 6.521484851837158, "rewards/math_compute_score": 0.6428571939468384, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 1636.6607666015625, "epoch": 0.273015873015873, "grad_norm": 0.42716916595705023, "kl": 0.0019989013671875, "learning_rate": 8.544626442102187e-07, "loss": 0.0444, "reward": 1.9161133766174316, "reward_std": 0.5437954664230347, "rewards/": 7.009138107299805, "rewards/math_compute_score": 0.6428571939468384, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 1772.196533203125, "epoch": 0.2761904761904762, "grad_norm": 0.36873824811015804, "kl": 0.00189208984375, "learning_rate": 8.508115874355839e-07, "loss": 0.059, "reward": 1.403194785118103, "reward_std": 0.5319852828979492, "rewards/": 6.873116970062256, "rewards/math_compute_score": 0.0357142873108387, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 1486.107177734375, "epoch": 0.27936507936507937, "grad_norm": 0.4198714949042119, "kl": 0.00194549560546875, "learning_rate": 8.47123311193329e-07, "loss": 0.0626, "reward": 1.9519531726837158, "reward_std": 0.4156142473220825, "rewards/": 7.3311944007873535, "rewards/math_compute_score": 0.6071428656578064, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 1490.982177734375, "epoch": 0.28253968253968254, "grad_norm": 0.3741322266459614, "kl": 0.0023956298828125, "learning_rate": 8.433982067923021e-07, "loss": 0.0168, "reward": 1.99573814868927, "reward_std": 0.4969152510166168, "rewards/": 6.978690147399902, "rewards/math_compute_score": 0.7500000596046448, "step": 89 }, { "epoch": 0.2857142857142857, "grad_norm": 0.32032292095086534, "learning_rate": 8.396366694486466e-07, "loss": 0.0544, "step": 90 }, { "epoch": 0.2857142857142857, "eval_clip_ratio": 0.0, "eval_completion_length": 1615.83935546875, "eval_kl": 0.002166748046875, "eval_loss": 0.04793115332722664, "eval_reward": 1.9063432614008586, "eval_reward_std": 0.47585757573445636, "eval_rewards/": 6.746001084645589, "eval_rewards/math_compute_score": 0.6964285969734192, "eval_runtime": 133.7778, "eval_samples_per_second": 0.157, "eval_steps_per_second": 0.007, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 1566.7858276367188, "epoch": 0.28888888888888886, "grad_norm": 0.3482667132376765, "kl": 0.00211334228515625, "learning_rate": 8.358390982438705e-07, "loss": -0.0106, "reward": 1.6325893998146057, "reward_std": 0.42033930122852325, "rewards/": 6.4486610889434814, "rewards/math_compute_score": 0.4285714477300644, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 1555.5535888671875, "epoch": 0.2920634920634921, "grad_norm": 0.3459245295737851, "kl": 0.00384521484375, "learning_rate": 8.320058960825058e-07, "loss": 0.0209, "reward": 1.5438895225524902, "reward_std": 0.4550023376941681, "rewards/": 6.4337334632873535, "rewards/math_compute_score": 0.3214285969734192, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 1714.1607666015625, "epoch": 0.29523809523809524, "grad_norm": 0.42497165251755675, "kl": 0.0028839111328125, "learning_rate": 8.281374696493626e-07, "loss": 0.0165, "reward": 1.7839986085891724, "reward_std": 0.6411929130554199, "rewards/": 7.062849044799805, "rewards/math_compute_score": 0.4642857313156128, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 1584.946533203125, "epoch": 0.2984126984126984, "grad_norm": 0.39574012867238795, "kl": 0.0023956298828125, "learning_rate": 8.242342293663809e-07, "loss": 0.0325, "reward": 1.5983260869979858, "reward_std": 0.4390745162963867, "rewards/": 6.991629600524902, "rewards/math_compute_score": 0.25, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 1597.482177734375, "epoch": 0.30158730158730157, "grad_norm": 0.3834564701675289, "kl": 0.0025634765625, "learning_rate": 8.202965893490876e-07, "loss": -0.0186, "reward": 1.340318202972412, "reward_std": 0.5021482110023499, "rewards/": 5.915876388549805, "rewards/math_compute_score": 0.196428582072258, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 1800.607177734375, "epoch": 0.3047619047619048, "grad_norm": 0.39206140103553405, "kl": 0.0027923583984375, "learning_rate": 8.163249673626602e-07, "loss": 0.0298, "reward": 1.2919502258300781, "reward_std": 0.6428667902946472, "rewards/": 6.031180381774902, "rewards/math_compute_score": 0.1071428656578064, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 1651.8035888671875, "epoch": 0.30793650793650795, "grad_norm": 0.3826880063008121, "kl": 0.00238037109375, "learning_rate": 8.123197847776042e-07, "loss": 0.0461, "reward": 1.705224633216858, "reward_std": 0.5447785258293152, "rewards/": 6.668980598449707, "rewards/math_compute_score": 0.4642857313156128, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 1901.321533203125, "epoch": 0.3111111111111111, "grad_norm": 0.3643029236288363, "kl": 0.0025177001953125, "learning_rate": 8.082814665250476e-07, "loss": 0.0243, "reward": 1.3659180402755737, "reward_std": 0.7389653921127319, "rewards/": 6.543875694274902, "rewards/math_compute_score": 0.0714285746216774, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 1414.696533203125, "epoch": 0.3142857142857143, "grad_norm": 0.42740784739438414, "kl": 0.0030975341796875, "learning_rate": 8.042104410516575e-07, "loss": 0.0245, "reward": 1.5442662239074707, "reward_std": 0.5408600568771362, "rewards/": 6.292759895324707, "rewards/math_compute_score": 0.3571428656578064, "step": 99 }, { "epoch": 0.31746031746031744, "grad_norm": 0.3367757080515649, "learning_rate": 8.001071402741842e-07, "loss": 0.007, "step": 100 }, { "epoch": 0.31746031746031744, "eval_clip_ratio": 0.0, "eval_completion_length": 1632.1806233723958, "eval_kl": 0.0024922688802083335, "eval_loss": 0.026421261951327324, "eval_reward": 1.9330463409423828, "eval_reward_std": 0.39154160519440967, "eval_rewards/": 7.0461835861206055, "eval_rewards/math_compute_score": 0.6547619154055914, "eval_runtime": 134.9073, "eval_samples_per_second": 0.156, "eval_steps_per_second": 0.007, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 1608.5447387695312, "epoch": 0.32063492063492066, "grad_norm": 0.35681279315589565, "kl": 0.00269317626953125, "learning_rate": 7.959719995336363e-07, "loss": 0.029, "reward": 1.7852399349212646, "reward_std": 0.40309859812259674, "rewards/": 6.890485763549805, "rewards/math_compute_score": 0.508928582072258, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 1488.3751220703125, "epoch": 0.3238095238095238, "grad_norm": 0.44950984490187945, "kl": 0.0035400390625, "learning_rate": 7.918054575490943e-07, "loss": 0.0435, "reward": 1.8601562976837158, "reward_std": 0.5413349270820618, "rewards/": 6.8722100257873535, "rewards/math_compute_score": 0.6071428656578064, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 1592.2501220703125, "epoch": 0.326984126984127, "grad_norm": 0.3485861719916155, "kl": 0.002471923828125, "learning_rate": 7.876079563711631e-07, "loss": 0.0519, "reward": 1.5621094703674316, "reward_std": 0.5689576864242554, "rewards/": 6.524832725524902, "rewards/math_compute_score": 0.3214285969734192, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 1584.821533203125, "epoch": 0.33015873015873015, "grad_norm": 0.42767837474542764, "kl": 0.00335693359375, "learning_rate": 7.83379941335073e-07, "loss": 0.093, "reward": 1.7258999347686768, "reward_std": 0.7036234140396118, "rewards/": 6.486642360687256, "rewards/math_compute_score": 0.535714328289032, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 1589.08935546875, "epoch": 0.3333333333333333, "grad_norm": 0.32351467813897694, "kl": 0.002105712890625, "learning_rate": 7.791218610134322e-07, "loss": 0.0241, "reward": 1.7169644832611084, "reward_std": 0.38069403171539307, "rewards/": 6.656250476837158, "rewards/math_compute_score": 0.4821428656578064, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 1518.446533203125, "epoch": 0.33650793650793653, "grad_norm": 0.39674608646311715, "kl": 0.00274658203125, "learning_rate": 7.748341671686354e-07, "loss": 0.0463, "reward": 1.785309910774231, "reward_std": 0.5290029048919678, "rewards/": 6.8551201820373535, "rewards/math_compute_score": 0.5178571939468384, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 1639.982177734375, "epoch": 0.3396825396825397, "grad_norm": 0.34724586686391384, "kl": 0.0028839111328125, "learning_rate": 7.705173147049325e-07, "loss": 0.0139, "reward": 1.8501187562942505, "reward_std": 0.4504697918891907, "rewards/": 6.964879035949707, "rewards/math_compute_score": 0.5714285969734192, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 1484.1785888671875, "epoch": 0.34285714285714286, "grad_norm": 0.3499020427456334, "kl": 0.0025787353515625, "learning_rate": 7.661717616201668e-07, "loss": -0.0305, "reward": 1.6777344942092896, "reward_std": 0.3427577316761017, "rewards/": 5.817243576049805, "rewards/math_compute_score": 0.6428571939468384, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 1837.96435546875, "epoch": 0.346031746031746, "grad_norm": 0.4037551751682681, "kl": 0.0028228759765625, "learning_rate": 7.617979689571839e-07, "loss": 0.0442, "reward": 1.3643137216567993, "reward_std": 0.6223567724227905, "rewards/": 6.535853862762451, "rewards/math_compute_score": 0.0714285746216774, "step": 109 }, { "epoch": 0.3492063492063492, "grad_norm": 0.4514262612365042, "learning_rate": 7.573964007549154e-07, "loss": -0.0428, "step": 110 }, { "epoch": 0.3492063492063492, "eval_clip_ratio": 0.0, "eval_completion_length": 1610.9127604166667, "eval_kl": 0.0030568440755208335, "eval_loss": 0.059969693422317505, "eval_reward": 1.8927596807479858, "eval_reward_std": 0.5094525118668874, "eval_rewards/": 6.86855951944987, "eval_rewards/math_compute_score": 0.6488095621267954, "eval_runtime": 139.4711, "eval_samples_per_second": 0.151, "eval_steps_per_second": 0.007, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 1355.6964721679688, "epoch": 0.3523809523809524, "grad_norm": 0.416064136246473, "kl": 0.003692626953125, "learning_rate": 7.529675239991482e-07, "loss": 0.007, "reward": 2.1203389167785645, "reward_std": 0.34416940808296204, "rewards/": 7.280264854431152, "rewards/math_compute_score": 0.830357164144516, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 1646.607177734375, "epoch": 0.35555555555555557, "grad_norm": 0.3080164040844727, "kl": 0.00250244140625, "learning_rate": 7.485118085729789e-07, "loss": 0.013, "reward": 1.5503767728805542, "reward_std": 0.4306219816207886, "rewards/": 6.180455207824707, "rewards/math_compute_score": 0.392857164144516, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 1678.1785888671875, "epoch": 0.35873015873015873, "grad_norm": 0.3694201814855355, "kl": 0.00311279296875, "learning_rate": 7.440297272069614e-07, "loss": 0.0761, "reward": 2.034709930419922, "reward_std": 0.43929895758628845, "rewards/": 7.530692100524902, "rewards/math_compute_score": 0.660714328289032, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 1546.7679443359375, "epoch": 0.3619047619047619, "grad_norm": 0.3908115750027717, "kl": 0.0034942626953125, "learning_rate": 7.395217554289523e-07, "loss": -0.011, "reward": 1.7967495918273926, "reward_std": 0.22745420038700104, "rewards/": 6.983747482299805, "rewards/math_compute_score": 0.5, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 1633.9107666015625, "epoch": 0.36507936507936506, "grad_norm": 0.3199396618433185, "kl": 0.0025634765625, "learning_rate": 7.3498837151366e-07, "loss": 0.0194, "reward": 1.8163504600524902, "reward_std": 0.5509271621704102, "rewards/": 7.081752777099609, "rewards/math_compute_score": 0.5, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 1633.83935546875, "epoch": 0.3682539682539683, "grad_norm": 0.3989797872601438, "kl": 0.002685546875, "learning_rate": 7.304300564319013e-07, "loss": 0.0215, "reward": 1.52039635181427, "reward_std": 0.4950607120990753, "rewards/": 6.744838237762451, "rewards/math_compute_score": 0.2142857313156128, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 1765.0537109375, "epoch": 0.37142857142857144, "grad_norm": 0.33315626188294234, "kl": 0.003143310546875, "learning_rate": 7.258472937995735e-07, "loss": -0.0229, "reward": 1.795814871788025, "reward_std": 0.49838095903396606, "rewards/": 7.193359851837158, "rewards/math_compute_score": 0.4464285969734192, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 1562.5535888671875, "epoch": 0.3746031746031746, "grad_norm": 0.3286142283747825, "kl": 0.0029754638671875, "learning_rate": 7.212405698263446e-07, "loss": 0.0002, "reward": 2.0463171005249023, "reward_std": 0.4698004722595215, "rewards/": 7.2315850257873535, "rewards/math_compute_score": 0.7500000596046448, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 1634.5535888671875, "epoch": 0.37777777777777777, "grad_norm": 0.3834857277163386, "kl": 0.003143310546875, "learning_rate": 7.166103732640681e-07, "loss": 0.0034, "reward": 1.5640347003936768, "reward_std": 0.48661187291145325, "rewards/": 6.820173263549805, "rewards/math_compute_score": 0.25, "step": 119 }, { "epoch": 0.38095238095238093, "grad_norm": 0.3930354077134069, "learning_rate": 7.119571953549304e-07, "loss": 0.0164, "step": 120 }, { "epoch": 0.38095238095238093, "eval_clip_ratio": 0.0, "eval_completion_length": 1561.2083740234375, "eval_kl": 0.0033162434895833335, "eval_loss": 0.034597255289554596, "eval_reward": 1.9965635935465496, "eval_reward_std": 0.49989163875579834, "eval_rewards/": 6.982817490895589, "eval_rewards/math_compute_score": 0.7500000397364298, "eval_runtime": 131.6643, "eval_samples_per_second": 0.159, "eval_steps_per_second": 0.008, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 1541.071533203125, "epoch": 0.38412698412698415, "grad_norm": 0.3736113886490285, "kl": 0.00330352783203125, "learning_rate": 7.072815297793302e-07, "loss": -0.0227, "reward": 1.750962734222412, "reward_std": 0.44300225377082825, "rewards/": 6.719099044799805, "rewards/math_compute_score": 0.5089285969734192, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 1791.482177734375, "epoch": 0.3873015873015873, "grad_norm": 0.36023685839775266, "kl": 0.00299072265625, "learning_rate": 7.025838726035031e-07, "loss": 0.0158, "reward": 1.2251116037368774, "reward_std": 0.5226312875747681, "rewards/": 6.4112725257873535, "rewards/math_compute_score": -0.0714285746216774, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 1458.21435546875, "epoch": 0.3904761904761905, "grad_norm": 0.4040624897084209, "kl": 0.0032501220703125, "learning_rate": 6.978647222268903e-07, "loss": 0.0145, "reward": 1.9194753170013428, "reward_std": 0.5019935965538025, "rewards/": 7.025949001312256, "rewards/math_compute_score": 0.6428571939468384, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 1518.232177734375, "epoch": 0.39365079365079364, "grad_norm": 0.3500940260968403, "kl": 0.0032958984375, "learning_rate": 6.93124579329261e-07, "loss": 0.0429, "reward": 1.8887277841567993, "reward_std": 0.3185231387615204, "rewards/": 7.157924652099609, "rewards/math_compute_score": 0.5714285969734192, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 1352.5179443359375, "epoch": 0.3968253968253968, "grad_norm": 0.35908221717719274, "kl": 0.0038909912109375, "learning_rate": 6.883639468175925e-07, "loss": 0.0207, "reward": 2.0699777603149414, "reward_std": 0.2467387318611145, "rewards/": 6.992745876312256, "rewards/math_compute_score": 0.8392857313156128, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 1712.21435546875, "epoch": 0.4, "grad_norm": 0.3624308105870553, "kl": 0.0030364990234375, "learning_rate": 6.835833297727147e-07, "loss": 0.0443, "reward": 1.5869420766830444, "reward_std": 0.39860856533050537, "rewards/": 6.506138801574707, "rewards/math_compute_score": 0.3571428656578064, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 1478.2857666015625, "epoch": 0.4031746031746032, "grad_norm": 0.41421111807827343, "kl": 0.0032958984375, "learning_rate": 6.787832353957224e-07, "loss": 0.0465, "reward": 2.0360493659973145, "reward_std": 0.5292332172393799, "rewards/": 7.180245876312256, "rewards/math_compute_score": 0.7500000596046448, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 1562.3751220703125, "epoch": 0.40634920634920635, "grad_norm": 0.30993254735116554, "kl": 0.003448486328125, "learning_rate": 6.739641729541644e-07, "loss": 0.0384, "reward": 1.7399276494979858, "reward_std": 0.3811955749988556, "rewards/": 6.699637413024902, "rewards/math_compute_score": 0.5, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 1551.0535888671875, "epoch": 0.4095238095238095, "grad_norm": 0.3771726274865134, "kl": 0.00347900390625, "learning_rate": 6.691266537280127e-07, "loss": -0.0003, "reward": 1.817131757736206, "reward_std": 0.511991024017334, "rewards/": 6.7999444007873535, "rewards/math_compute_score": 0.5714285969734192, "step": 129 }, { "epoch": 0.4126984126984127, "grad_norm": 0.35621529328169527, "learning_rate": 6.642711909554174e-07, "loss": 0.0192, "step": 130 }, { "epoch": 0.4126984126984127, "eval_clip_ratio": 0.0, "eval_completion_length": 1621.2857666015625, "eval_kl": 0.0035502115885416665, "eval_loss": 0.03674715757369995, "eval_reward": 1.8930153846740723, "eval_reward_std": 0.4688274661699931, "eval_rewards/": 6.774600346883138, "eval_rewards/math_compute_score": 0.6726190646489462, "eval_runtime": 133.1814, "eval_samples_per_second": 0.158, "eval_steps_per_second": 0.008, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 1673.3839721679688, "epoch": 0.4158730158730159, "grad_norm": 0.4405464519743735, "kl": 0.0037384033203125, "learning_rate": 6.593982997782548e-07, "loss": 0.0243, "reward": 1.7884975671768188, "reward_std": 0.6518445014953613, "rewards/": 6.942487955093384, "rewards/math_compute_score": 0.5000000149011612, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 1482.696533203125, "epoch": 0.41904761904761906, "grad_norm": 0.39457572146801523, "kl": 0.003662109375, "learning_rate": 6.545084971874736e-07, "loss": 0.0455, "reward": 1.9908483028411865, "reward_std": 0.49568915367126465, "rewards/": 6.9542412757873535, "rewards/math_compute_score": 0.7500000596046448, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 1558.321533203125, "epoch": 0.4222222222222222, "grad_norm": 0.3723322635366814, "kl": 0.0032196044921875, "learning_rate": 6.496023019682446e-07, "loss": 0.0261, "reward": 2.116741180419922, "reward_std": 0.34433886408805847, "rewards/": 7.155134201049805, "rewards/math_compute_score": 0.8571429252624512, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 1776.482177734375, "epoch": 0.4253968253968254, "grad_norm": 0.3342896457505393, "kl": 0.00286865234375, "learning_rate": 6.44680234644919e-07, "loss": 0.0427, "reward": 1.4701590538024902, "reward_std": 0.552353024482727, "rewards/": 6.636509895324707, "rewards/math_compute_score": 0.1785714328289032, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 1594.1429443359375, "epoch": 0.42857142857142855, "grad_norm": 0.3548357521298244, "kl": 0.0038299560546875, "learning_rate": 6.397428174258047e-07, "loss": 0.0483, "reward": 2.0053014755249023, "reward_std": 0.3594396710395813, "rewards/": 7.455078601837158, "rewards/math_compute_score": 0.6428571939468384, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 1353.71435546875, "epoch": 0.43174603174603177, "grad_norm": 0.4428436053114652, "kl": 0.004180908203125, "learning_rate": 6.347905741477612e-07, "loss": 0.0562, "reward": 1.9524275064468384, "reward_std": 0.42881521582603455, "rewards/": 7.1192803382873535, "rewards/math_compute_score": 0.660714328289032, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 1373.732177734375, "epoch": 0.43492063492063493, "grad_norm": 0.3804164543335114, "kl": 0.004302978515625, "learning_rate": 6.298240302206241e-07, "loss": 0.0371, "reward": 2.0057549476623535, "reward_std": 0.3291456401348114, "rewards/": 7.028774261474609, "rewards/math_compute_score": 0.7500000596046448, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 1614.732177734375, "epoch": 0.4380952380952381, "grad_norm": 0.3920737510841866, "kl": 0.004119873046875, "learning_rate": 6.2484371257146e-07, "loss": 0.031, "reward": 1.8299667835235596, "reward_std": 0.5113915205001831, "rewards/": 7.149832725524902, "rewards/math_compute_score": 0.5, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 1481.08935546875, "epoch": 0.44126984126984126, "grad_norm": 0.38144253817590884, "kl": 0.00408935546875, "learning_rate": 6.198501495886638e-07, "loss": -0.0053, "reward": 1.7764790058135986, "reward_std": 0.40745845437049866, "rewards/": 6.739537239074707, "rewards/math_compute_score": 0.535714328289032, "step": 139 }, { "epoch": 0.4444444444444444, "grad_norm": 0.40734808818587626, "learning_rate": 6.148438710658978e-07, "loss": 0.0635, "step": 140 }, { "epoch": 0.4444444444444444, "eval_clip_ratio": 0.0, "eval_completion_length": 1595.1012369791667, "eval_kl": 0.0038808186848958335, "eval_loss": 0.03390444815158844, "eval_reward": 1.973921298980713, "eval_reward_std": 0.47822797298431396, "eval_rewards/": 7.107701142628987, "eval_rewards/math_compute_score": 0.6904762188593546, "eval_runtime": 132.0027, "eval_samples_per_second": 0.159, "eval_steps_per_second": 0.008, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 1492.65185546875, "epoch": 0.44761904761904764, "grad_norm": 0.4080068354053614, "kl": 0.0045166015625, "learning_rate": 6.098254081458838e-07, "loss": 0.0728, "reward": 1.9614050388336182, "reward_std": 0.5336401164531708, "rewards/": 7.09273886680603, "rewards/math_compute_score": 0.6785714626312256, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 1850.3929443359375, "epoch": 0.4507936507936508, "grad_norm": 0.34401058222235625, "kl": 0.0030670166015625, "learning_rate": 6.047952932640512e-07, "loss": 0.0162, "reward": 1.3241490125656128, "reward_std": 0.568393886089325, "rewards/": 6.263602256774902, "rewards/math_compute_score": 0.0892857164144516, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 1395.8035888671875, "epoch": 0.45396825396825397, "grad_norm": 0.3084325410352855, "kl": 0.0038909912109375, "learning_rate": 5.997540600920478e-07, "loss": 0.0174, "reward": 1.953850507736206, "reward_std": 0.4086017906665802, "rewards/": 6.769252300262451, "rewards/math_compute_score": 0.7500000596046448, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 1625.21435546875, "epoch": 0.45714285714285713, "grad_norm": 0.41338154741382704, "kl": 0.0048828125, "learning_rate": 5.947022434811201e-07, "loss": 0.0367, "reward": 1.5787110328674316, "reward_std": 0.49698716402053833, "rewards/": 6.750697612762451, "rewards/math_compute_score": 0.2857142984867096, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 1505.0179443359375, "epoch": 0.4603174603174603, "grad_norm": 0.38852067924976885, "kl": 0.00421142578125, "learning_rate": 5.896403794053678e-07, "loss": 0.0451, "reward": 1.9318640232086182, "reward_std": 0.5010179877281189, "rewards/": 6.945034027099609, "rewards/math_compute_score": 0.6785714626312256, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 1491.3751220703125, "epoch": 0.4634920634920635, "grad_norm": 0.3492147620184295, "kl": 0.0042724609375, "learning_rate": 5.845690049048798e-07, "loss": 0.0706, "reward": 2.178906202316284, "reward_std": 0.457103431224823, "rewards/": 7.323102951049805, "rewards/math_compute_score": 0.8928571939468384, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 1746.1607666015625, "epoch": 0.4666666666666667, "grad_norm": 0.44104615846617024, "kl": 0.005401611328125, "learning_rate": 5.794886580287564e-07, "loss": 0.0404, "reward": 1.4707032442092896, "reward_std": 0.8705157041549683, "rewards/": 6.4249444007873535, "rewards/math_compute_score": 0.2321428656578064, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 1627.08935546875, "epoch": 0.46984126984126984, "grad_norm": 0.395676436764285, "kl": 0.003997802734375, "learning_rate": 5.743998777780251e-07, "loss": 0.057, "reward": 1.7963100671768188, "reward_std": 0.5916603207588196, "rewards/": 7.410121917724609, "rewards/math_compute_score": 0.392857164144516, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 1585.08935546875, "epoch": 0.473015873015873, "grad_norm": 0.32362413396596257, "kl": 0.0037078857421875, "learning_rate": 5.693032040484547e-07, "loss": -0.0072, "reward": 1.691545844078064, "reward_std": 0.4373229146003723, "rewards/": 6.529157638549805, "rewards/math_compute_score": 0.4821428656578064, "step": 149 }, { "epoch": 0.47619047619047616, "grad_norm": 0.38103977643395054, "learning_rate": 5.641991775732755e-07, "loss": 0.0482, "step": 150 }, { "epoch": 0.47619047619047616, "eval_clip_ratio": 0.0, "eval_completion_length": 1561.9564208984375, "eval_kl": 0.004628499348958333, "eval_loss": 0.023879073560237885, "eval_reward": 1.9818453788757324, "eval_reward_std": 0.42304734388987225, "eval_rewards/": 6.980655034383138, "eval_rewards/math_compute_score": 0.7321428954601288, "eval_runtime": 131.655, "eval_samples_per_second": 0.16, "eval_steps_per_second": 0.008, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 1668.759033203125, "epoch": 0.4793650793650794, "grad_norm": 0.31698478601407637, "kl": 0.00412750244140625, "learning_rate": 5.590883398658094e-07, "loss": 0.0239, "reward": 1.6429409980773926, "reward_std": 0.5557140111923218, "rewards/": 6.857561826705933, "rewards/math_compute_score": 0.3392857313156128, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 1623.08935546875, "epoch": 0.48253968253968255, "grad_norm": 0.3705285203587748, "kl": 0.00408935546875, "learning_rate": 5.539712331620185e-07, "loss": 0.0581, "reward": 1.4986224174499512, "reward_std": 0.43796294927597046, "rewards/": 6.7788262367248535, "rewards/math_compute_score": 0.1785714328289032, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 1689.446533203125, "epoch": 0.4857142857142857, "grad_norm": 0.3861278436124767, "kl": 0.00445556640625, "learning_rate": 5.488484003629758e-07, "loss": 0.0305, "reward": 1.5913225412368774, "reward_std": 0.6033198833465576, "rewards/": 6.813755989074707, "rewards/math_compute_score": 0.2857142984867096, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 1477.107177734375, "epoch": 0.4888888888888889, "grad_norm": 0.36475288664194044, "kl": 0.0047607421875, "learning_rate": 5.437203849772664e-07, "loss": 0.0404, "reward": 1.8446986675262451, "reward_std": 0.32607829570770264, "rewards/": 7.080636501312256, "rewards/math_compute_score": 0.535714328289032, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 1420.3035888671875, "epoch": 0.49206349206349204, "grad_norm": 0.3896113098470113, "kl": 0.004180908203125, "learning_rate": 5.385877310633232e-07, "loss": -0.0126, "reward": 1.838978886604309, "reward_std": 0.4181511104106903, "rewards/": 6.694894313812256, "rewards/math_compute_score": 0.625, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 1735.4107666015625, "epoch": 0.49523809523809526, "grad_norm": 0.366034646195721, "kl": 0.004364013671875, "learning_rate": 5.334509831717058e-07, "loss": 0.0191, "reward": 1.687611699104309, "reward_std": 0.5151606798171997, "rewards/": 7.2237725257873535, "rewards/math_compute_score": 0.3035714328289032, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 1403.2857666015625, "epoch": 0.4984126984126984, "grad_norm": 0.3664836889631256, "kl": 0.005584716796875, "learning_rate": 5.283106862873252e-07, "loss": 0.0848, "reward": 2.293659210205078, "reward_std": 0.24820633232593536, "rewards/": 7.611154079437256, "rewards/math_compute_score": 0.9642857313156128, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 1685.2501220703125, "epoch": 0.5015873015873016, "grad_norm": 0.3481512366201298, "kl": 0.004180908203125, "learning_rate": 5.231673857716243e-07, "loss": 0.041, "reward": 1.8583705425262451, "reward_std": 0.42498812079429626, "rewards/": 6.4347100257873535, "rewards/math_compute_score": 0.7142857313156128, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 1417.6785888671875, "epoch": 0.5047619047619047, "grad_norm": 0.3977021065047324, "kl": 0.00439453125, "learning_rate": 5.18021627304717e-07, "loss": 0.0716, "reward": 1.9547433853149414, "reward_std": 0.4087882936000824, "rewards/": 7.273716926574707, "rewards/math_compute_score": 0.625, "step": 159 }, { "epoch": 0.5079365079365079, "grad_norm": 0.3539481723030545, "learning_rate": 5.128739568274943e-07, "loss": 0.0753, "step": 160 }, { "epoch": 0.5079365079365079, "eval_clip_ratio": 0.0, "eval_completion_length": 1508.3075764973958, "eval_kl": 0.004781087239583333, "eval_loss": 0.010089995339512825, "eval_reward": 1.9843844572703044, "eval_reward_std": 0.4194992780685425, "eval_rewards/": 7.017159938812256, "eval_rewards/math_compute_score": 0.7261905074119568, "eval_runtime": 130.1634, "eval_samples_per_second": 0.161, "eval_steps_per_second": 0.008, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 1747.4286499023438, "epoch": 0.5111111111111111, "grad_norm": 0.3330449401616063, "kl": 0.00437164306640625, "learning_rate": 5.077249204837025e-07, "loss": 0.019, "reward": 1.6631278991699219, "reward_std": 0.6911021769046783, "rewards/": 6.672781944274902, "rewards/math_compute_score": 0.4107142984867096, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 1497.482177734375, "epoch": 0.5142857142857142, "grad_norm": 0.42213450988969753, "kl": 0.004486083984375, "learning_rate": 5.025750645620004e-07, "loss": 0.0252, "reward": 1.7366769313812256, "reward_std": 0.6020164489746094, "rewards/": 6.540527820587158, "rewards/math_compute_score": 0.535714328289032, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 1306.2857666015625, "epoch": 0.5174603174603175, "grad_norm": 0.6578798978662977, "kl": 0.011474609375, "learning_rate": 4.974249354379996e-07, "loss": 0.05, "reward": 2.152064800262451, "reward_std": 0.43625935912132263, "rewards/": 7.474609851837158, "rewards/math_compute_score": 0.8214285969734192, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 1578.5179443359375, "epoch": 0.5206349206349207, "grad_norm": 0.3630493679815709, "kl": 0.0047607421875, "learning_rate": 4.922750795162973e-07, "loss": 0.0061, "reward": 1.8181921243667603, "reward_std": 0.45378485321998596, "rewards/": 7.305245876312256, "rewards/math_compute_score": 0.4464285969734192, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 1523.857177734375, "epoch": 0.5238095238095238, "grad_norm": 0.40781680936494136, "kl": 0.0057373046875, "learning_rate": 4.871260431725058e-07, "loss": 0.0301, "reward": 1.784919261932373, "reward_std": 0.5600239634513855, "rewards/": 6.710309982299805, "rewards/math_compute_score": 0.5535714626312256, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 1653.607177734375, "epoch": 0.526984126984127, "grad_norm": 0.3627537206069918, "kl": 0.005035400390625, "learning_rate": 4.81978372695283e-07, "loss": 0.0296, "reward": 1.5985910892486572, "reward_std": 0.39210641384124756, "rewards/": 6.707240581512451, "rewards/math_compute_score": 0.3214285969734192, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 1424.1785888671875, "epoch": 0.5301587301587302, "grad_norm": 0.3566511267927936, "kl": 0.00543212890625, "learning_rate": 4.768326142283756e-07, "loss": 0.0401, "reward": 1.9029020071029663, "reward_std": 0.253780335187912, "rewards/": 6.871652126312256, "rewards/math_compute_score": 0.660714328289032, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 1728.1251220703125, "epoch": 0.5333333333333333, "grad_norm": 0.3668634566920126, "kl": 0.004486083984375, "learning_rate": 4.7168931371267473e-07, "loss": 0.0457, "reward": 1.5901787281036377, "reward_std": 0.5735574960708618, "rewards/": 6.950893402099609, "rewards/math_compute_score": 0.25, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 1611.5357666015625, "epoch": 0.5365079365079365, "grad_norm": 0.3213785200743163, "kl": 0.00469970703125, "learning_rate": 4.665490168282943e-07, "loss": 0.0072, "reward": 1.8737167119979858, "reward_std": 0.6218880414962769, "rewards/": 6.940011501312256, "rewards/math_compute_score": 0.6071428656578064, "step": 169 }, { "epoch": 0.5396825396825397, "grad_norm": 0.35976271648220715, "learning_rate": 4.614122689366768e-07, "loss": 0.0402, "step": 170 }, { "epoch": 0.5396825396825397, "eval_clip_ratio": 0.0, "eval_completion_length": 1549.6785888671875, "eval_kl": 0.004964192708333333, "eval_loss": 0.03550202399492264, "eval_reward": 1.9592356284459431, "eval_reward_std": 0.42070769270261127, "eval_rewards/": 6.962844530741374, "eval_rewards/math_compute_score": 0.7083333631356558, "eval_runtime": 131.0866, "eval_samples_per_second": 0.16, "eval_steps_per_second": 0.008, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 1788.1697387695312, "epoch": 0.5428571428571428, "grad_norm": 0.30243975738537365, "kl": 0.00441741943359375, "learning_rate": 4.562796150227337e-07, "loss": 0.0388, "reward": 1.6102469563484192, "reward_std": 0.5664084255695343, "rewards/": 6.872663497924805, "rewards/math_compute_score": 0.2946428656578064, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 1281.8929443359375, "epoch": 0.546031746031746, "grad_norm": 0.4239327955548188, "kl": 0.006072998046875, "learning_rate": 4.511515996370243e-07, "loss": -0.0323, "reward": 2.155747890472412, "reward_std": 0.38098111748695374, "rewards/": 7.064453601837158, "rewards/math_compute_score": 0.9285714626312256, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 1426.08935546875, "epoch": 0.5492063492063493, "grad_norm": 0.3848409508825075, "kl": 0.004547119140625, "learning_rate": 4.460287668379814e-07, "loss": 0.0612, "reward": 1.9045759439468384, "reward_std": 0.19524678587913513, "rewards/": 6.522879600524902, "rewards/math_compute_score": 0.7500000596046448, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 1506.8751220703125, "epoch": 0.5523809523809524, "grad_norm": 0.36448706461592817, "kl": 0.00469970703125, "learning_rate": 4.409116601341907e-07, "loss": -0.0125, "reward": 2.0860493183135986, "reward_std": 0.4216456711292267, "rewards/": 7.073102951049805, "rewards/math_compute_score": 0.8392857313156128, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 1531.4285888671875, "epoch": 0.5555555555555556, "grad_norm": 0.4058679353297223, "kl": 0.005218505859375, "learning_rate": 4.3580082242672444e-07, "loss": 0.0513, "reward": 2.0486607551574707, "reward_std": 0.5620574951171875, "rewards/": 7.171875476837158, "rewards/math_compute_score": 0.7678571939468384, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 1354.0535888671875, "epoch": 0.5587301587301587, "grad_norm": 0.41211128125944263, "kl": 0.006805419921875, "learning_rate": 4.3069679595154536e-07, "loss": 0.0461, "reward": 2.362277030944824, "reward_std": 0.20701220631599426, "rewards/": 7.811384201049805, "rewards/math_compute_score": 1.0, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 1543.571533203125, "epoch": 0.5619047619047619, "grad_norm": 0.3763535929052217, "kl": 0.005279541015625, "learning_rate": 4.2560012222197506e-07, "loss": 0.0072, "reward": 1.9439733028411865, "reward_std": 0.26243603229522705, "rewards/": 6.791295051574707, "rewards/math_compute_score": 0.7321428656578064, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 1764.21435546875, "epoch": 0.5650793650793651, "grad_norm": 0.38459117939596527, "kl": 0.0054931640625, "learning_rate": 4.205113419712435e-07, "loss": 0.0308, "reward": 1.413002371788025, "reward_std": 0.48099908232688904, "rewards/": 6.493582725524902, "rewards/math_compute_score": 0.1428571492433548, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 1457.071533203125, "epoch": 0.5682539682539682, "grad_norm": 0.3648394332582025, "kl": 0.005157470703125, "learning_rate": 4.1543099509512023e-07, "loss": 0.0135, "reward": 2.1016740798950195, "reward_std": 0.4655519425868988, "rewards/": 7.508370876312256, "rewards/math_compute_score": 0.7500000596046448, "step": 179 }, { "epoch": 0.5714285714285714, "grad_norm": 0.4347832296603526, "learning_rate": 4.1035962059463224e-07, "loss": -0.0031, "step": 180 }, { "epoch": 0.5714285714285714, "eval_clip_ratio": 0.0, "eval_completion_length": 1564.9504801432292, "eval_kl": 0.005472819010416667, "eval_loss": 0.04810946434736252, "eval_reward": 2.0953497886657715, "eval_reward_std": 0.3557452509800593, "eval_rewards/": 7.143415451049805, "eval_rewards/math_compute_score": 0.8333333532015482, "eval_runtime": 130.6017, "eval_samples_per_second": 0.161, "eval_steps_per_second": 0.008, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 1442.7589721679688, "epoch": 0.5746031746031746, "grad_norm": 0.31604533911482663, "kl": 0.0054168701171875, "learning_rate": 4.052977565188799e-07, "loss": 0.0286, "reward": 1.9001396894454956, "reward_std": 0.35335803031921387, "rewards/": 6.929269313812256, "rewards/math_compute_score": 0.6428571492433548, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 1675.7857666015625, "epoch": 0.5777777777777777, "grad_norm": 0.40418194066418917, "kl": 0.00537109375, "learning_rate": 4.0024593990795223e-07, "loss": 0.0231, "reward": 1.45106041431427, "reward_std": 0.7710850238800049, "rewards/": 6.826730251312256, "rewards/math_compute_score": 0.1071428656578064, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 1623.696533203125, "epoch": 0.580952380952381, "grad_norm": 20.231824794338248, "kl": 1.4296875, "learning_rate": 3.952047067359487e-07, "loss": 0.1031, "reward": 1.6864677667617798, "reward_std": 0.3528647720813751, "rewards/": 6.860909938812256, "rewards/math_compute_score": 0.392857164144516, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 1415.08935546875, "epoch": 0.5841269841269842, "grad_norm": 0.38374407884466366, "kl": 0.005889892578125, "learning_rate": 3.9017459185411614e-07, "loss": -0.0095, "reward": 1.9470704793930054, "reward_std": 0.3512076735496521, "rewards/": 6.878209114074707, "rewards/math_compute_score": 0.7142857313156128, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 1437.857177734375, "epoch": 0.5873015873015873, "grad_norm": 0.3960042132238463, "kl": 0.004913330078125, "learning_rate": 3.8515612893410224e-07, "loss": 0.0284, "reward": 1.9784600734710693, "reward_std": 0.3158036768436432, "rewards/": 7.035156726837158, "rewards/math_compute_score": 0.7142857313156128, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 1376.5535888671875, "epoch": 0.5904761904761905, "grad_norm": 0.4231910527525782, "kl": 0.005767822265625, "learning_rate": 3.8014985041133626e-07, "loss": 0.0177, "reward": 2.132868528366089, "reward_std": 0.47174835205078125, "rewards/": 7.450056076049805, "rewards/math_compute_score": 0.8035714626312256, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 1735.0357666015625, "epoch": 0.5936507936507937, "grad_norm": 0.370413487882503, "kl": 0.004730224609375, "learning_rate": 3.7515628742853997e-07, "loss": 0.032, "reward": 1.568275809288025, "reward_std": 0.4366031587123871, "rewards/": 6.555664539337158, "rewards/math_compute_score": 0.3214285969734192, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 1384.107177734375, "epoch": 0.5968253968253968, "grad_norm": 0.34069036045598083, "kl": 0.005096435546875, "learning_rate": 3.70175969779376e-07, "loss": -0.0015, "reward": 1.8890068531036377, "reward_std": 0.3061581552028656, "rewards/": 7.1593194007873535, "rewards/math_compute_score": 0.5714285969734192, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 1711.58935546875, "epoch": 0.6, "grad_norm": 0.3411825559980518, "kl": 0.004913330078125, "learning_rate": 3.6520942585223866e-07, "loss": -0.0253, "reward": 1.6066406965255737, "reward_std": 0.5342792868614197, "rewards/": 6.533203601837158, "rewards/math_compute_score": 0.3750000298023224, "step": 189 }, { "epoch": 0.6031746031746031, "grad_norm": 0.35592352794000803, "learning_rate": 3.602571825741953e-07, "loss": 0.0205, "step": 190 }, { "epoch": 0.6031746031746031, "eval_clip_ratio": 0.0, "eval_completion_length": 1554.7837727864583, "eval_kl": 0.005961100260416667, "eval_loss": -0.003977527376264334, "eval_reward": 1.9975679318110149, "eval_reward_std": 0.45795708894729614, "eval_rewards/": 6.940220673878987, "eval_rewards/math_compute_score": 0.761904795964559, "eval_runtime": 131.2541, "eval_samples_per_second": 0.16, "eval_steps_per_second": 0.008, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 1356.5625610351562, "epoch": 0.6063492063492063, "grad_norm": 0.3596374444943596, "kl": 0.0055999755859375, "learning_rate": 3.55319765355081e-07, "loss": -0.0037, "reward": 2.0135952830314636, "reward_std": 0.33204740285873413, "rewards/": 6.925118923187256, "rewards/math_compute_score": 0.785714328289032, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 1531.6251220703125, "epoch": 0.6095238095238096, "grad_norm": 0.3616323272673789, "kl": 0.006195068359375, "learning_rate": 3.503976980317554e-07, "loss": -0.0153, "reward": 2.023493528366089, "reward_std": 0.3276682496070862, "rewards/": 7.5460381507873535, "rewards/math_compute_score": 0.6428571939468384, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 1549.982177734375, "epoch": 0.6126984126984127, "grad_norm": 0.3739633675188606, "kl": 0.0057373046875, "learning_rate": 3.454915028125263e-07, "loss": 0.0195, "reward": 1.88074791431427, "reward_std": 0.5293351411819458, "rewards/": 6.975167751312256, "rewards/math_compute_score": 0.6071428656578064, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 1632.857177734375, "epoch": 0.6158730158730159, "grad_norm": 0.3844476214545633, "kl": 0.00555419921875, "learning_rate": 3.4060170022174517e-07, "loss": -0.0172, "reward": 1.9814037084579468, "reward_std": 0.6114475727081299, "rewards/": 7.192731857299805, "rewards/math_compute_score": 0.6785714626312256, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 1197.08935546875, "epoch": 0.6190476190476191, "grad_norm": 0.3588871009271226, "kl": 0.005828857421875, "learning_rate": 3.357288090445826e-07, "loss": 0.0417, "reward": 2.2577009201049805, "reward_std": 0.24351288378238678, "rewards/": 7.431362152099609, "rewards/math_compute_score": 0.9642857313156128, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 1516.5535888671875, "epoch": 0.6222222222222222, "grad_norm": 0.4221755143300646, "kl": 0.006378173828125, "learning_rate": 3.3087334627198727e-07, "loss": 0.0476, "reward": 2.0338730812072754, "reward_std": 0.45778489112854004, "rewards/": 7.455078601837158, "rewards/math_compute_score": 0.6785714626312256, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 1591.9285888671875, "epoch": 0.6253968253968254, "grad_norm": 0.37451153018874117, "kl": 0.005859375, "learning_rate": 3.260358270458354e-07, "loss": 0.0074, "reward": 1.9349645376205444, "reward_std": 0.4895531237125397, "rewards/": 6.960536956787109, "rewards/math_compute_score": 0.6785714626312256, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 1527.196533203125, "epoch": 0.6285714285714286, "grad_norm": 0.3302300732895513, "kl": 0.005035400390625, "learning_rate": 3.212167646042776e-07, "loss": -0.0122, "reward": 1.9998327493667603, "reward_std": 0.2854258418083191, "rewards/": 7.213449001312256, "rewards/math_compute_score": 0.6964285969734192, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 1533.3751220703125, "epoch": 0.6317460317460317, "grad_norm": 0.36563930742752804, "kl": 0.005340576171875, "learning_rate": 3.164166702272855e-07, "loss": 0.0625, "reward": 2.099107265472412, "reward_std": 0.40774399042129517, "rewards/": 7.638393402099609, "rewards/math_compute_score": 0.7142857313156128, "step": 199 }, { "epoch": 0.6349206349206349, "grad_norm": 0.3003969277270976, "learning_rate": 3.1163605318240736e-07, "loss": 0.0211, "step": 200 }, { "epoch": 0.6349206349206349, "eval_clip_ratio": 0.0, "eval_completion_length": 1513.3889567057292, "eval_kl": 0.0057373046875, "eval_loss": -0.0034506141673773527, "eval_reward": 2.0185548464457193, "eval_reward_std": 0.42673546075820923, "eval_rewards/": 7.140392780303955, "eval_rewards/math_compute_score": 0.7380952636400858, "eval_runtime": 131.2464, "eval_samples_per_second": 0.16, "eval_steps_per_second": 0.008, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 1553.607177734375, "epoch": 0.638095238095238, "grad_norm": 0.4204253637756095, "kl": 0.00592041015625, "learning_rate": 3.0687542067073915e-07, "loss": 0.0522, "reward": 1.829387605190277, "reward_std": 0.4581097811460495, "rewards/": 6.932652473449707, "rewards/math_compute_score": 0.5535714626312256, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 1458.4107666015625, "epoch": 0.6412698412698413, "grad_norm": 0.37293790377807984, "kl": 0.00726318359375, "learning_rate": 3.021352777731095e-07, "loss": 0.0798, "reward": 1.893429160118103, "reward_std": 0.7099537253379822, "rewards/": 7.252860069274902, "rewards/math_compute_score": 0.5535714626312256, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 1727.446533203125, "epoch": 0.6444444444444445, "grad_norm": 0.364624475214311, "kl": 0.005828857421875, "learning_rate": 2.974161273964969e-07, "loss": 0.0132, "reward": 1.3616769313812256, "reward_std": 0.5690730214118958, "rewards/": 6.379813194274902, "rewards/math_compute_score": 0.1071428656578064, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 1601.232177734375, "epoch": 0.6476190476190476, "grad_norm": 0.3723608933130564, "kl": 0.005279541015625, "learning_rate": 2.9271847022066987e-07, "loss": 0.0167, "reward": 1.7884488105773926, "reward_std": 0.3653627932071686, "rewards/": 7.227957725524902, "rewards/math_compute_score": 0.4285714626312256, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 1664.3035888671875, "epoch": 0.6507936507936508, "grad_norm": 0.37220118357197624, "kl": 0.0057373046875, "learning_rate": 2.880428046450697e-07, "loss": 0.0547, "reward": 2.2032926082611084, "reward_std": 0.5352621674537659, "rewards/": 7.873605251312256, "rewards/math_compute_score": 0.785714328289032, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 1527.446533203125, "epoch": 0.653968253968254, "grad_norm": 0.44669994879904146, "kl": 0.00592041015625, "learning_rate": 2.8338962673593194e-07, "loss": 0.0466, "reward": 1.9878350496292114, "reward_std": 0.36021357774734497, "rewards/": 7.224888801574707, "rewards/math_compute_score": 0.6785714626312256, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 1485.3929443359375, "epoch": 0.6571428571428571, "grad_norm": 0.36237163203607514, "kl": 0.0067138671875, "learning_rate": 2.7875943017365556e-07, "loss": 0.0087, "reward": 2.0885045528411865, "reward_std": 0.23854570090770721, "rewards/": 7.585379600524902, "rewards/math_compute_score": 0.7142857313156128, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 1390.21435546875, "epoch": 0.6603174603174603, "grad_norm": 0.43492788329993204, "kl": 0.006011962890625, "learning_rate": 2.7415270620042634e-07, "loss": 0.0368, "reward": 2.027120590209961, "reward_std": 0.32408618927001953, "rewards/": 7.135602951049805, "rewards/math_compute_score": 0.7500000596046448, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 1614.232177734375, "epoch": 0.6634920634920635, "grad_norm": 0.4285860950638222, "kl": 0.005950927734375, "learning_rate": 2.695699435680986e-07, "loss": -0.0219, "reward": 1.80555260181427, "reward_std": 0.39699044823646545, "rewards/": 7.170619964599609, "rewards/math_compute_score": 0.4642857313156128, "step": 209 }, { "epoch": 0.6666666666666666, "grad_norm": 0.4073585516563059, "learning_rate": 2.6501162848634016e-07, "loss": 0.0635, "step": 210 }, { "epoch": 0.6666666666666666, "eval_clip_ratio": 0.0, "eval_completion_length": 1593.8790690104167, "eval_kl": 0.006266276041666667, "eval_loss": 0.010452189482748508, "eval_reward": 2.0239213705062866, "eval_reward_std": 0.4319620430469513, "eval_rewards/": 7.357701142628987, "eval_rewards/math_compute_score": 0.6904762089252472, "eval_runtime": 132.1993, "eval_samples_per_second": 0.159, "eval_steps_per_second": 0.008, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 1548.5625610351562, "epoch": 0.6698412698412698, "grad_norm": 0.3896154656236107, "kl": 0.0064697265625, "learning_rate": 2.604782445710476e-07, "loss": 0.0685, "reward": 1.9150113463401794, "reward_std": 0.5420868694782257, "rewards/": 7.289341926574707, "rewards/math_compute_score": 0.5714285969734192, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 1175.875, "epoch": 0.6730158730158731, "grad_norm": 0.3761166584078419, "kl": 0.00653076171875, "learning_rate": 2.559702727930386e-07, "loss": 0.0143, "reward": 2.3515625, "reward_std": 0.2300196886062622, "rewards/": 7.757812976837158, "rewards/math_compute_score": 1.0, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 1592.0535888671875, "epoch": 0.6761904761904762, "grad_norm": 0.3547503060575964, "kl": 0.005767822265625, "learning_rate": 2.5148819142702095e-07, "loss": 0.0157, "reward": 2.0560269355773926, "reward_std": 0.4543492794036865, "rewards/": 7.565848350524902, "rewards/math_compute_score": 0.6785714626312256, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 1390.7679443359375, "epoch": 0.6793650793650794, "grad_norm": 0.41727155544044037, "kl": 0.00811767578125, "learning_rate": 2.470324760008517e-07, "loss": 0.0064, "reward": 1.9091730117797852, "reward_std": 0.4644983112812042, "rewards/": 6.545863628387451, "rewards/math_compute_score": 0.7500000596046448, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 1498.2857666015625, "epoch": 0.6825396825396826, "grad_norm": 0.38605442508491833, "kl": 0.006195068359375, "learning_rate": 2.426035992450848e-07, "loss": 0.0188, "reward": 1.966071605682373, "reward_std": 0.4697768986225128, "rewards/": 7.2589287757873535, "rewards/math_compute_score": 0.6428571939468384, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 1406.232177734375, "epoch": 0.6857142857142857, "grad_norm": 0.3466649961934692, "kl": 0.004974365234375, "learning_rate": 2.382020310428161e-07, "loss": 0.0045, "reward": 2.04665207862854, "reward_std": 0.2766338586807251, "rewards/": 7.233259201049805, "rewards/math_compute_score": 0.7500000596046448, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 1642.9107666015625, "epoch": 0.6888888888888889, "grad_norm": 0.3395883468462042, "kl": 0.00592041015625, "learning_rate": 2.3382823837983312e-07, "loss": 0.0283, "reward": 1.9188058376312256, "reward_std": 0.31119585037231445, "rewards/": 7.308315277099609, "rewards/math_compute_score": 0.5714285969734192, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 1288.75, "epoch": 0.692063492063492, "grad_norm": 0.4169289379787409, "kl": 0.00665283203125, "learning_rate": 2.2948268529506765e-07, "loss": 0.0257, "reward": 2.0765626430511475, "reward_std": 0.4043300747871399, "rewards/": 7.811384201049805, "rewards/math_compute_score": 0.6428571939468384, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 1746.0535888671875, "epoch": 0.6952380952380952, "grad_norm": 0.38975967722789145, "kl": 0.005126953125, "learning_rate": 2.251658328313647e-07, "loss": -0.002, "reward": 1.3670480251312256, "reward_std": 0.6043983697891235, "rewards/": 6.7638115882873535, "rewards/math_compute_score": 0.01785714365541935, "step": 219 }, { "epoch": 0.6984126984126984, "grad_norm": 0.3971534991234478, "learning_rate": 2.208781389865677e-07, "loss": 0.0496, "step": 220 }, { "epoch": 0.6984126984126984, "eval_clip_ratio": 0.0, "eval_completion_length": 1515.7163492838542, "eval_kl": 0.006093343098958333, "eval_loss": 0.050898950546979904, "eval_reward": 1.9710845947265625, "eval_reward_std": 0.48362330595652264, "eval_rewards/": 7.117327372233073, "eval_rewards/math_compute_score": 0.6845238407452902, "eval_runtime": 130.2534, "eval_samples_per_second": 0.161, "eval_steps_per_second": 0.008, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 1566.7857666015625, "epoch": 0.7015873015873015, "grad_norm": 0.32907894573216867, "kl": 0.0057525634765625, "learning_rate": 2.1662005866492715e-07, "loss": 0.0312, "reward": 1.7159180641174316, "reward_std": 0.4860241115093231, "rewards/": 7.329590320587158, "rewards/math_compute_score": 0.3125000149011612, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 1622.3751220703125, "epoch": 0.7047619047619048, "grad_norm": 0.36878225894218164, "kl": 0.006378173828125, "learning_rate": 2.1239204362883695e-07, "loss": 0.0173, "reward": 1.4388115406036377, "reward_std": 0.48099273443222046, "rewards/": 6.4083428382873535, "rewards/math_compute_score": 0.196428582072258, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 1494.3929443359375, "epoch": 0.707936507936508, "grad_norm": 0.38962068270348554, "kl": 0.006134033203125, "learning_rate": 2.0819454245090568e-07, "loss": 0.0288, "reward": 1.872544765472412, "reward_std": 0.4542202055454254, "rewards/": 6.934152126312256, "rewards/math_compute_score": 0.6071428656578064, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 1317.2857666015625, "epoch": 0.7111111111111111, "grad_norm": 0.36894582377255875, "kl": 0.00543212890625, "learning_rate": 2.0402800046636364e-07, "loss": 0.0548, "reward": 2.053738832473755, "reward_std": 0.22289863228797913, "rewards/": 6.911551475524902, "rewards/math_compute_score": 0.8392857313156128, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 1480.732177734375, "epoch": 0.7142857142857143, "grad_norm": 0.35221132675838623, "kl": 0.004974365234375, "learning_rate": 1.9989285972581593e-07, "loss": -0.0313, "reward": 1.808510184288025, "reward_std": 0.404258131980896, "rewards/": 6.471121788024902, "rewards/math_compute_score": 0.6428571939468384, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 1309.25, "epoch": 0.7174603174603175, "grad_norm": 0.4356097059014855, "kl": 0.006500244140625, "learning_rate": 1.9578955894834258e-07, "loss": 0.0394, "reward": 2.175558090209961, "reward_std": 0.37214693427085876, "rewards/": 7.734933376312256, "rewards/math_compute_score": 0.785714328289032, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 1685.0001220703125, "epoch": 0.7206349206349206, "grad_norm": 0.4352041974732675, "kl": 0.007537841796875, "learning_rate": 1.917185334749523e-07, "loss": 0.0324, "reward": 1.6248048543930054, "reward_std": 0.2427731454372406, "rewards/": 6.624023914337158, "rewards/math_compute_score": 0.3750000298023224, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 1574.83935546875, "epoch": 0.7238095238095238, "grad_norm": 0.39645948514529894, "kl": 0.006988525390625, "learning_rate": 1.8768021522239574e-07, "loss": 0.0111, "reward": 1.8595423698425293, "reward_std": 0.20890839397907257, "rewards/": 7.297712326049805, "rewards/math_compute_score": 0.5, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 1461.446533203125, "epoch": 0.726984126984127, "grad_norm": 0.43535613207045143, "kl": 0.006072998046875, "learning_rate": 1.836750326373398e-07, "loss": 0.024, "reward": 1.9434850215911865, "reward_std": 0.37709271907806396, "rewards/": 7.145996570587158, "rewards/math_compute_score": 0.6428571939468384, "step": 229 }, { "epoch": 0.7301587301587301, "grad_norm": 0.346576656418996, "learning_rate": 1.7970341065091243e-07, "loss": -0.0127, "step": 230 }, { "epoch": 0.7301587301587301, "eval_clip_ratio": 0.0, "eval_completion_length": 1533.8690999348958, "eval_kl": 0.005940755208333333, "eval_loss": 0.012749183923006058, "eval_reward": 2.0542319615681968, "eval_reward_std": 0.41574782133102417, "eval_rewards/": 7.247349580128987, "eval_rewards/math_compute_score": 0.7559523979822794, "eval_runtime": 130.0387, "eval_samples_per_second": 0.161, "eval_steps_per_second": 0.008, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 1560.6964721679688, "epoch": 0.7333333333333333, "grad_norm": 0.41442561377155895, "kl": 0.00677490234375, "learning_rate": 1.7576577063361918e-07, "loss": -0.0333, "reward": 1.7915493249893188, "reward_std": 0.3907308280467987, "rewards/": 6.957746505737305, "rewards/math_compute_score": 0.5000000298023224, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 1500.7501220703125, "epoch": 0.7365079365079366, "grad_norm": 0.38386522257967753, "kl": 0.00628662109375, "learning_rate": 1.7186253035063736e-07, "loss": 0.0055, "reward": 1.9492467641830444, "reward_std": 0.5078251957893372, "rewards/": 7.031948089599609, "rewards/math_compute_score": 0.6785714626312256, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 1712.83935546875, "epoch": 0.7396825396825397, "grad_norm": 0.3381282235296574, "kl": 0.005828857421875, "learning_rate": 1.6799410391749414e-07, "loss": 0.0371, "reward": 1.826283574104309, "reward_std": 0.623534083366394, "rewards/": 7.345703601837158, "rewards/math_compute_score": 0.4464285969734192, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 1552.321533203125, "epoch": 0.7428571428571429, "grad_norm": 0.3949250198934789, "kl": 0.005645751953125, "learning_rate": 1.6416090175612958e-07, "loss": 0.0482, "reward": 1.816322684288025, "reward_std": 0.5752555131912231, "rewards/": 7.3673272132873535, "rewards/math_compute_score": 0.4285714626312256, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 1474.71435546875, "epoch": 0.746031746031746, "grad_norm": 0.3997394557335043, "kl": 0.006072998046875, "learning_rate": 1.6036333055135344e-07, "loss": 0.0706, "reward": 2.0150113105773926, "reward_std": 0.37213414907455444, "rewards/": 7.360770225524902, "rewards/math_compute_score": 0.6785714626312256, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 1357.3929443359375, "epoch": 0.7492063492063492, "grad_norm": 0.3581961494125316, "kl": 0.00592041015625, "learning_rate": 1.5660179320769788e-07, "loss": 0.0471, "reward": 2.19921875, "reward_std": 0.36445701122283936, "rewards/": 7.5675225257873535, "rewards/math_compute_score": 0.8571429252624512, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 1471.83935546875, "epoch": 0.7523809523809524, "grad_norm": 0.37492809441966657, "kl": 0.006866455078125, "learning_rate": 1.5287668880667104e-07, "loss": 0.0772, "reward": 1.8618303537368774, "reward_std": 0.49302613735198975, "rewards/": 7.023437976837158, "rewards/math_compute_score": 0.5714285969734192, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 1382.6785888671875, "epoch": 0.7555555555555555, "grad_norm": 0.38754661617692593, "kl": 0.00714111328125, "learning_rate": 1.49188412564416e-07, "loss": 0.0385, "reward": 1.967801570892334, "reward_std": 0.3383636176586151, "rewards/": 7.053292751312256, "rewards/math_compute_score": 0.6964285969734192, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 1664.0535888671875, "epoch": 0.7587301587301587, "grad_norm": 0.4007176962544528, "kl": 0.0062255859375, "learning_rate": 1.455373557897814e-07, "loss": 0.0594, "reward": 1.5234934091567993, "reward_std": 0.551922619342804, "rewards/": 6.331752300262451, "rewards/math_compute_score": 0.3214285969734192, "step": 239 }, { "epoch": 0.7619047619047619, "grad_norm": 0.3831506478541147, "learning_rate": 1.4192390584280344e-07, "loss": 0.0014, "step": 240 }, { "epoch": 0.7619047619047619, "eval_clip_ratio": 0.0, "eval_completion_length": 1517.2461344401042, "eval_kl": 0.005961100260416667, "eval_loss": 0.0475773885846138, "eval_reward": 1.965123454729716, "eval_reward_std": 0.47016530235608417, "eval_rewards/": 7.039902687072754, "eval_rewards/math_compute_score": 0.6964285969734192, "eval_runtime": 130.1244, "eval_samples_per_second": 0.161, "eval_steps_per_second": 0.008, "step": 240 } ], "logging_steps": 1.0, "max_steps": 315, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 80, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }