| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.7619047619047619, | |
| "eval_steps": 10, | |
| "global_step": 240, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1826.446533203125, | |
| "epoch": 0.0031746031746031746, | |
| "grad_norm": 0.34752090657228324, | |
| "kl": 0.0, | |
| "learning_rate": 1e-07, | |
| "loss": -0.0327, | |
| "reward": 1.2598215341567993, | |
| "reward_std": 0.510444164276123, | |
| "rewards/": 6.299107551574707, | |
| "rewards/math_compute_score": 0.0, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1842.607177734375, | |
| "epoch": 0.006349206349206349, | |
| "grad_norm": 0.37130067128404515, | |
| "kl": 0.0, | |
| "learning_rate": 2e-07, | |
| "loss": 0.0544, | |
| "reward": 1.4971821308135986, | |
| "reward_std": 0.7506331205368042, | |
| "rewards/": 5.914481163024902, | |
| "rewards/math_compute_score": 0.392857164144516, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2011.33935546875, | |
| "epoch": 0.009523809523809525, | |
| "grad_norm": 0.3200176373062262, | |
| "kl": 0.0002689361572265625, | |
| "learning_rate": 3e-07, | |
| "loss": 0.0077, | |
| "reward": 0.7667689919471741, | |
| "reward_std": 0.6793785691261292, | |
| "rewards/": 5.548130989074707, | |
| "rewards/math_compute_score": -0.4285714626312256, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1912.08935546875, | |
| "epoch": 0.012698412698412698, | |
| "grad_norm": 0.33904036931712517, | |
| "kl": 0.000278472900390625, | |
| "learning_rate": 4e-07, | |
| "loss": 0.0411, | |
| "reward": 1.14453125, | |
| "reward_std": 0.7682722806930542, | |
| "rewards/": 5.865513801574707, | |
| "rewards/math_compute_score": -0.0357142873108387, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1943.5357666015625, | |
| "epoch": 0.015873015873015872, | |
| "grad_norm": 0.3316962176535279, | |
| "kl": 0.0002994537353515625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0439, | |
| "reward": 0.9941790103912354, | |
| "reward_std": 0.9224013090133667, | |
| "rewards/": 5.899466514587402, | |
| "rewards/math_compute_score": -0.2321428656578064, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1823.5357666015625, | |
| "epoch": 0.01904761904761905, | |
| "grad_norm": 0.39521708946388423, | |
| "kl": 0.0003261566162109375, | |
| "learning_rate": 6e-07, | |
| "loss": 0.0678, | |
| "reward": 1.3013323545455933, | |
| "reward_std": 0.7257120013237, | |
| "rewards/": 6.363804817199707, | |
| "rewards/math_compute_score": 0.0357142873108387, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1816.696533203125, | |
| "epoch": 0.022222222222222223, | |
| "grad_norm": 0.3977474793000202, | |
| "kl": 0.0002841949462890625, | |
| "learning_rate": 7e-07, | |
| "loss": 0.0524, | |
| "reward": 0.7803781032562256, | |
| "reward_std": 0.8279339075088501, | |
| "rewards/": 4.901890754699707, | |
| "rewards/math_compute_score": -0.25, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1811.21435546875, | |
| "epoch": 0.025396825396825397, | |
| "grad_norm": 0.3883548683310925, | |
| "kl": 0.000301361083984375, | |
| "learning_rate": 8e-07, | |
| "loss": 0.0334, | |
| "reward": 1.5578125715255737, | |
| "reward_std": 0.5970480442047119, | |
| "rewards/": 6.503348350524902, | |
| "rewards/math_compute_score": 0.3214285969734192, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1728.857177734375, | |
| "epoch": 0.02857142857142857, | |
| "grad_norm": 0.37089633220316565, | |
| "kl": 0.00034332275390625, | |
| "learning_rate": 9e-07, | |
| "loss": 0.087, | |
| "reward": 1.2353515625, | |
| "reward_std": 0.7409225702285767, | |
| "rewards/": 5.891043663024902, | |
| "rewards/math_compute_score": 0.0714285746216774, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.031746031746031744, | |
| "grad_norm": 0.31106869753963556, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0123, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.031746031746031744, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1893.8313802083333, | |
| "eval_kl": 0.0003102620442708333, | |
| "eval_loss": 0.026542577892541885, | |
| "eval_reward": 1.1524926622708638, | |
| "eval_reward_std": 0.7845939000447592, | |
| "eval_rewards/": 6.024367809295654, | |
| "eval_rewards/math_compute_score": -0.06547619650761287, | |
| "eval_runtime": 142.6603, | |
| "eval_samples_per_second": 0.147, | |
| "eval_steps_per_second": 0.007, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1901.52685546875, | |
| "epoch": 0.03492063492063492, | |
| "grad_norm": 0.4187176065388408, | |
| "kl": 0.000331878662109375, | |
| "learning_rate": 9.99973476170006e-07, | |
| "loss": 0.052, | |
| "reward": 1.0824219584465027, | |
| "reward_std": 0.5579104721546173, | |
| "rewards/": 6.197824001312256, | |
| "rewards/math_compute_score": -0.1964285857975483, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1792.232177734375, | |
| "epoch": 0.0380952380952381, | |
| "grad_norm": 0.3897812351982433, | |
| "kl": 0.000331878662109375, | |
| "learning_rate": 9.998939074940787e-07, | |
| "loss": 0.0559, | |
| "reward": 1.3557896614074707, | |
| "reward_std": 0.6514952778816223, | |
| "rewards/": 5.921805381774902, | |
| "rewards/math_compute_score": 0.2142857313156128, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1953.571533203125, | |
| "epoch": 0.04126984126984127, | |
| "grad_norm": 0.3386069686056558, | |
| "kl": 0.000308990478515625, | |
| "learning_rate": 9.997613024140818e-07, | |
| "loss": 0.0464, | |
| "reward": 1.1771763563156128, | |
| "reward_std": 0.8344842195510864, | |
| "rewards/": 6.528738975524902, | |
| "rewards/math_compute_score": -0.1607142984867096, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1966.3037109375, | |
| "epoch": 0.044444444444444446, | |
| "grad_norm": 0.3436190087829617, | |
| "kl": 0.0003261566162109375, | |
| "learning_rate": 9.995756749987941e-07, | |
| "loss": 0.0058, | |
| "reward": 1.0176271200180054, | |
| "reward_std": 0.6552860736846924, | |
| "rewards/": 6.373849391937256, | |
| "rewards/math_compute_score": -0.3214285969734192, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1922.58935546875, | |
| "epoch": 0.047619047619047616, | |
| "grad_norm": 0.3312111219541393, | |
| "kl": 0.0002460479736328125, | |
| "learning_rate": 9.993370449424152e-07, | |
| "loss": 0.0395, | |
| "reward": 1.332235336303711, | |
| "reward_std": 0.7883400321006775, | |
| "rewards/": 6.089747905731201, | |
| "rewards/math_compute_score": 0.1428571492433548, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1920.607177734375, | |
| "epoch": 0.050793650793650794, | |
| "grad_norm": 0.3660345032806764, | |
| "kl": 0.0003185272216796875, | |
| "learning_rate": 9.990454375624776e-07, | |
| "loss": 0.0429, | |
| "reward": 0.9837054014205933, | |
| "reward_std": 0.56076979637146, | |
| "rewards/": 6.489955425262451, | |
| "rewards/math_compute_score": -0.392857164144516, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1781.5537109375, | |
| "epoch": 0.05396825396825397, | |
| "grad_norm": 0.3159968435422686, | |
| "kl": 0.00029754638671875, | |
| "learning_rate": 9.987008837971594e-07, | |
| "loss": 0.0258, | |
| "reward": 1.4613840579986572, | |
| "reward_std": 0.5660971403121948, | |
| "rewards/": 5.878348350524902, | |
| "rewards/math_compute_score": 0.3571428656578064, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1722.08935546875, | |
| "epoch": 0.05714285714285714, | |
| "grad_norm": 0.3769499591892152, | |
| "kl": 0.0002956390380859375, | |
| "learning_rate": 9.98303420202003e-07, | |
| "loss": 0.0515, | |
| "reward": 1.5921318531036377, | |
| "reward_std": 0.657545268535614, | |
| "rewards/": 6.103516101837158, | |
| "rewards/math_compute_score": 0.4642857313156128, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1859.8751220703125, | |
| "epoch": 0.06031746031746032, | |
| "grad_norm": 0.37701505516549555, | |
| "kl": 0.000335693359375, | |
| "learning_rate": 9.978530889460349e-07, | |
| "loss": 0.0532, | |
| "reward": 1.2727400064468384, | |
| "reward_std": 0.6805964708328247, | |
| "rewards/": 6.435128688812256, | |
| "rewards/math_compute_score": -0.01785714365541935, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.06349206349206349, | |
| "grad_norm": 0.3560929502812583, | |
| "learning_rate": 9.973499378072946e-07, | |
| "loss": 0.0505, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.06349206349206349, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1902.3988444010417, | |
| "eval_kl": 0.0003344217936197917, | |
| "eval_loss": 0.03622707724571228, | |
| "eval_reward": 1.2230852444966633, | |
| "eval_reward_std": 0.7650324503580729, | |
| "eval_rewards/": 6.020188331604004, | |
| "eval_rewards/math_compute_score": 0.023809528599182766, | |
| "eval_runtime": 140.9236, | |
| "eval_samples_per_second": 0.149, | |
| "eval_steps_per_second": 0.007, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1718.2589721679688, | |
| "epoch": 0.06666666666666667, | |
| "grad_norm": 0.3375758039734621, | |
| "kl": 0.000278472900390625, | |
| "learning_rate": 9.967940201677625e-07, | |
| "loss": 0.0309, | |
| "reward": 1.532582402229309, | |
| "reward_std": 0.5138258934020996, | |
| "rewards/": 6.23434042930603, | |
| "rewards/math_compute_score": 0.3571428805589676, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1812.821533203125, | |
| "epoch": 0.06984126984126984, | |
| "grad_norm": 0.3990303343369039, | |
| "kl": 0.0004177093505859375, | |
| "learning_rate": 9.96185395007699e-07, | |
| "loss": 0.0801, | |
| "reward": 1.3509488105773926, | |
| "reward_std": 0.9690964818000793, | |
| "rewards/": 6.040457725524902, | |
| "rewards/math_compute_score": 0.1785714328289032, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1906.821533203125, | |
| "epoch": 0.07301587301587302, | |
| "grad_norm": 0.3549321131443626, | |
| "kl": 0.00028228759765625, | |
| "learning_rate": 9.95524126899385e-07, | |
| "loss": -0.0105, | |
| "reward": 1.0774554014205933, | |
| "reward_std": 0.8010032176971436, | |
| "rewards/": 6.244420051574707, | |
| "rewards/math_compute_score": -0.2142857313156128, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1859.821533203125, | |
| "epoch": 0.0761904761904762, | |
| "grad_norm": 0.400791590884524, | |
| "kl": 0.0003490447998046875, | |
| "learning_rate": 9.94810286000272e-07, | |
| "loss": 0.0269, | |
| "reward": 1.2509558200836182, | |
| "reward_std": 0.6348705291748047, | |
| "rewards/": 5.683350086212158, | |
| "rewards/math_compute_score": 0.1428571492433548, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1910.6787109375, | |
| "epoch": 0.07936507936507936, | |
| "grad_norm": 0.32248521687421966, | |
| "kl": 0.0003147125244140625, | |
| "learning_rate": 9.940439480455385e-07, | |
| "loss": 0.0147, | |
| "reward": 1.06690514087677, | |
| "reward_std": 0.6229907870292664, | |
| "rewards/": 6.048810958862305, | |
| "rewards/math_compute_score": -0.1785714328289032, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1673.5357666015625, | |
| "epoch": 0.08253968253968254, | |
| "grad_norm": 0.4098565397161361, | |
| "kl": 0.0003509521484375, | |
| "learning_rate": 9.932251943400553e-07, | |
| "loss": 0.0784, | |
| "reward": 1.8423550128936768, | |
| "reward_std": 0.5337907075881958, | |
| "rewards/": 6.497488975524902, | |
| "rewards/math_compute_score": 0.6785714626312256, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1848.6429443359375, | |
| "epoch": 0.08571428571428572, | |
| "grad_norm": 0.3413878426395489, | |
| "kl": 0.000400543212890625, | |
| "learning_rate": 9.923541117497585e-07, | |
| "loss": -0.0184, | |
| "reward": 1.2410855293273926, | |
| "reward_std": 0.5728386044502258, | |
| "rewards/": 6.205427169799805, | |
| "rewards/math_compute_score": 0.0, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1833.607177734375, | |
| "epoch": 0.08888888888888889, | |
| "grad_norm": 0.398653320294794, | |
| "kl": 0.0004863739013671875, | |
| "learning_rate": 9.914307926924344e-07, | |
| "loss": 0.0011, | |
| "reward": 1.2547712326049805, | |
| "reward_std": 0.5897310376167297, | |
| "rewards/": 6.416713237762451, | |
| "rewards/math_compute_score": -0.0357142873108387, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1974.982177734375, | |
| "epoch": 0.09206349206349207, | |
| "grad_norm": 0.34060151455859505, | |
| "kl": 0.00042724609375, | |
| "learning_rate": 9.904553351279136e-07, | |
| "loss": 0.0252, | |
| "reward": 0.9052909016609192, | |
| "reward_std": 0.7842908501625061, | |
| "rewards/": 5.812169075012207, | |
| "rewards/math_compute_score": -0.3214285969734192, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.09523809523809523, | |
| "grad_norm": 0.3555389006533955, | |
| "learning_rate": 9.894278425476788e-07, | |
| "loss": 0.0422, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.09523809523809523, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1847.3909505208333, | |
| "eval_kl": 0.0004094441731770833, | |
| "eval_loss": 0.027497123926877975, | |
| "eval_reward": 1.240986665089925, | |
| "eval_reward_std": 0.79307621717453, | |
| "eval_rewards/": 5.8715996742248535, | |
| "eval_rewards/math_compute_score": 0.0833333432674408, | |
| "eval_runtime": 139.8981, | |
| "eval_samples_per_second": 0.15, | |
| "eval_steps_per_second": 0.007, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1868.71435546875, | |
| "epoch": 0.09841269841269841, | |
| "grad_norm": 0.3260634697668482, | |
| "kl": 0.0004024505615234375, | |
| "learning_rate": 9.88348423963884e-07, | |
| "loss": 0.0114, | |
| "reward": 1.2039064168930054, | |
| "reward_std": 0.64045649766922, | |
| "rewards/": 6.305245876312256, | |
| "rewards/math_compute_score": -0.0714285783469677, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1790.857177734375, | |
| "epoch": 0.10158730158730159, | |
| "grad_norm": 0.3567249854105024, | |
| "kl": 0.00042724609375, | |
| "learning_rate": 9.872171938977893e-07, | |
| "loss": 0.0165, | |
| "reward": 1.3144984245300293, | |
| "reward_std": 0.8585119843482971, | |
| "rewards/": 5.858206748962402, | |
| "rewards/math_compute_score": 0.1785714328289032, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1813.7857666015625, | |
| "epoch": 0.10476190476190476, | |
| "grad_norm": 0.380215941310899, | |
| "kl": 0.0005035400390625, | |
| "learning_rate": 9.860342723676104e-07, | |
| "loss": 0.0179, | |
| "reward": 1.340485692024231, | |
| "reward_std": 0.7554614543914795, | |
| "rewards/": 5.773856163024902, | |
| "rewards/math_compute_score": 0.2321428656578064, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1642.2501220703125, | |
| "epoch": 0.10793650793650794, | |
| "grad_norm": 0.3669725090275098, | |
| "kl": 0.0003681182861328125, | |
| "learning_rate": 9.847997848757854e-07, | |
| "loss": 0.0739, | |
| "reward": 1.350502371788025, | |
| "reward_std": 0.4604892134666443, | |
| "rewards/": 5.681082725524902, | |
| "rewards/math_compute_score": 0.267857164144516, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1810.3751220703125, | |
| "epoch": 0.1111111111111111, | |
| "grad_norm": 0.3898398578219401, | |
| "kl": 0.00051116943359375, | |
| "learning_rate": 9.835138623956602e-07, | |
| "loss": -0.0012, | |
| "reward": 1.1720424890518188, | |
| "reward_std": 0.6071317195892334, | |
| "rewards/": 5.717355251312256, | |
| "rewards/math_compute_score": 0.0357142873108387, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1676.46435546875, | |
| "epoch": 0.11428571428571428, | |
| "grad_norm": 0.3415556235291945, | |
| "kl": 0.000461578369140625, | |
| "learning_rate": 9.821766413575914e-07, | |
| "loss": 0.0237, | |
| "reward": 1.5515068769454956, | |
| "reward_std": 0.41333118081092834, | |
| "rewards/": 6.043248176574707, | |
| "rewards/math_compute_score": 0.4285714626312256, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1821.0357666015625, | |
| "epoch": 0.11746031746031746, | |
| "grad_norm": 0.3841969882035549, | |
| "kl": 0.00049591064453125, | |
| "learning_rate": 9.80788263634473e-07, | |
| "loss": 0.0204, | |
| "reward": 1.4578125476837158, | |
| "reward_std": 0.6289081573486328, | |
| "rewards/": 6.574777126312256, | |
| "rewards/math_compute_score": 0.1785714328289032, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1789.196533203125, | |
| "epoch": 0.12063492063492064, | |
| "grad_norm": 0.4014410966440469, | |
| "kl": 0.000530242919921875, | |
| "learning_rate": 9.793488765266838e-07, | |
| "loss": 0.07, | |
| "reward": 1.5906460285186768, | |
| "reward_std": 0.8234072327613831, | |
| "rewards/": 6.0960869789123535, | |
| "rewards/math_compute_score": 0.4642857313156128, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1829.0537109375, | |
| "epoch": 0.12380952380952381, | |
| "grad_norm": 0.34941213890761874, | |
| "kl": 0.00049591064453125, | |
| "learning_rate": 9.778586327464597e-07, | |
| "loss": 0.0278, | |
| "reward": 1.5846540927886963, | |
| "reward_std": 0.6426271200180054, | |
| "rewards/": 6.637556076049805, | |
| "rewards/math_compute_score": 0.3214285969734192, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.12698412698412698, | |
| "grad_norm": 0.38140427899057316, | |
| "learning_rate": 9.763176904016913e-07, | |
| "loss": 0.0294, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.12698412698412698, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1791.4822184244792, | |
| "eval_kl": 0.0005544026692708334, | |
| "eval_loss": 0.05520148575305939, | |
| "eval_reward": 1.4818546374638875, | |
| "eval_reward_std": 0.7397708296775818, | |
| "eval_rewards/": 6.504511038462321, | |
| "eval_rewards/math_compute_score": 0.2261904776096344, | |
| "eval_runtime": 138.6839, | |
| "eval_samples_per_second": 0.151, | |
| "eval_steps_per_second": 0.007, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1758.419677734375, | |
| "epoch": 0.13015873015873017, | |
| "grad_norm": 0.38719549506679857, | |
| "kl": 0.0005283355712890625, | |
| "learning_rate": 9.747262129791495e-07, | |
| "loss": 0.0051, | |
| "reward": 1.2441372275352478, | |
| "reward_std": 0.5995893478393555, | |
| "rewards/": 6.0064003467559814, | |
| "rewards/math_compute_score": 0.0535714253783226, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1710.3035888671875, | |
| "epoch": 0.13333333333333333, | |
| "grad_norm": 0.3698050435119778, | |
| "kl": 0.000522613525390625, | |
| "learning_rate": 9.730843693271413e-07, | |
| "loss": 0.0453, | |
| "reward": 1.4925503730773926, | |
| "reward_std": 0.6253259181976318, | |
| "rewards/": 5.819894313812256, | |
| "rewards/math_compute_score": 0.4107142984867096, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1744.3929443359375, | |
| "epoch": 0.1365079365079365, | |
| "grad_norm": 0.3735801890902873, | |
| "kl": 0.0006866455078125, | |
| "learning_rate": 9.713923336375936e-07, | |
| "loss": 0.0102, | |
| "reward": 1.4331055879592896, | |
| "reward_std": 0.829824686050415, | |
| "rewards/": 5.951241970062256, | |
| "rewards/math_compute_score": 0.3035714328289032, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1797.232177734375, | |
| "epoch": 0.13968253968253969, | |
| "grad_norm": 0.35251183170280326, | |
| "kl": 0.00058746337890625, | |
| "learning_rate": 9.696502854275748e-07, | |
| "loss": 0.0273, | |
| "reward": 1.3867467641830444, | |
| "reward_std": 0.6368395686149597, | |
| "rewards/": 6.219447612762451, | |
| "rewards/math_compute_score": 0.1785714328289032, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1759.08935546875, | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 0.36571909135002856, | |
| "kl": 0.000553131103515625, | |
| "learning_rate": 9.678584095202469e-07, | |
| "loss": 0.0341, | |
| "reward": 0.931584894657135, | |
| "reward_std": 0.5959246754646301, | |
| "rewards/": 5.729352951049805, | |
| "rewards/math_compute_score": -0.267857164144516, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1736.357177734375, | |
| "epoch": 0.14603174603174604, | |
| "grad_norm": 0.33831757661050815, | |
| "kl": 0.000553131103515625, | |
| "learning_rate": 9.660168960252575e-07, | |
| "loss": 0.0437, | |
| "reward": 1.6665178537368774, | |
| "reward_std": 0.4598635137081146, | |
| "rewards/": 6.046875476837158, | |
| "rewards/math_compute_score": 0.5714285969734192, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1807.8037109375, | |
| "epoch": 0.1492063492063492, | |
| "grad_norm": 0.36813538905077847, | |
| "kl": 0.000637054443359375, | |
| "learning_rate": 9.641259403185704e-07, | |
| "loss": 0.031, | |
| "reward": 0.9589914083480835, | |
| "reward_std": 0.5812153816223145, | |
| "rewards/": 5.080671310424805, | |
| "rewards/math_compute_score": -0.0714285746216774, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1591.982177734375, | |
| "epoch": 0.1523809523809524, | |
| "grad_norm": 0.3231025820418364, | |
| "kl": 0.000698089599609375, | |
| "learning_rate": 9.621857430217365e-07, | |
| "loss": 0.0424, | |
| "reward": 1.8917970657348633, | |
| "reward_std": 0.3157893121242523, | |
| "rewards/": 6.744699001312256, | |
| "rewards/math_compute_score": 0.6785714626312256, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1881.982177734375, | |
| "epoch": 0.15555555555555556, | |
| "grad_norm": 0.3466507877034998, | |
| "kl": 0.0007781982421875, | |
| "learning_rate": 9.601965099806084e-07, | |
| "loss": 0.0405, | |
| "reward": 1.4976422786712646, | |
| "reward_std": 0.7847145199775696, | |
| "rewards/": 6.416783332824707, | |
| "rewards/math_compute_score": 0.267857164144516, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.15873015873015872, | |
| "grad_norm": 0.3331388102754494, | |
| "learning_rate": 9.581584522435023e-07, | |
| "loss": 0.0388, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.15873015873015872, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1816.2877604166667, | |
| "eval_kl": 0.0007731119791666666, | |
| "eval_loss": 0.04274662211537361, | |
| "eval_reward": 1.551771879196167, | |
| "eval_reward_std": 0.641852875550588, | |
| "eval_rewards/": 6.258859157562256, | |
| "eval_rewards/math_compute_score": 0.3750000149011612, | |
| "eval_runtime": 138.9269, | |
| "eval_samples_per_second": 0.151, | |
| "eval_steps_per_second": 0.007, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1660.357177734375, | |
| "epoch": 0.1619047619047619, | |
| "grad_norm": 0.41262546854699034, | |
| "kl": 0.0008182525634765625, | |
| "learning_rate": 9.56071786038806e-07, | |
| "loss": 0.0192, | |
| "reward": 1.6869142055511475, | |
| "reward_std": 0.39825020730495453, | |
| "rewards/": 6.577427625656128, | |
| "rewards/math_compute_score": 0.4642857313156128, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1935.571533203125, | |
| "epoch": 0.16507936507936508, | |
| "grad_norm": 0.34500490912598375, | |
| "kl": 0.000762939453125, | |
| "learning_rate": 9.53936732752038e-07, | |
| "loss": 0.0028, | |
| "reward": 1.2544364929199219, | |
| "reward_std": 0.6236512660980225, | |
| "rewards/": 6.557896614074707, | |
| "rewards/math_compute_score": -0.0714285746216774, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1827.482177734375, | |
| "epoch": 0.16825396825396827, | |
| "grad_norm": 0.34348526502845717, | |
| "kl": 0.000732421875, | |
| "learning_rate": 9.517535189023601e-07, | |
| "loss": -0.0054, | |
| "reward": 1.2513673305511475, | |
| "reward_std": 0.8205690979957581, | |
| "rewards/": 5.8282647132873535, | |
| "rewards/math_compute_score": 0.1071428656578064, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1753.4285888671875, | |
| "epoch": 0.17142857142857143, | |
| "grad_norm": 0.36933753325659874, | |
| "kl": 0.00099945068359375, | |
| "learning_rate": 9.495223761185441e-07, | |
| "loss": 0.0034, | |
| "reward": 1.4741246700286865, | |
| "reward_std": 0.6187431216239929, | |
| "rewards/": 6.513480186462402, | |
| "rewards/math_compute_score": 0.2142857313156128, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1771.8751220703125, | |
| "epoch": 0.1746031746031746, | |
| "grad_norm": 0.34576119098823954, | |
| "kl": 0.0009918212890625, | |
| "learning_rate": 9.472435411143977e-07, | |
| "loss": 0.0191, | |
| "reward": 1.5563616752624512, | |
| "reward_std": 0.6519899368286133, | |
| "rewards/": 6.781808376312256, | |
| "rewards/math_compute_score": 0.25, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1711.1785888671875, | |
| "epoch": 0.17777777777777778, | |
| "grad_norm": 0.3525345385926051, | |
| "kl": 0.000843048095703125, | |
| "learning_rate": 9.449172556636497e-07, | |
| "loss": -0.0206, | |
| "reward": 1.422028660774231, | |
| "reward_std": 0.8101202249526978, | |
| "rewards/": 6.110142707824707, | |
| "rewards/math_compute_score": 0.25, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1774.6251220703125, | |
| "epoch": 0.18095238095238095, | |
| "grad_norm": 0.40494567349730853, | |
| "kl": 0.000972747802734375, | |
| "learning_rate": 9.425437665742997e-07, | |
| "loss": 0.0519, | |
| "reward": 1.3338658809661865, | |
| "reward_std": 0.7232537865638733, | |
| "rewards/": 6.240757942199707, | |
| "rewards/math_compute_score": 0.1071428656578064, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1828.071533203125, | |
| "epoch": 0.18412698412698414, | |
| "grad_norm": 0.33450583156617786, | |
| "kl": 0.000881195068359375, | |
| "learning_rate": 9.401233256624316e-07, | |
| "loss": 0.0249, | |
| "reward": 1.4562500715255737, | |
| "reward_std": 0.6799939274787903, | |
| "rewards/": 6.566964626312256, | |
| "rewards/math_compute_score": 0.1785714328289032, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1660.696533203125, | |
| "epoch": 0.1873015873015873, | |
| "grad_norm": 0.4070810399603553, | |
| "kl": 0.0011444091796875, | |
| "learning_rate": 9.376561897254987e-07, | |
| "loss": -0.0269, | |
| "reward": 1.5412318706512451, | |
| "reward_std": 0.5850991606712341, | |
| "rewards/": 6.277588367462158, | |
| "rewards/math_compute_score": 0.3571428656578064, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.19047619047619047, | |
| "grad_norm": 0.37255871829826, | |
| "learning_rate": 9.351426205150776e-07, | |
| "loss": 0.038, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.19047619047619047, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1717.0694986979167, | |
| "eval_kl": 0.0009918212890625, | |
| "eval_loss": 0.03207956254482269, | |
| "eval_reward": 1.5469355980555217, | |
| "eval_reward_std": 0.6412561237812042, | |
| "eval_rewards/": 6.091820240020752, | |
| "eval_rewards/math_compute_score": 0.4107142984867096, | |
| "eval_runtime": 136.3487, | |
| "eval_samples_per_second": 0.154, | |
| "eval_steps_per_second": 0.007, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1612.7322387695312, | |
| "epoch": 0.19365079365079366, | |
| "grad_norm": 0.3484954653008926, | |
| "kl": 0.001018524169921875, | |
| "learning_rate": 9.32582884709098e-07, | |
| "loss": 0.0535, | |
| "reward": 1.569977879524231, | |
| "reward_std": 0.5602003335952759, | |
| "rewards/": 6.2784600257873535, | |
| "rewards/math_compute_score": 0.39285717345774174, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1474.1429443359375, | |
| "epoch": 0.19682539682539682, | |
| "grad_norm": 0.456250830578578, | |
| "kl": 0.0013275146484375, | |
| "learning_rate": 9.299772538835491e-07, | |
| "loss": -0.0324, | |
| "reward": 1.570549726486206, | |
| "reward_std": 0.3879093527793884, | |
| "rewards/": 6.138463020324707, | |
| "rewards/math_compute_score": 0.4285714626312256, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1645.482177734375, | |
| "epoch": 0.2, | |
| "grad_norm": 0.3717148388714456, | |
| "kl": 0.00115203857421875, | |
| "learning_rate": 9.273260044836673e-07, | |
| "loss": 0.0842, | |
| "reward": 1.741573691368103, | |
| "reward_std": 0.49386849999427795, | |
| "rewards/": 6.779297351837158, | |
| "rewards/math_compute_score": 0.4821428656578064, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1534.857177734375, | |
| "epoch": 0.20317460317460317, | |
| "grad_norm": 0.3745652306218341, | |
| "kl": 0.00112152099609375, | |
| "learning_rate": 9.246294177946062e-07, | |
| "loss": 0.0523, | |
| "reward": 1.82996666431427, | |
| "reward_std": 0.42854541540145874, | |
| "rewards/": 6.721261501312256, | |
| "rewards/math_compute_score": 0.6071428656578064, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1709.9285888671875, | |
| "epoch": 0.20634920634920634, | |
| "grad_norm": 0.4185811440041477, | |
| "kl": 0.00122833251953125, | |
| "learning_rate": 9.218877799115927e-07, | |
| "loss": 0.0595, | |
| "reward": 1.7054688930511475, | |
| "reward_std": 0.5991591215133667, | |
| "rewards/": 6.813058376312256, | |
| "rewards/math_compute_score": 0.4285714626312256, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1851.0357666015625, | |
| "epoch": 0.20952380952380953, | |
| "grad_norm": 0.34158790586957477, | |
| "kl": 0.0010986328125, | |
| "learning_rate": 9.191013817095761e-07, | |
| "loss": 0.0013, | |
| "reward": 1.466183066368103, | |
| "reward_std": 0.5523228049278259, | |
| "rewards/": 7.045201301574707, | |
| "rewards/math_compute_score": 0.0714285746216774, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1783.4287109375, | |
| "epoch": 0.2126984126984127, | |
| "grad_norm": 0.40084205903248576, | |
| "kl": 0.0013427734375, | |
| "learning_rate": 9.162705188123646e-07, | |
| "loss": 0.0218, | |
| "reward": 1.456040859222412, | |
| "reward_std": 0.5050444602966309, | |
| "rewards/": 6.994489669799805, | |
| "rewards/math_compute_score": 0.0714285746216774, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1766.2501220703125, | |
| "epoch": 0.21587301587301588, | |
| "grad_norm": 0.38102579204059783, | |
| "kl": 0.0012969970703125, | |
| "learning_rate": 9.133954915612634e-07, | |
| "loss": 0.087, | |
| "reward": 1.6629464626312256, | |
| "reward_std": 0.7155088782310486, | |
| "rewards/": 6.957589626312256, | |
| "rewards/math_compute_score": 0.3392857313156128, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1658.96435546875, | |
| "epoch": 0.21904761904761905, | |
| "grad_norm": 0.36394590935604426, | |
| "kl": 0.00115966796875, | |
| "learning_rate": 9.104766049832087e-07, | |
| "loss": 0.0179, | |
| "reward": 1.6870676279067993, | |
| "reward_std": 0.4471067190170288, | |
| "rewards/": 6.506766319274902, | |
| "rewards/math_compute_score": 0.4821428656578064, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.2222222222222222, | |
| "grad_norm": 0.3374654129906019, | |
| "learning_rate": 9.075141687584056e-07, | |
| "loss": 0.0178, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.2222222222222222, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1738.0059814453125, | |
| "eval_kl": 0.00133514404296875, | |
| "eval_loss": 0.05646166205406189, | |
| "eval_reward": 1.5958195527394612, | |
| "eval_reward_std": 0.6562197208404541, | |
| "eval_rewards/": 6.621954282124837, | |
| "eval_rewards/math_compute_score": 0.3392857213815053, | |
| "eval_runtime": 136.062, | |
| "eval_samples_per_second": 0.154, | |
| "eval_steps_per_second": 0.007, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1752.3304443359375, | |
| "epoch": 0.2253968253968254, | |
| "grad_norm": 0.38489393931988863, | |
| "kl": 0.001201629638671875, | |
| "learning_rate": 9.045084971874737e-07, | |
| "loss": -0.0611, | |
| "reward": 1.3448200225830078, | |
| "reward_std": 0.5091241598129272, | |
| "rewards/": 6.402670860290527, | |
| "rewards/math_compute_score": 0.0803571492433548, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1880.2501220703125, | |
| "epoch": 0.22857142857142856, | |
| "grad_norm": 0.39120717997716, | |
| "kl": 0.00131988525390625, | |
| "learning_rate": 9.014599091580998e-07, | |
| "loss": 0.048, | |
| "reward": 1.76941978931427, | |
| "reward_std": 0.5547811388969421, | |
| "rewards/": 7.2042412757873535, | |
| "rewards/math_compute_score": 0.4107142984867096, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1588.08935546875, | |
| "epoch": 0.23174603174603176, | |
| "grad_norm": 0.3304077909442911, | |
| "kl": 0.00144195556640625, | |
| "learning_rate": 8.983687281112064e-07, | |
| "loss": 0.0299, | |
| "reward": 2.067634105682373, | |
| "reward_std": 0.4065239131450653, | |
| "rewards/": 7.195312976837158, | |
| "rewards/math_compute_score": 0.785714328289032, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1834.7857666015625, | |
| "epoch": 0.23492063492063492, | |
| "grad_norm": 0.31261547252144517, | |
| "kl": 0.001434326171875, | |
| "learning_rate": 8.952352820066358e-07, | |
| "loss": 0.0338, | |
| "reward": 1.8736608028411865, | |
| "reward_std": 0.4773353934288025, | |
| "rewards/": 6.939732551574707, | |
| "rewards/math_compute_score": 0.6071428656578064, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1668.071533203125, | |
| "epoch": 0.23809523809523808, | |
| "grad_norm": 0.37467501438016265, | |
| "kl": 0.0016326904296875, | |
| "learning_rate": 8.920599032883552e-07, | |
| "loss": -0.0187, | |
| "reward": 1.3380582332611084, | |
| "reward_std": 0.505685031414032, | |
| "rewards/": 5.690290451049805, | |
| "rewards/math_compute_score": 0.25, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1687.107177734375, | |
| "epoch": 0.24126984126984127, | |
| "grad_norm": 0.3823169614619227, | |
| "kl": 0.00157928466796875, | |
| "learning_rate": 8.888429288491855e-07, | |
| "loss": 0.0274, | |
| "reward": 1.6549667119979858, | |
| "reward_std": 0.3212043046951294, | |
| "rewards/": 6.560547351837158, | |
| "rewards/math_compute_score": 0.4285714626312256, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1807.1787109375, | |
| "epoch": 0.24444444444444444, | |
| "grad_norm": 0.35610484187919744, | |
| "kl": 0.0015869140625, | |
| "learning_rate": 8.855846999950595e-07, | |
| "loss": 0.0261, | |
| "reward": 1.2699779272079468, | |
| "reward_std": 0.5984498858451843, | |
| "rewards/": 6.492745876312256, | |
| "rewards/math_compute_score": -0.0357142873108387, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1851.9107666015625, | |
| "epoch": 0.24761904761904763, | |
| "grad_norm": 0.2883614180110202, | |
| "kl": 0.00138092041015625, | |
| "learning_rate": 8.822855624088097e-07, | |
| "loss": -0.0185, | |
| "reward": 1.5325753688812256, | |
| "reward_std": 0.6921989321708679, | |
| "rewards/": 6.734305381774902, | |
| "rewards/math_compute_score": 0.2321428656578064, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1517.8751220703125, | |
| "epoch": 0.2507936507936508, | |
| "grad_norm": 0.3716530172552612, | |
| "kl": 0.00165557861328125, | |
| "learning_rate": 8.789458661134942e-07, | |
| "loss": 0.049, | |
| "reward": 1.7504465579986572, | |
| "reward_std": 0.39499327540397644, | |
| "rewards/": 6.466517925262451, | |
| "rewards/math_compute_score": 0.5714285969734192, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.25396825396825395, | |
| "grad_norm": 0.3590906991873432, | |
| "learning_rate": 8.755659654352599e-07, | |
| "loss": 0.0028, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.25396825396825395, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1681.073486328125, | |
| "eval_kl": 0.017834981282552082, | |
| "eval_loss": 0.015299047343432903, | |
| "eval_reward": 1.720870574315389, | |
| "eval_reward_std": 0.5228437781333923, | |
| "eval_rewards/": 6.556733926137288, | |
| "eval_rewards/math_compute_score": 0.5119047897557417, | |
| "eval_runtime": 135.3946, | |
| "eval_samples_per_second": 0.155, | |
| "eval_steps_per_second": 0.007, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1618.1160888671875, | |
| "epoch": 0.2571428571428571, | |
| "grad_norm": 0.3174887107646142, | |
| "kl": 0.00168609619140625, | |
| "learning_rate": 8.721462189657509e-07, | |
| "loss": 0.0154, | |
| "reward": 1.8046876192092896, | |
| "reward_std": 0.45894815027713776, | |
| "rewards/": 6.273437738418579, | |
| "rewards/math_compute_score": 0.6875000298023224, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1785.607177734375, | |
| "epoch": 0.26031746031746034, | |
| "grad_norm": 0.34079321817593966, | |
| "kl": 0.0016326904296875, | |
| "learning_rate": 8.686869895240631e-07, | |
| "loss": -0.01, | |
| "reward": 1.5418108701705933, | |
| "reward_std": 0.6146999597549438, | |
| "rewards/": 6.923340320587158, | |
| "rewards/math_compute_score": 0.196428582072258, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1692.9285888671875, | |
| "epoch": 0.2634920634920635, | |
| "grad_norm": 0.3374435642100782, | |
| "kl": 0.00174713134765625, | |
| "learning_rate": 8.651886441182508e-07, | |
| "loss": 0.027, | |
| "reward": 1.9006696939468384, | |
| "reward_std": 0.45941323041915894, | |
| "rewards/": 7.217634201049805, | |
| "rewards/math_compute_score": 0.5714285969734192, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1753.946533203125, | |
| "epoch": 0.26666666666666666, | |
| "grad_norm": 0.31783244878975037, | |
| "kl": 0.001495361328125, | |
| "learning_rate": 8.616515539063894e-07, | |
| "loss": 0.003, | |
| "reward": 1.582680106163025, | |
| "reward_std": 0.7250080704689026, | |
| "rewards/": 6.556257247924805, | |
| "rewards/math_compute_score": 0.3392857313156128, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1419.107177734375, | |
| "epoch": 0.2698412698412698, | |
| "grad_norm": 0.42796739682838303, | |
| "kl": 0.0025634765625, | |
| "learning_rate": 8.580760941571966e-07, | |
| "loss": 0.0542, | |
| "reward": 1.8185827732086182, | |
| "reward_std": 0.24573805928230286, | |
| "rewards/": 6.521484851837158, | |
| "rewards/math_compute_score": 0.6428571939468384, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1636.6607666015625, | |
| "epoch": 0.273015873015873, | |
| "grad_norm": 0.42716916595705023, | |
| "kl": 0.0019989013671875, | |
| "learning_rate": 8.544626442102187e-07, | |
| "loss": 0.0444, | |
| "reward": 1.9161133766174316, | |
| "reward_std": 0.5437954664230347, | |
| "rewards/": 7.009138107299805, | |
| "rewards/math_compute_score": 0.6428571939468384, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1772.196533203125, | |
| "epoch": 0.2761904761904762, | |
| "grad_norm": 0.36873824811015804, | |
| "kl": 0.00189208984375, | |
| "learning_rate": 8.508115874355839e-07, | |
| "loss": 0.059, | |
| "reward": 1.403194785118103, | |
| "reward_std": 0.5319852828979492, | |
| "rewards/": 6.873116970062256, | |
| "rewards/math_compute_score": 0.0357142873108387, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1486.107177734375, | |
| "epoch": 0.27936507936507937, | |
| "grad_norm": 0.4198714949042119, | |
| "kl": 0.00194549560546875, | |
| "learning_rate": 8.47123311193329e-07, | |
| "loss": 0.0626, | |
| "reward": 1.9519531726837158, | |
| "reward_std": 0.4156142473220825, | |
| "rewards/": 7.3311944007873535, | |
| "rewards/math_compute_score": 0.6071428656578064, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1490.982177734375, | |
| "epoch": 0.28253968253968254, | |
| "grad_norm": 0.3741322266459614, | |
| "kl": 0.0023956298828125, | |
| "learning_rate": 8.433982067923021e-07, | |
| "loss": 0.0168, | |
| "reward": 1.99573814868927, | |
| "reward_std": 0.4969152510166168, | |
| "rewards/": 6.978690147399902, | |
| "rewards/math_compute_score": 0.7500000596046448, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 0.32032292095086534, | |
| "learning_rate": 8.396366694486466e-07, | |
| "loss": 0.0544, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.2857142857142857, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1615.83935546875, | |
| "eval_kl": 0.002166748046875, | |
| "eval_loss": 0.04793115332722664, | |
| "eval_reward": 1.9063432614008586, | |
| "eval_reward_std": 0.47585757573445636, | |
| "eval_rewards/": 6.746001084645589, | |
| "eval_rewards/math_compute_score": 0.6964285969734192, | |
| "eval_runtime": 133.7778, | |
| "eval_samples_per_second": 0.157, | |
| "eval_steps_per_second": 0.007, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1566.7858276367188, | |
| "epoch": 0.28888888888888886, | |
| "grad_norm": 0.3482667132376765, | |
| "kl": 0.00211334228515625, | |
| "learning_rate": 8.358390982438705e-07, | |
| "loss": -0.0106, | |
| "reward": 1.6325893998146057, | |
| "reward_std": 0.42033930122852325, | |
| "rewards/": 6.4486610889434814, | |
| "rewards/math_compute_score": 0.4285714477300644, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1555.5535888671875, | |
| "epoch": 0.2920634920634921, | |
| "grad_norm": 0.3459245295737851, | |
| "kl": 0.00384521484375, | |
| "learning_rate": 8.320058960825058e-07, | |
| "loss": 0.0209, | |
| "reward": 1.5438895225524902, | |
| "reward_std": 0.4550023376941681, | |
| "rewards/": 6.4337334632873535, | |
| "rewards/math_compute_score": 0.3214285969734192, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1714.1607666015625, | |
| "epoch": 0.29523809523809524, | |
| "grad_norm": 0.42497165251755675, | |
| "kl": 0.0028839111328125, | |
| "learning_rate": 8.281374696493626e-07, | |
| "loss": 0.0165, | |
| "reward": 1.7839986085891724, | |
| "reward_std": 0.6411929130554199, | |
| "rewards/": 7.062849044799805, | |
| "rewards/math_compute_score": 0.4642857313156128, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1584.946533203125, | |
| "epoch": 0.2984126984126984, | |
| "grad_norm": 0.39574012867238795, | |
| "kl": 0.0023956298828125, | |
| "learning_rate": 8.242342293663809e-07, | |
| "loss": 0.0325, | |
| "reward": 1.5983260869979858, | |
| "reward_std": 0.4390745162963867, | |
| "rewards/": 6.991629600524902, | |
| "rewards/math_compute_score": 0.25, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1597.482177734375, | |
| "epoch": 0.30158730158730157, | |
| "grad_norm": 0.3834564701675289, | |
| "kl": 0.0025634765625, | |
| "learning_rate": 8.202965893490876e-07, | |
| "loss": -0.0186, | |
| "reward": 1.340318202972412, | |
| "reward_std": 0.5021482110023499, | |
| "rewards/": 5.915876388549805, | |
| "rewards/math_compute_score": 0.196428582072258, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1800.607177734375, | |
| "epoch": 0.3047619047619048, | |
| "grad_norm": 0.39206140103553405, | |
| "kl": 0.0027923583984375, | |
| "learning_rate": 8.163249673626602e-07, | |
| "loss": 0.0298, | |
| "reward": 1.2919502258300781, | |
| "reward_std": 0.6428667902946472, | |
| "rewards/": 6.031180381774902, | |
| "rewards/math_compute_score": 0.1071428656578064, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1651.8035888671875, | |
| "epoch": 0.30793650793650795, | |
| "grad_norm": 0.3826880063008121, | |
| "kl": 0.00238037109375, | |
| "learning_rate": 8.123197847776042e-07, | |
| "loss": 0.0461, | |
| "reward": 1.705224633216858, | |
| "reward_std": 0.5447785258293152, | |
| "rewards/": 6.668980598449707, | |
| "rewards/math_compute_score": 0.4642857313156128, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1901.321533203125, | |
| "epoch": 0.3111111111111111, | |
| "grad_norm": 0.3643029236288363, | |
| "kl": 0.0025177001953125, | |
| "learning_rate": 8.082814665250476e-07, | |
| "loss": 0.0243, | |
| "reward": 1.3659180402755737, | |
| "reward_std": 0.7389653921127319, | |
| "rewards/": 6.543875694274902, | |
| "rewards/math_compute_score": 0.0714285746216774, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1414.696533203125, | |
| "epoch": 0.3142857142857143, | |
| "grad_norm": 0.42740784739438414, | |
| "kl": 0.0030975341796875, | |
| "learning_rate": 8.042104410516575e-07, | |
| "loss": 0.0245, | |
| "reward": 1.5442662239074707, | |
| "reward_std": 0.5408600568771362, | |
| "rewards/": 6.292759895324707, | |
| "rewards/math_compute_score": 0.3571428656578064, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.31746031746031744, | |
| "grad_norm": 0.3367757080515649, | |
| "learning_rate": 8.001071402741842e-07, | |
| "loss": 0.007, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.31746031746031744, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1632.1806233723958, | |
| "eval_kl": 0.0024922688802083335, | |
| "eval_loss": 0.026421261951327324, | |
| "eval_reward": 1.9330463409423828, | |
| "eval_reward_std": 0.39154160519440967, | |
| "eval_rewards/": 7.0461835861206055, | |
| "eval_rewards/math_compute_score": 0.6547619154055914, | |
| "eval_runtime": 134.9073, | |
| "eval_samples_per_second": 0.156, | |
| "eval_steps_per_second": 0.007, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1608.5447387695312, | |
| "epoch": 0.32063492063492066, | |
| "grad_norm": 0.35681279315589565, | |
| "kl": 0.00269317626953125, | |
| "learning_rate": 7.959719995336363e-07, | |
| "loss": 0.029, | |
| "reward": 1.7852399349212646, | |
| "reward_std": 0.40309859812259674, | |
| "rewards/": 6.890485763549805, | |
| "rewards/math_compute_score": 0.508928582072258, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1488.3751220703125, | |
| "epoch": 0.3238095238095238, | |
| "grad_norm": 0.44950984490187945, | |
| "kl": 0.0035400390625, | |
| "learning_rate": 7.918054575490943e-07, | |
| "loss": 0.0435, | |
| "reward": 1.8601562976837158, | |
| "reward_std": 0.5413349270820618, | |
| "rewards/": 6.8722100257873535, | |
| "rewards/math_compute_score": 0.6071428656578064, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1592.2501220703125, | |
| "epoch": 0.326984126984127, | |
| "grad_norm": 0.3485861719916155, | |
| "kl": 0.002471923828125, | |
| "learning_rate": 7.876079563711631e-07, | |
| "loss": 0.0519, | |
| "reward": 1.5621094703674316, | |
| "reward_std": 0.5689576864242554, | |
| "rewards/": 6.524832725524902, | |
| "rewards/math_compute_score": 0.3214285969734192, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1584.821533203125, | |
| "epoch": 0.33015873015873015, | |
| "grad_norm": 0.42767837474542764, | |
| "kl": 0.00335693359375, | |
| "learning_rate": 7.83379941335073e-07, | |
| "loss": 0.093, | |
| "reward": 1.7258999347686768, | |
| "reward_std": 0.7036234140396118, | |
| "rewards/": 6.486642360687256, | |
| "rewards/math_compute_score": 0.535714328289032, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1589.08935546875, | |
| "epoch": 0.3333333333333333, | |
| "grad_norm": 0.32351467813897694, | |
| "kl": 0.002105712890625, | |
| "learning_rate": 7.791218610134322e-07, | |
| "loss": 0.0241, | |
| "reward": 1.7169644832611084, | |
| "reward_std": 0.38069403171539307, | |
| "rewards/": 6.656250476837158, | |
| "rewards/math_compute_score": 0.4821428656578064, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1518.446533203125, | |
| "epoch": 0.33650793650793653, | |
| "grad_norm": 0.39674608646311715, | |
| "kl": 0.00274658203125, | |
| "learning_rate": 7.748341671686354e-07, | |
| "loss": 0.0463, | |
| "reward": 1.785309910774231, | |
| "reward_std": 0.5290029048919678, | |
| "rewards/": 6.8551201820373535, | |
| "rewards/math_compute_score": 0.5178571939468384, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1639.982177734375, | |
| "epoch": 0.3396825396825397, | |
| "grad_norm": 0.34724586686391384, | |
| "kl": 0.0028839111328125, | |
| "learning_rate": 7.705173147049325e-07, | |
| "loss": 0.0139, | |
| "reward": 1.8501187562942505, | |
| "reward_std": 0.4504697918891907, | |
| "rewards/": 6.964879035949707, | |
| "rewards/math_compute_score": 0.5714285969734192, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1484.1785888671875, | |
| "epoch": 0.34285714285714286, | |
| "grad_norm": 0.3499020427456334, | |
| "kl": 0.0025787353515625, | |
| "learning_rate": 7.661717616201668e-07, | |
| "loss": -0.0305, | |
| "reward": 1.6777344942092896, | |
| "reward_std": 0.3427577316761017, | |
| "rewards/": 5.817243576049805, | |
| "rewards/math_compute_score": 0.6428571939468384, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1837.96435546875, | |
| "epoch": 0.346031746031746, | |
| "grad_norm": 0.4037551751682681, | |
| "kl": 0.0028228759765625, | |
| "learning_rate": 7.617979689571839e-07, | |
| "loss": 0.0442, | |
| "reward": 1.3643137216567993, | |
| "reward_std": 0.6223567724227905, | |
| "rewards/": 6.535853862762451, | |
| "rewards/math_compute_score": 0.0714285746216774, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.3492063492063492, | |
| "grad_norm": 0.4514262612365042, | |
| "learning_rate": 7.573964007549154e-07, | |
| "loss": -0.0428, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.3492063492063492, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1610.9127604166667, | |
| "eval_kl": 0.0030568440755208335, | |
| "eval_loss": 0.059969693422317505, | |
| "eval_reward": 1.8927596807479858, | |
| "eval_reward_std": 0.5094525118668874, | |
| "eval_rewards/": 6.86855951944987, | |
| "eval_rewards/math_compute_score": 0.6488095621267954, | |
| "eval_runtime": 139.4711, | |
| "eval_samples_per_second": 0.151, | |
| "eval_steps_per_second": 0.007, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1355.6964721679688, | |
| "epoch": 0.3523809523809524, | |
| "grad_norm": 0.416064136246473, | |
| "kl": 0.003692626953125, | |
| "learning_rate": 7.529675239991482e-07, | |
| "loss": 0.007, | |
| "reward": 2.1203389167785645, | |
| "reward_std": 0.34416940808296204, | |
| "rewards/": 7.280264854431152, | |
| "rewards/math_compute_score": 0.830357164144516, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1646.607177734375, | |
| "epoch": 0.35555555555555557, | |
| "grad_norm": 0.3080164040844727, | |
| "kl": 0.00250244140625, | |
| "learning_rate": 7.485118085729789e-07, | |
| "loss": 0.013, | |
| "reward": 1.5503767728805542, | |
| "reward_std": 0.4306219816207886, | |
| "rewards/": 6.180455207824707, | |
| "rewards/math_compute_score": 0.392857164144516, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1678.1785888671875, | |
| "epoch": 0.35873015873015873, | |
| "grad_norm": 0.3694201814855355, | |
| "kl": 0.00311279296875, | |
| "learning_rate": 7.440297272069614e-07, | |
| "loss": 0.0761, | |
| "reward": 2.034709930419922, | |
| "reward_std": 0.43929895758628845, | |
| "rewards/": 7.530692100524902, | |
| "rewards/math_compute_score": 0.660714328289032, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1546.7679443359375, | |
| "epoch": 0.3619047619047619, | |
| "grad_norm": 0.3908115750027717, | |
| "kl": 0.0034942626953125, | |
| "learning_rate": 7.395217554289523e-07, | |
| "loss": -0.011, | |
| "reward": 1.7967495918273926, | |
| "reward_std": 0.22745420038700104, | |
| "rewards/": 6.983747482299805, | |
| "rewards/math_compute_score": 0.5, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1633.9107666015625, | |
| "epoch": 0.36507936507936506, | |
| "grad_norm": 0.3199396618433185, | |
| "kl": 0.0025634765625, | |
| "learning_rate": 7.3498837151366e-07, | |
| "loss": 0.0194, | |
| "reward": 1.8163504600524902, | |
| "reward_std": 0.5509271621704102, | |
| "rewards/": 7.081752777099609, | |
| "rewards/math_compute_score": 0.5, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1633.83935546875, | |
| "epoch": 0.3682539682539683, | |
| "grad_norm": 0.3989797872601438, | |
| "kl": 0.002685546875, | |
| "learning_rate": 7.304300564319013e-07, | |
| "loss": 0.0215, | |
| "reward": 1.52039635181427, | |
| "reward_std": 0.4950607120990753, | |
| "rewards/": 6.744838237762451, | |
| "rewards/math_compute_score": 0.2142857313156128, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1765.0537109375, | |
| "epoch": 0.37142857142857144, | |
| "grad_norm": 0.33315626188294234, | |
| "kl": 0.003143310546875, | |
| "learning_rate": 7.258472937995735e-07, | |
| "loss": -0.0229, | |
| "reward": 1.795814871788025, | |
| "reward_std": 0.49838095903396606, | |
| "rewards/": 7.193359851837158, | |
| "rewards/math_compute_score": 0.4464285969734192, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1562.5535888671875, | |
| "epoch": 0.3746031746031746, | |
| "grad_norm": 0.3286142283747825, | |
| "kl": 0.0029754638671875, | |
| "learning_rate": 7.212405698263446e-07, | |
| "loss": 0.0002, | |
| "reward": 2.0463171005249023, | |
| "reward_std": 0.4698004722595215, | |
| "rewards/": 7.2315850257873535, | |
| "rewards/math_compute_score": 0.7500000596046448, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1634.5535888671875, | |
| "epoch": 0.37777777777777777, | |
| "grad_norm": 0.3834857277163386, | |
| "kl": 0.003143310546875, | |
| "learning_rate": 7.166103732640681e-07, | |
| "loss": 0.0034, | |
| "reward": 1.5640347003936768, | |
| "reward_std": 0.48661187291145325, | |
| "rewards/": 6.820173263549805, | |
| "rewards/math_compute_score": 0.25, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.38095238095238093, | |
| "grad_norm": 0.3930354077134069, | |
| "learning_rate": 7.119571953549304e-07, | |
| "loss": 0.0164, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.38095238095238093, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1561.2083740234375, | |
| "eval_kl": 0.0033162434895833335, | |
| "eval_loss": 0.034597255289554596, | |
| "eval_reward": 1.9965635935465496, | |
| "eval_reward_std": 0.49989163875579834, | |
| "eval_rewards/": 6.982817490895589, | |
| "eval_rewards/math_compute_score": 0.7500000397364298, | |
| "eval_runtime": 131.6643, | |
| "eval_samples_per_second": 0.159, | |
| "eval_steps_per_second": 0.008, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1541.071533203125, | |
| "epoch": 0.38412698412698415, | |
| "grad_norm": 0.3736113886490285, | |
| "kl": 0.00330352783203125, | |
| "learning_rate": 7.072815297793302e-07, | |
| "loss": -0.0227, | |
| "reward": 1.750962734222412, | |
| "reward_std": 0.44300225377082825, | |
| "rewards/": 6.719099044799805, | |
| "rewards/math_compute_score": 0.5089285969734192, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1791.482177734375, | |
| "epoch": 0.3873015873015873, | |
| "grad_norm": 0.36023685839775266, | |
| "kl": 0.00299072265625, | |
| "learning_rate": 7.025838726035031e-07, | |
| "loss": 0.0158, | |
| "reward": 1.2251116037368774, | |
| "reward_std": 0.5226312875747681, | |
| "rewards/": 6.4112725257873535, | |
| "rewards/math_compute_score": -0.0714285746216774, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1458.21435546875, | |
| "epoch": 0.3904761904761905, | |
| "grad_norm": 0.4040624897084209, | |
| "kl": 0.0032501220703125, | |
| "learning_rate": 6.978647222268903e-07, | |
| "loss": 0.0145, | |
| "reward": 1.9194753170013428, | |
| "reward_std": 0.5019935965538025, | |
| "rewards/": 7.025949001312256, | |
| "rewards/math_compute_score": 0.6428571939468384, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1518.232177734375, | |
| "epoch": 0.39365079365079364, | |
| "grad_norm": 0.3500940260968403, | |
| "kl": 0.0032958984375, | |
| "learning_rate": 6.93124579329261e-07, | |
| "loss": 0.0429, | |
| "reward": 1.8887277841567993, | |
| "reward_std": 0.3185231387615204, | |
| "rewards/": 7.157924652099609, | |
| "rewards/math_compute_score": 0.5714285969734192, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1352.5179443359375, | |
| "epoch": 0.3968253968253968, | |
| "grad_norm": 0.35908221717719274, | |
| "kl": 0.0038909912109375, | |
| "learning_rate": 6.883639468175925e-07, | |
| "loss": 0.0207, | |
| "reward": 2.0699777603149414, | |
| "reward_std": 0.2467387318611145, | |
| "rewards/": 6.992745876312256, | |
| "rewards/math_compute_score": 0.8392857313156128, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1712.21435546875, | |
| "epoch": 0.4, | |
| "grad_norm": 0.3624308105870553, | |
| "kl": 0.0030364990234375, | |
| "learning_rate": 6.835833297727147e-07, | |
| "loss": 0.0443, | |
| "reward": 1.5869420766830444, | |
| "reward_std": 0.39860856533050537, | |
| "rewards/": 6.506138801574707, | |
| "rewards/math_compute_score": 0.3571428656578064, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1478.2857666015625, | |
| "epoch": 0.4031746031746032, | |
| "grad_norm": 0.41421111807827343, | |
| "kl": 0.0032958984375, | |
| "learning_rate": 6.787832353957224e-07, | |
| "loss": 0.0465, | |
| "reward": 2.0360493659973145, | |
| "reward_std": 0.5292332172393799, | |
| "rewards/": 7.180245876312256, | |
| "rewards/math_compute_score": 0.7500000596046448, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1562.3751220703125, | |
| "epoch": 0.40634920634920635, | |
| "grad_norm": 0.30993254735116554, | |
| "kl": 0.003448486328125, | |
| "learning_rate": 6.739641729541644e-07, | |
| "loss": 0.0384, | |
| "reward": 1.7399276494979858, | |
| "reward_std": 0.3811955749988556, | |
| "rewards/": 6.699637413024902, | |
| "rewards/math_compute_score": 0.5, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1551.0535888671875, | |
| "epoch": 0.4095238095238095, | |
| "grad_norm": 0.3771726274865134, | |
| "kl": 0.00347900390625, | |
| "learning_rate": 6.691266537280127e-07, | |
| "loss": -0.0003, | |
| "reward": 1.817131757736206, | |
| "reward_std": 0.511991024017334, | |
| "rewards/": 6.7999444007873535, | |
| "rewards/math_compute_score": 0.5714285969734192, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.4126984126984127, | |
| "grad_norm": 0.35621529328169527, | |
| "learning_rate": 6.642711909554174e-07, | |
| "loss": 0.0192, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.4126984126984127, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1621.2857666015625, | |
| "eval_kl": 0.0035502115885416665, | |
| "eval_loss": 0.03674715757369995, | |
| "eval_reward": 1.8930153846740723, | |
| "eval_reward_std": 0.4688274661699931, | |
| "eval_rewards/": 6.774600346883138, | |
| "eval_rewards/math_compute_score": 0.6726190646489462, | |
| "eval_runtime": 133.1814, | |
| "eval_samples_per_second": 0.158, | |
| "eval_steps_per_second": 0.008, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1673.3839721679688, | |
| "epoch": 0.4158730158730159, | |
| "grad_norm": 0.4405464519743735, | |
| "kl": 0.0037384033203125, | |
| "learning_rate": 6.593982997782548e-07, | |
| "loss": 0.0243, | |
| "reward": 1.7884975671768188, | |
| "reward_std": 0.6518445014953613, | |
| "rewards/": 6.942487955093384, | |
| "rewards/math_compute_score": 0.5000000149011612, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1482.696533203125, | |
| "epoch": 0.41904761904761906, | |
| "grad_norm": 0.39457572146801523, | |
| "kl": 0.003662109375, | |
| "learning_rate": 6.545084971874736e-07, | |
| "loss": 0.0455, | |
| "reward": 1.9908483028411865, | |
| "reward_std": 0.49568915367126465, | |
| "rewards/": 6.9542412757873535, | |
| "rewards/math_compute_score": 0.7500000596046448, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1558.321533203125, | |
| "epoch": 0.4222222222222222, | |
| "grad_norm": 0.3723322635366814, | |
| "kl": 0.0032196044921875, | |
| "learning_rate": 6.496023019682446e-07, | |
| "loss": 0.0261, | |
| "reward": 2.116741180419922, | |
| "reward_std": 0.34433886408805847, | |
| "rewards/": 7.155134201049805, | |
| "rewards/math_compute_score": 0.8571429252624512, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1776.482177734375, | |
| "epoch": 0.4253968253968254, | |
| "grad_norm": 0.3342896457505393, | |
| "kl": 0.00286865234375, | |
| "learning_rate": 6.44680234644919e-07, | |
| "loss": 0.0427, | |
| "reward": 1.4701590538024902, | |
| "reward_std": 0.552353024482727, | |
| "rewards/": 6.636509895324707, | |
| "rewards/math_compute_score": 0.1785714328289032, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1594.1429443359375, | |
| "epoch": 0.42857142857142855, | |
| "grad_norm": 0.3548357521298244, | |
| "kl": 0.0038299560546875, | |
| "learning_rate": 6.397428174258047e-07, | |
| "loss": 0.0483, | |
| "reward": 2.0053014755249023, | |
| "reward_std": 0.3594396710395813, | |
| "rewards/": 7.455078601837158, | |
| "rewards/math_compute_score": 0.6428571939468384, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1353.71435546875, | |
| "epoch": 0.43174603174603177, | |
| "grad_norm": 0.4428436053114652, | |
| "kl": 0.004180908203125, | |
| "learning_rate": 6.347905741477612e-07, | |
| "loss": 0.0562, | |
| "reward": 1.9524275064468384, | |
| "reward_std": 0.42881521582603455, | |
| "rewards/": 7.1192803382873535, | |
| "rewards/math_compute_score": 0.660714328289032, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1373.732177734375, | |
| "epoch": 0.43492063492063493, | |
| "grad_norm": 0.3804164543335114, | |
| "kl": 0.004302978515625, | |
| "learning_rate": 6.298240302206241e-07, | |
| "loss": 0.0371, | |
| "reward": 2.0057549476623535, | |
| "reward_std": 0.3291456401348114, | |
| "rewards/": 7.028774261474609, | |
| "rewards/math_compute_score": 0.7500000596046448, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1614.732177734375, | |
| "epoch": 0.4380952380952381, | |
| "grad_norm": 0.3920737510841866, | |
| "kl": 0.004119873046875, | |
| "learning_rate": 6.2484371257146e-07, | |
| "loss": 0.031, | |
| "reward": 1.8299667835235596, | |
| "reward_std": 0.5113915205001831, | |
| "rewards/": 7.149832725524902, | |
| "rewards/math_compute_score": 0.5, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1481.08935546875, | |
| "epoch": 0.44126984126984126, | |
| "grad_norm": 0.38144253817590884, | |
| "kl": 0.00408935546875, | |
| "learning_rate": 6.198501495886638e-07, | |
| "loss": -0.0053, | |
| "reward": 1.7764790058135986, | |
| "reward_std": 0.40745845437049866, | |
| "rewards/": 6.739537239074707, | |
| "rewards/math_compute_score": 0.535714328289032, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.4444444444444444, | |
| "grad_norm": 0.40734808818587626, | |
| "learning_rate": 6.148438710658978e-07, | |
| "loss": 0.0635, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.4444444444444444, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1595.1012369791667, | |
| "eval_kl": 0.0038808186848958335, | |
| "eval_loss": 0.03390444815158844, | |
| "eval_reward": 1.973921298980713, | |
| "eval_reward_std": 0.47822797298431396, | |
| "eval_rewards/": 7.107701142628987, | |
| "eval_rewards/math_compute_score": 0.6904762188593546, | |
| "eval_runtime": 132.0027, | |
| "eval_samples_per_second": 0.159, | |
| "eval_steps_per_second": 0.008, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1492.65185546875, | |
| "epoch": 0.44761904761904764, | |
| "grad_norm": 0.4080068354053614, | |
| "kl": 0.0045166015625, | |
| "learning_rate": 6.098254081458838e-07, | |
| "loss": 0.0728, | |
| "reward": 1.9614050388336182, | |
| "reward_std": 0.5336401164531708, | |
| "rewards/": 7.09273886680603, | |
| "rewards/math_compute_score": 0.6785714626312256, | |
| "step": 141 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1850.3929443359375, | |
| "epoch": 0.4507936507936508, | |
| "grad_norm": 0.34401058222235625, | |
| "kl": 0.0030670166015625, | |
| "learning_rate": 6.047952932640512e-07, | |
| "loss": 0.0162, | |
| "reward": 1.3241490125656128, | |
| "reward_std": 0.568393886089325, | |
| "rewards/": 6.263602256774902, | |
| "rewards/math_compute_score": 0.0892857164144516, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1395.8035888671875, | |
| "epoch": 0.45396825396825397, | |
| "grad_norm": 0.3084325410352855, | |
| "kl": 0.0038909912109375, | |
| "learning_rate": 5.997540600920478e-07, | |
| "loss": 0.0174, | |
| "reward": 1.953850507736206, | |
| "reward_std": 0.4086017906665802, | |
| "rewards/": 6.769252300262451, | |
| "rewards/math_compute_score": 0.7500000596046448, | |
| "step": 143 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1625.21435546875, | |
| "epoch": 0.45714285714285713, | |
| "grad_norm": 0.41338154741382704, | |
| "kl": 0.0048828125, | |
| "learning_rate": 5.947022434811201e-07, | |
| "loss": 0.0367, | |
| "reward": 1.5787110328674316, | |
| "reward_std": 0.49698716402053833, | |
| "rewards/": 6.750697612762451, | |
| "rewards/math_compute_score": 0.2857142984867096, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1505.0179443359375, | |
| "epoch": 0.4603174603174603, | |
| "grad_norm": 0.38852067924976885, | |
| "kl": 0.00421142578125, | |
| "learning_rate": 5.896403794053678e-07, | |
| "loss": 0.0451, | |
| "reward": 1.9318640232086182, | |
| "reward_std": 0.5010179877281189, | |
| "rewards/": 6.945034027099609, | |
| "rewards/math_compute_score": 0.6785714626312256, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1491.3751220703125, | |
| "epoch": 0.4634920634920635, | |
| "grad_norm": 0.3492147620184295, | |
| "kl": 0.0042724609375, | |
| "learning_rate": 5.845690049048798e-07, | |
| "loss": 0.0706, | |
| "reward": 2.178906202316284, | |
| "reward_std": 0.457103431224823, | |
| "rewards/": 7.323102951049805, | |
| "rewards/math_compute_score": 0.8928571939468384, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1746.1607666015625, | |
| "epoch": 0.4666666666666667, | |
| "grad_norm": 0.44104615846617024, | |
| "kl": 0.005401611328125, | |
| "learning_rate": 5.794886580287564e-07, | |
| "loss": 0.0404, | |
| "reward": 1.4707032442092896, | |
| "reward_std": 0.8705157041549683, | |
| "rewards/": 6.4249444007873535, | |
| "rewards/math_compute_score": 0.2321428656578064, | |
| "step": 147 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1627.08935546875, | |
| "epoch": 0.46984126984126984, | |
| "grad_norm": 0.395676436764285, | |
| "kl": 0.003997802734375, | |
| "learning_rate": 5.743998777780251e-07, | |
| "loss": 0.057, | |
| "reward": 1.7963100671768188, | |
| "reward_std": 0.5916603207588196, | |
| "rewards/": 7.410121917724609, | |
| "rewards/math_compute_score": 0.392857164144516, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1585.08935546875, | |
| "epoch": 0.473015873015873, | |
| "grad_norm": 0.32362413396596257, | |
| "kl": 0.0037078857421875, | |
| "learning_rate": 5.693032040484547e-07, | |
| "loss": -0.0072, | |
| "reward": 1.691545844078064, | |
| "reward_std": 0.4373229146003723, | |
| "rewards/": 6.529157638549805, | |
| "rewards/math_compute_score": 0.4821428656578064, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.47619047619047616, | |
| "grad_norm": 0.38103977643395054, | |
| "learning_rate": 5.641991775732755e-07, | |
| "loss": 0.0482, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.47619047619047616, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1561.9564208984375, | |
| "eval_kl": 0.004628499348958333, | |
| "eval_loss": 0.023879073560237885, | |
| "eval_reward": 1.9818453788757324, | |
| "eval_reward_std": 0.42304734388987225, | |
| "eval_rewards/": 6.980655034383138, | |
| "eval_rewards/math_compute_score": 0.7321428954601288, | |
| "eval_runtime": 131.655, | |
| "eval_samples_per_second": 0.16, | |
| "eval_steps_per_second": 0.008, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1668.759033203125, | |
| "epoch": 0.4793650793650794, | |
| "grad_norm": 0.31698478601407637, | |
| "kl": 0.00412750244140625, | |
| "learning_rate": 5.590883398658094e-07, | |
| "loss": 0.0239, | |
| "reward": 1.6429409980773926, | |
| "reward_std": 0.5557140111923218, | |
| "rewards/": 6.857561826705933, | |
| "rewards/math_compute_score": 0.3392857313156128, | |
| "step": 151 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1623.08935546875, | |
| "epoch": 0.48253968253968255, | |
| "grad_norm": 0.3705285203587748, | |
| "kl": 0.00408935546875, | |
| "learning_rate": 5.539712331620185e-07, | |
| "loss": 0.0581, | |
| "reward": 1.4986224174499512, | |
| "reward_std": 0.43796294927597046, | |
| "rewards/": 6.7788262367248535, | |
| "rewards/math_compute_score": 0.1785714328289032, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1689.446533203125, | |
| "epoch": 0.4857142857142857, | |
| "grad_norm": 0.3861278436124767, | |
| "kl": 0.00445556640625, | |
| "learning_rate": 5.488484003629758e-07, | |
| "loss": 0.0305, | |
| "reward": 1.5913225412368774, | |
| "reward_std": 0.6033198833465576, | |
| "rewards/": 6.813755989074707, | |
| "rewards/math_compute_score": 0.2857142984867096, | |
| "step": 153 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1477.107177734375, | |
| "epoch": 0.4888888888888889, | |
| "grad_norm": 0.36475288664194044, | |
| "kl": 0.0047607421875, | |
| "learning_rate": 5.437203849772664e-07, | |
| "loss": 0.0404, | |
| "reward": 1.8446986675262451, | |
| "reward_std": 0.32607829570770264, | |
| "rewards/": 7.080636501312256, | |
| "rewards/math_compute_score": 0.535714328289032, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1420.3035888671875, | |
| "epoch": 0.49206349206349204, | |
| "grad_norm": 0.3896113098470113, | |
| "kl": 0.004180908203125, | |
| "learning_rate": 5.385877310633232e-07, | |
| "loss": -0.0126, | |
| "reward": 1.838978886604309, | |
| "reward_std": 0.4181511104106903, | |
| "rewards/": 6.694894313812256, | |
| "rewards/math_compute_score": 0.625, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1735.4107666015625, | |
| "epoch": 0.49523809523809526, | |
| "grad_norm": 0.366034646195721, | |
| "kl": 0.004364013671875, | |
| "learning_rate": 5.334509831717058e-07, | |
| "loss": 0.0191, | |
| "reward": 1.687611699104309, | |
| "reward_std": 0.5151606798171997, | |
| "rewards/": 7.2237725257873535, | |
| "rewards/math_compute_score": 0.3035714328289032, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1403.2857666015625, | |
| "epoch": 0.4984126984126984, | |
| "grad_norm": 0.3664836889631256, | |
| "kl": 0.005584716796875, | |
| "learning_rate": 5.283106862873252e-07, | |
| "loss": 0.0848, | |
| "reward": 2.293659210205078, | |
| "reward_std": 0.24820633232593536, | |
| "rewards/": 7.611154079437256, | |
| "rewards/math_compute_score": 0.9642857313156128, | |
| "step": 157 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1685.2501220703125, | |
| "epoch": 0.5015873015873016, | |
| "grad_norm": 0.3481512366201298, | |
| "kl": 0.004180908203125, | |
| "learning_rate": 5.231673857716243e-07, | |
| "loss": 0.041, | |
| "reward": 1.8583705425262451, | |
| "reward_std": 0.42498812079429626, | |
| "rewards/": 6.4347100257873535, | |
| "rewards/math_compute_score": 0.7142857313156128, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1417.6785888671875, | |
| "epoch": 0.5047619047619047, | |
| "grad_norm": 0.3977021065047324, | |
| "kl": 0.00439453125, | |
| "learning_rate": 5.18021627304717e-07, | |
| "loss": 0.0716, | |
| "reward": 1.9547433853149414, | |
| "reward_std": 0.4087882936000824, | |
| "rewards/": 7.273716926574707, | |
| "rewards/math_compute_score": 0.625, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.5079365079365079, | |
| "grad_norm": 0.3539481723030545, | |
| "learning_rate": 5.128739568274943e-07, | |
| "loss": 0.0753, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.5079365079365079, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1508.3075764973958, | |
| "eval_kl": 0.004781087239583333, | |
| "eval_loss": 0.010089995339512825, | |
| "eval_reward": 1.9843844572703044, | |
| "eval_reward_std": 0.4194992780685425, | |
| "eval_rewards/": 7.017159938812256, | |
| "eval_rewards/math_compute_score": 0.7261905074119568, | |
| "eval_runtime": 130.1634, | |
| "eval_samples_per_second": 0.161, | |
| "eval_steps_per_second": 0.008, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1747.4286499023438, | |
| "epoch": 0.5111111111111111, | |
| "grad_norm": 0.3330449401616063, | |
| "kl": 0.00437164306640625, | |
| "learning_rate": 5.077249204837025e-07, | |
| "loss": 0.019, | |
| "reward": 1.6631278991699219, | |
| "reward_std": 0.6911021769046783, | |
| "rewards/": 6.672781944274902, | |
| "rewards/math_compute_score": 0.4107142984867096, | |
| "step": 161 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1497.482177734375, | |
| "epoch": 0.5142857142857142, | |
| "grad_norm": 0.42213450988969753, | |
| "kl": 0.004486083984375, | |
| "learning_rate": 5.025750645620004e-07, | |
| "loss": 0.0252, | |
| "reward": 1.7366769313812256, | |
| "reward_std": 0.6020164489746094, | |
| "rewards/": 6.540527820587158, | |
| "rewards/math_compute_score": 0.535714328289032, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1306.2857666015625, | |
| "epoch": 0.5174603174603175, | |
| "grad_norm": 0.6578798978662977, | |
| "kl": 0.011474609375, | |
| "learning_rate": 4.974249354379996e-07, | |
| "loss": 0.05, | |
| "reward": 2.152064800262451, | |
| "reward_std": 0.43625935912132263, | |
| "rewards/": 7.474609851837158, | |
| "rewards/math_compute_score": 0.8214285969734192, | |
| "step": 163 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1578.5179443359375, | |
| "epoch": 0.5206349206349207, | |
| "grad_norm": 0.3630493679815709, | |
| "kl": 0.0047607421875, | |
| "learning_rate": 4.922750795162973e-07, | |
| "loss": 0.0061, | |
| "reward": 1.8181921243667603, | |
| "reward_std": 0.45378485321998596, | |
| "rewards/": 7.305245876312256, | |
| "rewards/math_compute_score": 0.4464285969734192, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1523.857177734375, | |
| "epoch": 0.5238095238095238, | |
| "grad_norm": 0.40781680936494136, | |
| "kl": 0.0057373046875, | |
| "learning_rate": 4.871260431725058e-07, | |
| "loss": 0.0301, | |
| "reward": 1.784919261932373, | |
| "reward_std": 0.5600239634513855, | |
| "rewards/": 6.710309982299805, | |
| "rewards/math_compute_score": 0.5535714626312256, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1653.607177734375, | |
| "epoch": 0.526984126984127, | |
| "grad_norm": 0.3627537206069918, | |
| "kl": 0.005035400390625, | |
| "learning_rate": 4.81978372695283e-07, | |
| "loss": 0.0296, | |
| "reward": 1.5985910892486572, | |
| "reward_std": 0.39210641384124756, | |
| "rewards/": 6.707240581512451, | |
| "rewards/math_compute_score": 0.3214285969734192, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1424.1785888671875, | |
| "epoch": 0.5301587301587302, | |
| "grad_norm": 0.3566511267927936, | |
| "kl": 0.00543212890625, | |
| "learning_rate": 4.768326142283756e-07, | |
| "loss": 0.0401, | |
| "reward": 1.9029020071029663, | |
| "reward_std": 0.253780335187912, | |
| "rewards/": 6.871652126312256, | |
| "rewards/math_compute_score": 0.660714328289032, | |
| "step": 167 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1728.1251220703125, | |
| "epoch": 0.5333333333333333, | |
| "grad_norm": 0.3668634566920126, | |
| "kl": 0.004486083984375, | |
| "learning_rate": 4.7168931371267473e-07, | |
| "loss": 0.0457, | |
| "reward": 1.5901787281036377, | |
| "reward_std": 0.5735574960708618, | |
| "rewards/": 6.950893402099609, | |
| "rewards/math_compute_score": 0.25, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1611.5357666015625, | |
| "epoch": 0.5365079365079365, | |
| "grad_norm": 0.3213785200743163, | |
| "kl": 0.00469970703125, | |
| "learning_rate": 4.665490168282943e-07, | |
| "loss": 0.0072, | |
| "reward": 1.8737167119979858, | |
| "reward_std": 0.6218880414962769, | |
| "rewards/": 6.940011501312256, | |
| "rewards/math_compute_score": 0.6071428656578064, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.5396825396825397, | |
| "grad_norm": 0.35976271648220715, | |
| "learning_rate": 4.614122689366768e-07, | |
| "loss": 0.0402, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.5396825396825397, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1549.6785888671875, | |
| "eval_kl": 0.004964192708333333, | |
| "eval_loss": 0.03550202399492264, | |
| "eval_reward": 1.9592356284459431, | |
| "eval_reward_std": 0.42070769270261127, | |
| "eval_rewards/": 6.962844530741374, | |
| "eval_rewards/math_compute_score": 0.7083333631356558, | |
| "eval_runtime": 131.0866, | |
| "eval_samples_per_second": 0.16, | |
| "eval_steps_per_second": 0.008, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1788.1697387695312, | |
| "epoch": 0.5428571428571428, | |
| "grad_norm": 0.30243975738537365, | |
| "kl": 0.00441741943359375, | |
| "learning_rate": 4.562796150227337e-07, | |
| "loss": 0.0388, | |
| "reward": 1.6102469563484192, | |
| "reward_std": 0.5664084255695343, | |
| "rewards/": 6.872663497924805, | |
| "rewards/math_compute_score": 0.2946428656578064, | |
| "step": 171 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1281.8929443359375, | |
| "epoch": 0.546031746031746, | |
| "grad_norm": 0.4239327955548188, | |
| "kl": 0.006072998046875, | |
| "learning_rate": 4.511515996370243e-07, | |
| "loss": -0.0323, | |
| "reward": 2.155747890472412, | |
| "reward_std": 0.38098111748695374, | |
| "rewards/": 7.064453601837158, | |
| "rewards/math_compute_score": 0.9285714626312256, | |
| "step": 172 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1426.08935546875, | |
| "epoch": 0.5492063492063493, | |
| "grad_norm": 0.3848409508825075, | |
| "kl": 0.004547119140625, | |
| "learning_rate": 4.460287668379814e-07, | |
| "loss": 0.0612, | |
| "reward": 1.9045759439468384, | |
| "reward_std": 0.19524678587913513, | |
| "rewards/": 6.522879600524902, | |
| "rewards/math_compute_score": 0.7500000596046448, | |
| "step": 173 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1506.8751220703125, | |
| "epoch": 0.5523809523809524, | |
| "grad_norm": 0.36448706461592817, | |
| "kl": 0.00469970703125, | |
| "learning_rate": 4.409116601341907e-07, | |
| "loss": -0.0125, | |
| "reward": 2.0860493183135986, | |
| "reward_std": 0.4216456711292267, | |
| "rewards/": 7.073102951049805, | |
| "rewards/math_compute_score": 0.8392857313156128, | |
| "step": 174 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1531.4285888671875, | |
| "epoch": 0.5555555555555556, | |
| "grad_norm": 0.4058679353297223, | |
| "kl": 0.005218505859375, | |
| "learning_rate": 4.3580082242672444e-07, | |
| "loss": 0.0513, | |
| "reward": 2.0486607551574707, | |
| "reward_std": 0.5620574951171875, | |
| "rewards/": 7.171875476837158, | |
| "rewards/math_compute_score": 0.7678571939468384, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1354.0535888671875, | |
| "epoch": 0.5587301587301587, | |
| "grad_norm": 0.41211128125944263, | |
| "kl": 0.006805419921875, | |
| "learning_rate": 4.3069679595154536e-07, | |
| "loss": 0.0461, | |
| "reward": 2.362277030944824, | |
| "reward_std": 0.20701220631599426, | |
| "rewards/": 7.811384201049805, | |
| "rewards/math_compute_score": 1.0, | |
| "step": 176 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1543.571533203125, | |
| "epoch": 0.5619047619047619, | |
| "grad_norm": 0.3763535929052217, | |
| "kl": 0.005279541015625, | |
| "learning_rate": 4.2560012222197506e-07, | |
| "loss": 0.0072, | |
| "reward": 1.9439733028411865, | |
| "reward_std": 0.26243603229522705, | |
| "rewards/": 6.791295051574707, | |
| "rewards/math_compute_score": 0.7321428656578064, | |
| "step": 177 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1764.21435546875, | |
| "epoch": 0.5650793650793651, | |
| "grad_norm": 0.38459117939596527, | |
| "kl": 0.0054931640625, | |
| "learning_rate": 4.205113419712435e-07, | |
| "loss": 0.0308, | |
| "reward": 1.413002371788025, | |
| "reward_std": 0.48099908232688904, | |
| "rewards/": 6.493582725524902, | |
| "rewards/math_compute_score": 0.1428571492433548, | |
| "step": 178 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1457.071533203125, | |
| "epoch": 0.5682539682539682, | |
| "grad_norm": 0.3648394332582025, | |
| "kl": 0.005157470703125, | |
| "learning_rate": 4.1543099509512023e-07, | |
| "loss": 0.0135, | |
| "reward": 2.1016740798950195, | |
| "reward_std": 0.4655519425868988, | |
| "rewards/": 7.508370876312256, | |
| "rewards/math_compute_score": 0.7500000596046448, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 0.4347832296603526, | |
| "learning_rate": 4.1035962059463224e-07, | |
| "loss": -0.0031, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1564.9504801432292, | |
| "eval_kl": 0.005472819010416667, | |
| "eval_loss": 0.04810946434736252, | |
| "eval_reward": 2.0953497886657715, | |
| "eval_reward_std": 0.3557452509800593, | |
| "eval_rewards/": 7.143415451049805, | |
| "eval_rewards/math_compute_score": 0.8333333532015482, | |
| "eval_runtime": 130.6017, | |
| "eval_samples_per_second": 0.161, | |
| "eval_steps_per_second": 0.008, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1442.7589721679688, | |
| "epoch": 0.5746031746031746, | |
| "grad_norm": 0.31604533911482663, | |
| "kl": 0.0054168701171875, | |
| "learning_rate": 4.052977565188799e-07, | |
| "loss": 0.0286, | |
| "reward": 1.9001396894454956, | |
| "reward_std": 0.35335803031921387, | |
| "rewards/": 6.929269313812256, | |
| "rewards/math_compute_score": 0.6428571492433548, | |
| "step": 181 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1675.7857666015625, | |
| "epoch": 0.5777777777777777, | |
| "grad_norm": 0.40418194066418917, | |
| "kl": 0.00537109375, | |
| "learning_rate": 4.0024593990795223e-07, | |
| "loss": 0.0231, | |
| "reward": 1.45106041431427, | |
| "reward_std": 0.7710850238800049, | |
| "rewards/": 6.826730251312256, | |
| "rewards/math_compute_score": 0.1071428656578064, | |
| "step": 182 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1623.696533203125, | |
| "epoch": 0.580952380952381, | |
| "grad_norm": 20.231824794338248, | |
| "kl": 1.4296875, | |
| "learning_rate": 3.952047067359487e-07, | |
| "loss": 0.1031, | |
| "reward": 1.6864677667617798, | |
| "reward_std": 0.3528647720813751, | |
| "rewards/": 6.860909938812256, | |
| "rewards/math_compute_score": 0.392857164144516, | |
| "step": 183 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1415.08935546875, | |
| "epoch": 0.5841269841269842, | |
| "grad_norm": 0.38374407884466366, | |
| "kl": 0.005889892578125, | |
| "learning_rate": 3.9017459185411614e-07, | |
| "loss": -0.0095, | |
| "reward": 1.9470704793930054, | |
| "reward_std": 0.3512076735496521, | |
| "rewards/": 6.878209114074707, | |
| "rewards/math_compute_score": 0.7142857313156128, | |
| "step": 184 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1437.857177734375, | |
| "epoch": 0.5873015873015873, | |
| "grad_norm": 0.3960042132238463, | |
| "kl": 0.004913330078125, | |
| "learning_rate": 3.8515612893410224e-07, | |
| "loss": 0.0284, | |
| "reward": 1.9784600734710693, | |
| "reward_std": 0.3158036768436432, | |
| "rewards/": 7.035156726837158, | |
| "rewards/math_compute_score": 0.7142857313156128, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1376.5535888671875, | |
| "epoch": 0.5904761904761905, | |
| "grad_norm": 0.4231910527525782, | |
| "kl": 0.005767822265625, | |
| "learning_rate": 3.8014985041133626e-07, | |
| "loss": 0.0177, | |
| "reward": 2.132868528366089, | |
| "reward_std": 0.47174835205078125, | |
| "rewards/": 7.450056076049805, | |
| "rewards/math_compute_score": 0.8035714626312256, | |
| "step": 186 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1735.0357666015625, | |
| "epoch": 0.5936507936507937, | |
| "grad_norm": 0.370413487882503, | |
| "kl": 0.004730224609375, | |
| "learning_rate": 3.7515628742853997e-07, | |
| "loss": 0.032, | |
| "reward": 1.568275809288025, | |
| "reward_std": 0.4366031587123871, | |
| "rewards/": 6.555664539337158, | |
| "rewards/math_compute_score": 0.3214285969734192, | |
| "step": 187 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1384.107177734375, | |
| "epoch": 0.5968253968253968, | |
| "grad_norm": 0.34069036045598083, | |
| "kl": 0.005096435546875, | |
| "learning_rate": 3.70175969779376e-07, | |
| "loss": -0.0015, | |
| "reward": 1.8890068531036377, | |
| "reward_std": 0.3061581552028656, | |
| "rewards/": 7.1593194007873535, | |
| "rewards/math_compute_score": 0.5714285969734192, | |
| "step": 188 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1711.58935546875, | |
| "epoch": 0.6, | |
| "grad_norm": 0.3411825559980518, | |
| "kl": 0.004913330078125, | |
| "learning_rate": 3.6520942585223866e-07, | |
| "loss": -0.0253, | |
| "reward": 1.6066406965255737, | |
| "reward_std": 0.5342792868614197, | |
| "rewards/": 6.533203601837158, | |
| "rewards/math_compute_score": 0.3750000298023224, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.6031746031746031, | |
| "grad_norm": 0.35592352794000803, | |
| "learning_rate": 3.602571825741953e-07, | |
| "loss": 0.0205, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.6031746031746031, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1554.7837727864583, | |
| "eval_kl": 0.005961100260416667, | |
| "eval_loss": -0.003977527376264334, | |
| "eval_reward": 1.9975679318110149, | |
| "eval_reward_std": 0.45795708894729614, | |
| "eval_rewards/": 6.940220673878987, | |
| "eval_rewards/math_compute_score": 0.761904795964559, | |
| "eval_runtime": 131.2541, | |
| "eval_samples_per_second": 0.16, | |
| "eval_steps_per_second": 0.008, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1356.5625610351562, | |
| "epoch": 0.6063492063492063, | |
| "grad_norm": 0.3596374444943596, | |
| "kl": 0.0055999755859375, | |
| "learning_rate": 3.55319765355081e-07, | |
| "loss": -0.0037, | |
| "reward": 2.0135952830314636, | |
| "reward_std": 0.33204740285873413, | |
| "rewards/": 6.925118923187256, | |
| "rewards/math_compute_score": 0.785714328289032, | |
| "step": 191 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1531.6251220703125, | |
| "epoch": 0.6095238095238096, | |
| "grad_norm": 0.3616323272673789, | |
| "kl": 0.006195068359375, | |
| "learning_rate": 3.503976980317554e-07, | |
| "loss": -0.0153, | |
| "reward": 2.023493528366089, | |
| "reward_std": 0.3276682496070862, | |
| "rewards/": 7.5460381507873535, | |
| "rewards/math_compute_score": 0.6428571939468384, | |
| "step": 192 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1549.982177734375, | |
| "epoch": 0.6126984126984127, | |
| "grad_norm": 0.3739633675188606, | |
| "kl": 0.0057373046875, | |
| "learning_rate": 3.454915028125263e-07, | |
| "loss": 0.0195, | |
| "reward": 1.88074791431427, | |
| "reward_std": 0.5293351411819458, | |
| "rewards/": 6.975167751312256, | |
| "rewards/math_compute_score": 0.6071428656578064, | |
| "step": 193 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1632.857177734375, | |
| "epoch": 0.6158730158730159, | |
| "grad_norm": 0.3844476214545633, | |
| "kl": 0.00555419921875, | |
| "learning_rate": 3.4060170022174517e-07, | |
| "loss": -0.0172, | |
| "reward": 1.9814037084579468, | |
| "reward_std": 0.6114475727081299, | |
| "rewards/": 7.192731857299805, | |
| "rewards/math_compute_score": 0.6785714626312256, | |
| "step": 194 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1197.08935546875, | |
| "epoch": 0.6190476190476191, | |
| "grad_norm": 0.3588871009271226, | |
| "kl": 0.005828857421875, | |
| "learning_rate": 3.357288090445826e-07, | |
| "loss": 0.0417, | |
| "reward": 2.2577009201049805, | |
| "reward_std": 0.24351288378238678, | |
| "rewards/": 7.431362152099609, | |
| "rewards/math_compute_score": 0.9642857313156128, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1516.5535888671875, | |
| "epoch": 0.6222222222222222, | |
| "grad_norm": 0.4221755143300646, | |
| "kl": 0.006378173828125, | |
| "learning_rate": 3.3087334627198727e-07, | |
| "loss": 0.0476, | |
| "reward": 2.0338730812072754, | |
| "reward_std": 0.45778489112854004, | |
| "rewards/": 7.455078601837158, | |
| "rewards/math_compute_score": 0.6785714626312256, | |
| "step": 196 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1591.9285888671875, | |
| "epoch": 0.6253968253968254, | |
| "grad_norm": 0.37451153018874117, | |
| "kl": 0.005859375, | |
| "learning_rate": 3.260358270458354e-07, | |
| "loss": 0.0074, | |
| "reward": 1.9349645376205444, | |
| "reward_std": 0.4895531237125397, | |
| "rewards/": 6.960536956787109, | |
| "rewards/math_compute_score": 0.6785714626312256, | |
| "step": 197 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1527.196533203125, | |
| "epoch": 0.6285714285714286, | |
| "grad_norm": 0.3302300732895513, | |
| "kl": 0.005035400390625, | |
| "learning_rate": 3.212167646042776e-07, | |
| "loss": -0.0122, | |
| "reward": 1.9998327493667603, | |
| "reward_std": 0.2854258418083191, | |
| "rewards/": 7.213449001312256, | |
| "rewards/math_compute_score": 0.6964285969734192, | |
| "step": 198 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1533.3751220703125, | |
| "epoch": 0.6317460317460317, | |
| "grad_norm": 0.36563930742752804, | |
| "kl": 0.005340576171875, | |
| "learning_rate": 3.164166702272855e-07, | |
| "loss": 0.0625, | |
| "reward": 2.099107265472412, | |
| "reward_std": 0.40774399042129517, | |
| "rewards/": 7.638393402099609, | |
| "rewards/math_compute_score": 0.7142857313156128, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.6349206349206349, | |
| "grad_norm": 0.3003969277270976, | |
| "learning_rate": 3.1163605318240736e-07, | |
| "loss": 0.0211, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.6349206349206349, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1513.3889567057292, | |
| "eval_kl": 0.0057373046875, | |
| "eval_loss": -0.0034506141673773527, | |
| "eval_reward": 2.0185548464457193, | |
| "eval_reward_std": 0.42673546075820923, | |
| "eval_rewards/": 7.140392780303955, | |
| "eval_rewards/math_compute_score": 0.7380952636400858, | |
| "eval_runtime": 131.2464, | |
| "eval_samples_per_second": 0.16, | |
| "eval_steps_per_second": 0.008, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1553.607177734375, | |
| "epoch": 0.638095238095238, | |
| "grad_norm": 0.4204253637756095, | |
| "kl": 0.00592041015625, | |
| "learning_rate": 3.0687542067073915e-07, | |
| "loss": 0.0522, | |
| "reward": 1.829387605190277, | |
| "reward_std": 0.4581097811460495, | |
| "rewards/": 6.932652473449707, | |
| "rewards/math_compute_score": 0.5535714626312256, | |
| "step": 201 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1458.4107666015625, | |
| "epoch": 0.6412698412698413, | |
| "grad_norm": 0.37293790377807984, | |
| "kl": 0.00726318359375, | |
| "learning_rate": 3.021352777731095e-07, | |
| "loss": 0.0798, | |
| "reward": 1.893429160118103, | |
| "reward_std": 0.7099537253379822, | |
| "rewards/": 7.252860069274902, | |
| "rewards/math_compute_score": 0.5535714626312256, | |
| "step": 202 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1727.446533203125, | |
| "epoch": 0.6444444444444445, | |
| "grad_norm": 0.364624475214311, | |
| "kl": 0.005828857421875, | |
| "learning_rate": 2.974161273964969e-07, | |
| "loss": 0.0132, | |
| "reward": 1.3616769313812256, | |
| "reward_std": 0.5690730214118958, | |
| "rewards/": 6.379813194274902, | |
| "rewards/math_compute_score": 0.1071428656578064, | |
| "step": 203 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1601.232177734375, | |
| "epoch": 0.6476190476190476, | |
| "grad_norm": 0.3723608933130564, | |
| "kl": 0.005279541015625, | |
| "learning_rate": 2.9271847022066987e-07, | |
| "loss": 0.0167, | |
| "reward": 1.7884488105773926, | |
| "reward_std": 0.3653627932071686, | |
| "rewards/": 7.227957725524902, | |
| "rewards/math_compute_score": 0.4285714626312256, | |
| "step": 204 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1664.3035888671875, | |
| "epoch": 0.6507936507936508, | |
| "grad_norm": 0.37220118357197624, | |
| "kl": 0.0057373046875, | |
| "learning_rate": 2.880428046450697e-07, | |
| "loss": 0.0547, | |
| "reward": 2.2032926082611084, | |
| "reward_std": 0.5352621674537659, | |
| "rewards/": 7.873605251312256, | |
| "rewards/math_compute_score": 0.785714328289032, | |
| "step": 205 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1527.446533203125, | |
| "epoch": 0.653968253968254, | |
| "grad_norm": 0.44669994879904146, | |
| "kl": 0.00592041015625, | |
| "learning_rate": 2.8338962673593194e-07, | |
| "loss": 0.0466, | |
| "reward": 1.9878350496292114, | |
| "reward_std": 0.36021357774734497, | |
| "rewards/": 7.224888801574707, | |
| "rewards/math_compute_score": 0.6785714626312256, | |
| "step": 206 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1485.3929443359375, | |
| "epoch": 0.6571428571428571, | |
| "grad_norm": 0.36237163203607514, | |
| "kl": 0.0067138671875, | |
| "learning_rate": 2.7875943017365556e-07, | |
| "loss": 0.0087, | |
| "reward": 2.0885045528411865, | |
| "reward_std": 0.23854570090770721, | |
| "rewards/": 7.585379600524902, | |
| "rewards/math_compute_score": 0.7142857313156128, | |
| "step": 207 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1390.21435546875, | |
| "epoch": 0.6603174603174603, | |
| "grad_norm": 0.43492788329993204, | |
| "kl": 0.006011962890625, | |
| "learning_rate": 2.7415270620042634e-07, | |
| "loss": 0.0368, | |
| "reward": 2.027120590209961, | |
| "reward_std": 0.32408618927001953, | |
| "rewards/": 7.135602951049805, | |
| "rewards/math_compute_score": 0.7500000596046448, | |
| "step": 208 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1614.232177734375, | |
| "epoch": 0.6634920634920635, | |
| "grad_norm": 0.4285860950638222, | |
| "kl": 0.005950927734375, | |
| "learning_rate": 2.695699435680986e-07, | |
| "loss": -0.0219, | |
| "reward": 1.80555260181427, | |
| "reward_std": 0.39699044823646545, | |
| "rewards/": 7.170619964599609, | |
| "rewards/math_compute_score": 0.4642857313156128, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 0.4073585516563059, | |
| "learning_rate": 2.6501162848634016e-07, | |
| "loss": 0.0635, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.6666666666666666, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1593.8790690104167, | |
| "eval_kl": 0.006266276041666667, | |
| "eval_loss": 0.010452189482748508, | |
| "eval_reward": 2.0239213705062866, | |
| "eval_reward_std": 0.4319620430469513, | |
| "eval_rewards/": 7.357701142628987, | |
| "eval_rewards/math_compute_score": 0.6904762089252472, | |
| "eval_runtime": 132.1993, | |
| "eval_samples_per_second": 0.159, | |
| "eval_steps_per_second": 0.008, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1548.5625610351562, | |
| "epoch": 0.6698412698412698, | |
| "grad_norm": 0.3896154656236107, | |
| "kl": 0.0064697265625, | |
| "learning_rate": 2.604782445710476e-07, | |
| "loss": 0.0685, | |
| "reward": 1.9150113463401794, | |
| "reward_std": 0.5420868694782257, | |
| "rewards/": 7.289341926574707, | |
| "rewards/math_compute_score": 0.5714285969734192, | |
| "step": 211 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1175.875, | |
| "epoch": 0.6730158730158731, | |
| "grad_norm": 0.3761166584078419, | |
| "kl": 0.00653076171875, | |
| "learning_rate": 2.559702727930386e-07, | |
| "loss": 0.0143, | |
| "reward": 2.3515625, | |
| "reward_std": 0.2300196886062622, | |
| "rewards/": 7.757812976837158, | |
| "rewards/math_compute_score": 1.0, | |
| "step": 212 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1592.0535888671875, | |
| "epoch": 0.6761904761904762, | |
| "grad_norm": 0.3547503060575964, | |
| "kl": 0.005767822265625, | |
| "learning_rate": 2.5148819142702095e-07, | |
| "loss": 0.0157, | |
| "reward": 2.0560269355773926, | |
| "reward_std": 0.4543492794036865, | |
| "rewards/": 7.565848350524902, | |
| "rewards/math_compute_score": 0.6785714626312256, | |
| "step": 213 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1390.7679443359375, | |
| "epoch": 0.6793650793650794, | |
| "grad_norm": 0.41727155544044037, | |
| "kl": 0.00811767578125, | |
| "learning_rate": 2.470324760008517e-07, | |
| "loss": 0.0064, | |
| "reward": 1.9091730117797852, | |
| "reward_std": 0.4644983112812042, | |
| "rewards/": 6.545863628387451, | |
| "rewards/math_compute_score": 0.7500000596046448, | |
| "step": 214 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1498.2857666015625, | |
| "epoch": 0.6825396825396826, | |
| "grad_norm": 0.38605442508491833, | |
| "kl": 0.006195068359375, | |
| "learning_rate": 2.426035992450848e-07, | |
| "loss": 0.0188, | |
| "reward": 1.966071605682373, | |
| "reward_std": 0.4697768986225128, | |
| "rewards/": 7.2589287757873535, | |
| "rewards/math_compute_score": 0.6428571939468384, | |
| "step": 215 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1406.232177734375, | |
| "epoch": 0.6857142857142857, | |
| "grad_norm": 0.3466649961934692, | |
| "kl": 0.004974365234375, | |
| "learning_rate": 2.382020310428161e-07, | |
| "loss": 0.0045, | |
| "reward": 2.04665207862854, | |
| "reward_std": 0.2766338586807251, | |
| "rewards/": 7.233259201049805, | |
| "rewards/math_compute_score": 0.7500000596046448, | |
| "step": 216 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1642.9107666015625, | |
| "epoch": 0.6888888888888889, | |
| "grad_norm": 0.3395883468462042, | |
| "kl": 0.00592041015625, | |
| "learning_rate": 2.3382823837983312e-07, | |
| "loss": 0.0283, | |
| "reward": 1.9188058376312256, | |
| "reward_std": 0.31119585037231445, | |
| "rewards/": 7.308315277099609, | |
| "rewards/math_compute_score": 0.5714285969734192, | |
| "step": 217 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1288.75, | |
| "epoch": 0.692063492063492, | |
| "grad_norm": 0.4169289379787409, | |
| "kl": 0.00665283203125, | |
| "learning_rate": 2.2948268529506765e-07, | |
| "loss": 0.0257, | |
| "reward": 2.0765626430511475, | |
| "reward_std": 0.4043300747871399, | |
| "rewards/": 7.811384201049805, | |
| "rewards/math_compute_score": 0.6428571939468384, | |
| "step": 218 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1746.0535888671875, | |
| "epoch": 0.6952380952380952, | |
| "grad_norm": 0.38975967722789145, | |
| "kl": 0.005126953125, | |
| "learning_rate": 2.251658328313647e-07, | |
| "loss": -0.002, | |
| "reward": 1.3670480251312256, | |
| "reward_std": 0.6043983697891235, | |
| "rewards/": 6.7638115882873535, | |
| "rewards/math_compute_score": 0.01785714365541935, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.6984126984126984, | |
| "grad_norm": 0.3971534991234478, | |
| "learning_rate": 2.208781389865677e-07, | |
| "loss": 0.0496, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.6984126984126984, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1515.7163492838542, | |
| "eval_kl": 0.006093343098958333, | |
| "eval_loss": 0.050898950546979904, | |
| "eval_reward": 1.9710845947265625, | |
| "eval_reward_std": 0.48362330595652264, | |
| "eval_rewards/": 7.117327372233073, | |
| "eval_rewards/math_compute_score": 0.6845238407452902, | |
| "eval_runtime": 130.2534, | |
| "eval_samples_per_second": 0.161, | |
| "eval_steps_per_second": 0.008, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1566.7857666015625, | |
| "epoch": 0.7015873015873015, | |
| "grad_norm": 0.32907894573216867, | |
| "kl": 0.0057525634765625, | |
| "learning_rate": 2.1662005866492715e-07, | |
| "loss": 0.0312, | |
| "reward": 1.7159180641174316, | |
| "reward_std": 0.4860241115093231, | |
| "rewards/": 7.329590320587158, | |
| "rewards/math_compute_score": 0.3125000149011612, | |
| "step": 221 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1622.3751220703125, | |
| "epoch": 0.7047619047619048, | |
| "grad_norm": 0.36878225894218164, | |
| "kl": 0.006378173828125, | |
| "learning_rate": 2.1239204362883695e-07, | |
| "loss": 0.0173, | |
| "reward": 1.4388115406036377, | |
| "reward_std": 0.48099273443222046, | |
| "rewards/": 6.4083428382873535, | |
| "rewards/math_compute_score": 0.196428582072258, | |
| "step": 222 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1494.3929443359375, | |
| "epoch": 0.707936507936508, | |
| "grad_norm": 0.38962068270348554, | |
| "kl": 0.006134033203125, | |
| "learning_rate": 2.0819454245090568e-07, | |
| "loss": 0.0288, | |
| "reward": 1.872544765472412, | |
| "reward_std": 0.4542202055454254, | |
| "rewards/": 6.934152126312256, | |
| "rewards/math_compute_score": 0.6071428656578064, | |
| "step": 223 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1317.2857666015625, | |
| "epoch": 0.7111111111111111, | |
| "grad_norm": 0.36894582377255875, | |
| "kl": 0.00543212890625, | |
| "learning_rate": 2.0402800046636364e-07, | |
| "loss": 0.0548, | |
| "reward": 2.053738832473755, | |
| "reward_std": 0.22289863228797913, | |
| "rewards/": 6.911551475524902, | |
| "rewards/math_compute_score": 0.8392857313156128, | |
| "step": 224 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1480.732177734375, | |
| "epoch": 0.7142857142857143, | |
| "grad_norm": 0.35221132675838623, | |
| "kl": 0.004974365234375, | |
| "learning_rate": 1.9989285972581593e-07, | |
| "loss": -0.0313, | |
| "reward": 1.808510184288025, | |
| "reward_std": 0.404258131980896, | |
| "rewards/": 6.471121788024902, | |
| "rewards/math_compute_score": 0.6428571939468384, | |
| "step": 225 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1309.25, | |
| "epoch": 0.7174603174603175, | |
| "grad_norm": 0.4356097059014855, | |
| "kl": 0.006500244140625, | |
| "learning_rate": 1.9578955894834258e-07, | |
| "loss": 0.0394, | |
| "reward": 2.175558090209961, | |
| "reward_std": 0.37214693427085876, | |
| "rewards/": 7.734933376312256, | |
| "rewards/math_compute_score": 0.785714328289032, | |
| "step": 226 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1685.0001220703125, | |
| "epoch": 0.7206349206349206, | |
| "grad_norm": 0.4352041974732675, | |
| "kl": 0.007537841796875, | |
| "learning_rate": 1.917185334749523e-07, | |
| "loss": 0.0324, | |
| "reward": 1.6248048543930054, | |
| "reward_std": 0.2427731454372406, | |
| "rewards/": 6.624023914337158, | |
| "rewards/math_compute_score": 0.3750000298023224, | |
| "step": 227 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1574.83935546875, | |
| "epoch": 0.7238095238095238, | |
| "grad_norm": 0.39645948514529894, | |
| "kl": 0.006988525390625, | |
| "learning_rate": 1.8768021522239574e-07, | |
| "loss": 0.0111, | |
| "reward": 1.8595423698425293, | |
| "reward_std": 0.20890839397907257, | |
| "rewards/": 7.297712326049805, | |
| "rewards/math_compute_score": 0.5, | |
| "step": 228 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1461.446533203125, | |
| "epoch": 0.726984126984127, | |
| "grad_norm": 0.43535613207045143, | |
| "kl": 0.006072998046875, | |
| "learning_rate": 1.836750326373398e-07, | |
| "loss": 0.024, | |
| "reward": 1.9434850215911865, | |
| "reward_std": 0.37709271907806396, | |
| "rewards/": 7.145996570587158, | |
| "rewards/math_compute_score": 0.6428571939468384, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.7301587301587301, | |
| "grad_norm": 0.346576656418996, | |
| "learning_rate": 1.7970341065091243e-07, | |
| "loss": -0.0127, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.7301587301587301, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1533.8690999348958, | |
| "eval_kl": 0.005940755208333333, | |
| "eval_loss": 0.012749183923006058, | |
| "eval_reward": 2.0542319615681968, | |
| "eval_reward_std": 0.41574782133102417, | |
| "eval_rewards/": 7.247349580128987, | |
| "eval_rewards/math_compute_score": 0.7559523979822794, | |
| "eval_runtime": 130.0387, | |
| "eval_samples_per_second": 0.161, | |
| "eval_steps_per_second": 0.008, | |
| "step": 230 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1560.6964721679688, | |
| "epoch": 0.7333333333333333, | |
| "grad_norm": 0.41442561377155895, | |
| "kl": 0.00677490234375, | |
| "learning_rate": 1.7576577063361918e-07, | |
| "loss": -0.0333, | |
| "reward": 1.7915493249893188, | |
| "reward_std": 0.3907308280467987, | |
| "rewards/": 6.957746505737305, | |
| "rewards/math_compute_score": 0.5000000298023224, | |
| "step": 231 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1500.7501220703125, | |
| "epoch": 0.7365079365079366, | |
| "grad_norm": 0.38386522257967753, | |
| "kl": 0.00628662109375, | |
| "learning_rate": 1.7186253035063736e-07, | |
| "loss": 0.0055, | |
| "reward": 1.9492467641830444, | |
| "reward_std": 0.5078251957893372, | |
| "rewards/": 7.031948089599609, | |
| "rewards/math_compute_score": 0.6785714626312256, | |
| "step": 232 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1712.83935546875, | |
| "epoch": 0.7396825396825397, | |
| "grad_norm": 0.3381282235296574, | |
| "kl": 0.005828857421875, | |
| "learning_rate": 1.6799410391749414e-07, | |
| "loss": 0.0371, | |
| "reward": 1.826283574104309, | |
| "reward_std": 0.623534083366394, | |
| "rewards/": 7.345703601837158, | |
| "rewards/math_compute_score": 0.4464285969734192, | |
| "step": 233 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1552.321533203125, | |
| "epoch": 0.7428571428571429, | |
| "grad_norm": 0.3949250198934789, | |
| "kl": 0.005645751953125, | |
| "learning_rate": 1.6416090175612958e-07, | |
| "loss": 0.0482, | |
| "reward": 1.816322684288025, | |
| "reward_std": 0.5752555131912231, | |
| "rewards/": 7.3673272132873535, | |
| "rewards/math_compute_score": 0.4285714626312256, | |
| "step": 234 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1474.71435546875, | |
| "epoch": 0.746031746031746, | |
| "grad_norm": 0.3997394557335043, | |
| "kl": 0.006072998046875, | |
| "learning_rate": 1.6036333055135344e-07, | |
| "loss": 0.0706, | |
| "reward": 2.0150113105773926, | |
| "reward_std": 0.37213414907455444, | |
| "rewards/": 7.360770225524902, | |
| "rewards/math_compute_score": 0.6785714626312256, | |
| "step": 235 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1357.3929443359375, | |
| "epoch": 0.7492063492063492, | |
| "grad_norm": 0.3581961494125316, | |
| "kl": 0.00592041015625, | |
| "learning_rate": 1.5660179320769788e-07, | |
| "loss": 0.0471, | |
| "reward": 2.19921875, | |
| "reward_std": 0.36445701122283936, | |
| "rewards/": 7.5675225257873535, | |
| "rewards/math_compute_score": 0.8571429252624512, | |
| "step": 236 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1471.83935546875, | |
| "epoch": 0.7523809523809524, | |
| "grad_norm": 0.37492809441966657, | |
| "kl": 0.006866455078125, | |
| "learning_rate": 1.5287668880667104e-07, | |
| "loss": 0.0772, | |
| "reward": 1.8618303537368774, | |
| "reward_std": 0.49302613735198975, | |
| "rewards/": 7.023437976837158, | |
| "rewards/math_compute_score": 0.5714285969734192, | |
| "step": 237 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1382.6785888671875, | |
| "epoch": 0.7555555555555555, | |
| "grad_norm": 0.38754661617692593, | |
| "kl": 0.00714111328125, | |
| "learning_rate": 1.49188412564416e-07, | |
| "loss": 0.0385, | |
| "reward": 1.967801570892334, | |
| "reward_std": 0.3383636176586151, | |
| "rewards/": 7.053292751312256, | |
| "rewards/math_compute_score": 0.6964285969734192, | |
| "step": 238 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1664.0535888671875, | |
| "epoch": 0.7587301587301587, | |
| "grad_norm": 0.4007176962544528, | |
| "kl": 0.0062255859375, | |
| "learning_rate": 1.455373557897814e-07, | |
| "loss": 0.0594, | |
| "reward": 1.5234934091567993, | |
| "reward_std": 0.551922619342804, | |
| "rewards/": 6.331752300262451, | |
| "rewards/math_compute_score": 0.3214285969734192, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.7619047619047619, | |
| "grad_norm": 0.3831506478541147, | |
| "learning_rate": 1.4192390584280344e-07, | |
| "loss": 0.0014, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.7619047619047619, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1517.2461344401042, | |
| "eval_kl": 0.005961100260416667, | |
| "eval_loss": 0.0475773885846138, | |
| "eval_reward": 1.965123454729716, | |
| "eval_reward_std": 0.47016530235608417, | |
| "eval_rewards/": 7.039902687072754, | |
| "eval_rewards/math_compute_score": 0.6964285969734192, | |
| "eval_runtime": 130.1244, | |
| "eval_samples_per_second": 0.161, | |
| "eval_steps_per_second": 0.008, | |
| "step": 240 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 315, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 80, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |