zd21's picture
Upload folder using huggingface_hub
a83c091 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.7619047619047619,
"eval_steps": 10,
"global_step": 240,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 1826.446533203125,
"epoch": 0.0031746031746031746,
"grad_norm": 0.34752090657228324,
"kl": 0.0,
"learning_rate": 1e-07,
"loss": -0.0327,
"reward": 1.2598215341567993,
"reward_std": 0.510444164276123,
"rewards/": 6.299107551574707,
"rewards/math_compute_score": 0.0,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 1842.607177734375,
"epoch": 0.006349206349206349,
"grad_norm": 0.37130067128404515,
"kl": 0.0,
"learning_rate": 2e-07,
"loss": 0.0544,
"reward": 1.4971821308135986,
"reward_std": 0.7506331205368042,
"rewards/": 5.914481163024902,
"rewards/math_compute_score": 0.392857164144516,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 2011.33935546875,
"epoch": 0.009523809523809525,
"grad_norm": 0.3200176373062262,
"kl": 0.0002689361572265625,
"learning_rate": 3e-07,
"loss": 0.0077,
"reward": 0.7667689919471741,
"reward_std": 0.6793785691261292,
"rewards/": 5.548130989074707,
"rewards/math_compute_score": -0.4285714626312256,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 1912.08935546875,
"epoch": 0.012698412698412698,
"grad_norm": 0.33904036931712517,
"kl": 0.000278472900390625,
"learning_rate": 4e-07,
"loss": 0.0411,
"reward": 1.14453125,
"reward_std": 0.7682722806930542,
"rewards/": 5.865513801574707,
"rewards/math_compute_score": -0.0357142873108387,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 1943.5357666015625,
"epoch": 0.015873015873015872,
"grad_norm": 0.3316962176535279,
"kl": 0.0002994537353515625,
"learning_rate": 5e-07,
"loss": 0.0439,
"reward": 0.9941790103912354,
"reward_std": 0.9224013090133667,
"rewards/": 5.899466514587402,
"rewards/math_compute_score": -0.2321428656578064,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 1823.5357666015625,
"epoch": 0.01904761904761905,
"grad_norm": 0.39521708946388423,
"kl": 0.0003261566162109375,
"learning_rate": 6e-07,
"loss": 0.0678,
"reward": 1.3013323545455933,
"reward_std": 0.7257120013237,
"rewards/": 6.363804817199707,
"rewards/math_compute_score": 0.0357142873108387,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 1816.696533203125,
"epoch": 0.022222222222222223,
"grad_norm": 0.3977474793000202,
"kl": 0.0002841949462890625,
"learning_rate": 7e-07,
"loss": 0.0524,
"reward": 0.7803781032562256,
"reward_std": 0.8279339075088501,
"rewards/": 4.901890754699707,
"rewards/math_compute_score": -0.25,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 1811.21435546875,
"epoch": 0.025396825396825397,
"grad_norm": 0.3883548683310925,
"kl": 0.000301361083984375,
"learning_rate": 8e-07,
"loss": 0.0334,
"reward": 1.5578125715255737,
"reward_std": 0.5970480442047119,
"rewards/": 6.503348350524902,
"rewards/math_compute_score": 0.3214285969734192,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 1728.857177734375,
"epoch": 0.02857142857142857,
"grad_norm": 0.37089633220316565,
"kl": 0.00034332275390625,
"learning_rate": 9e-07,
"loss": 0.087,
"reward": 1.2353515625,
"reward_std": 0.7409225702285767,
"rewards/": 5.891043663024902,
"rewards/math_compute_score": 0.0714285746216774,
"step": 9
},
{
"epoch": 0.031746031746031744,
"grad_norm": 0.31106869753963556,
"learning_rate": 1e-06,
"loss": 0.0123,
"step": 10
},
{
"epoch": 0.031746031746031744,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1893.8313802083333,
"eval_kl": 0.0003102620442708333,
"eval_loss": 0.026542577892541885,
"eval_reward": 1.1524926622708638,
"eval_reward_std": 0.7845939000447592,
"eval_rewards/": 6.024367809295654,
"eval_rewards/math_compute_score": -0.06547619650761287,
"eval_runtime": 142.6603,
"eval_samples_per_second": 0.147,
"eval_steps_per_second": 0.007,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 1901.52685546875,
"epoch": 0.03492063492063492,
"grad_norm": 0.4187176065388408,
"kl": 0.000331878662109375,
"learning_rate": 9.99973476170006e-07,
"loss": 0.052,
"reward": 1.0824219584465027,
"reward_std": 0.5579104721546173,
"rewards/": 6.197824001312256,
"rewards/math_compute_score": -0.1964285857975483,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 1792.232177734375,
"epoch": 0.0380952380952381,
"grad_norm": 0.3897812351982433,
"kl": 0.000331878662109375,
"learning_rate": 9.998939074940787e-07,
"loss": 0.0559,
"reward": 1.3557896614074707,
"reward_std": 0.6514952778816223,
"rewards/": 5.921805381774902,
"rewards/math_compute_score": 0.2142857313156128,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 1953.571533203125,
"epoch": 0.04126984126984127,
"grad_norm": 0.3386069686056558,
"kl": 0.000308990478515625,
"learning_rate": 9.997613024140818e-07,
"loss": 0.0464,
"reward": 1.1771763563156128,
"reward_std": 0.8344842195510864,
"rewards/": 6.528738975524902,
"rewards/math_compute_score": -0.1607142984867096,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 1966.3037109375,
"epoch": 0.044444444444444446,
"grad_norm": 0.3436190087829617,
"kl": 0.0003261566162109375,
"learning_rate": 9.995756749987941e-07,
"loss": 0.0058,
"reward": 1.0176271200180054,
"reward_std": 0.6552860736846924,
"rewards/": 6.373849391937256,
"rewards/math_compute_score": -0.3214285969734192,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 1922.58935546875,
"epoch": 0.047619047619047616,
"grad_norm": 0.3312111219541393,
"kl": 0.0002460479736328125,
"learning_rate": 9.993370449424152e-07,
"loss": 0.0395,
"reward": 1.332235336303711,
"reward_std": 0.7883400321006775,
"rewards/": 6.089747905731201,
"rewards/math_compute_score": 0.1428571492433548,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 1920.607177734375,
"epoch": 0.050793650793650794,
"grad_norm": 0.3660345032806764,
"kl": 0.0003185272216796875,
"learning_rate": 9.990454375624776e-07,
"loss": 0.0429,
"reward": 0.9837054014205933,
"reward_std": 0.56076979637146,
"rewards/": 6.489955425262451,
"rewards/math_compute_score": -0.392857164144516,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 1781.5537109375,
"epoch": 0.05396825396825397,
"grad_norm": 0.3159968435422686,
"kl": 0.00029754638671875,
"learning_rate": 9.987008837971594e-07,
"loss": 0.0258,
"reward": 1.4613840579986572,
"reward_std": 0.5660971403121948,
"rewards/": 5.878348350524902,
"rewards/math_compute_score": 0.3571428656578064,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 1722.08935546875,
"epoch": 0.05714285714285714,
"grad_norm": 0.3769499591892152,
"kl": 0.0002956390380859375,
"learning_rate": 9.98303420202003e-07,
"loss": 0.0515,
"reward": 1.5921318531036377,
"reward_std": 0.657545268535614,
"rewards/": 6.103516101837158,
"rewards/math_compute_score": 0.4642857313156128,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 1859.8751220703125,
"epoch": 0.06031746031746032,
"grad_norm": 0.37701505516549555,
"kl": 0.000335693359375,
"learning_rate": 9.978530889460349e-07,
"loss": 0.0532,
"reward": 1.2727400064468384,
"reward_std": 0.6805964708328247,
"rewards/": 6.435128688812256,
"rewards/math_compute_score": -0.01785714365541935,
"step": 19
},
{
"epoch": 0.06349206349206349,
"grad_norm": 0.3560929502812583,
"learning_rate": 9.973499378072946e-07,
"loss": 0.0505,
"step": 20
},
{
"epoch": 0.06349206349206349,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1902.3988444010417,
"eval_kl": 0.0003344217936197917,
"eval_loss": 0.03622707724571228,
"eval_reward": 1.2230852444966633,
"eval_reward_std": 0.7650324503580729,
"eval_rewards/": 6.020188331604004,
"eval_rewards/math_compute_score": 0.023809528599182766,
"eval_runtime": 140.9236,
"eval_samples_per_second": 0.149,
"eval_steps_per_second": 0.007,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 1718.2589721679688,
"epoch": 0.06666666666666667,
"grad_norm": 0.3375758039734621,
"kl": 0.000278472900390625,
"learning_rate": 9.967940201677625e-07,
"loss": 0.0309,
"reward": 1.532582402229309,
"reward_std": 0.5138258934020996,
"rewards/": 6.23434042930603,
"rewards/math_compute_score": 0.3571428805589676,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 1812.821533203125,
"epoch": 0.06984126984126984,
"grad_norm": 0.3990303343369039,
"kl": 0.0004177093505859375,
"learning_rate": 9.96185395007699e-07,
"loss": 0.0801,
"reward": 1.3509488105773926,
"reward_std": 0.9690964818000793,
"rewards/": 6.040457725524902,
"rewards/math_compute_score": 0.1785714328289032,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 1906.821533203125,
"epoch": 0.07301587301587302,
"grad_norm": 0.3549321131443626,
"kl": 0.00028228759765625,
"learning_rate": 9.95524126899385e-07,
"loss": -0.0105,
"reward": 1.0774554014205933,
"reward_std": 0.8010032176971436,
"rewards/": 6.244420051574707,
"rewards/math_compute_score": -0.2142857313156128,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 1859.821533203125,
"epoch": 0.0761904761904762,
"grad_norm": 0.400791590884524,
"kl": 0.0003490447998046875,
"learning_rate": 9.94810286000272e-07,
"loss": 0.0269,
"reward": 1.2509558200836182,
"reward_std": 0.6348705291748047,
"rewards/": 5.683350086212158,
"rewards/math_compute_score": 0.1428571492433548,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 1910.6787109375,
"epoch": 0.07936507936507936,
"grad_norm": 0.32248521687421966,
"kl": 0.0003147125244140625,
"learning_rate": 9.940439480455385e-07,
"loss": 0.0147,
"reward": 1.06690514087677,
"reward_std": 0.6229907870292664,
"rewards/": 6.048810958862305,
"rewards/math_compute_score": -0.1785714328289032,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 1673.5357666015625,
"epoch": 0.08253968253968254,
"grad_norm": 0.4098565397161361,
"kl": 0.0003509521484375,
"learning_rate": 9.932251943400553e-07,
"loss": 0.0784,
"reward": 1.8423550128936768,
"reward_std": 0.5337907075881958,
"rewards/": 6.497488975524902,
"rewards/math_compute_score": 0.6785714626312256,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 1848.6429443359375,
"epoch": 0.08571428571428572,
"grad_norm": 0.3413878426395489,
"kl": 0.000400543212890625,
"learning_rate": 9.923541117497585e-07,
"loss": -0.0184,
"reward": 1.2410855293273926,
"reward_std": 0.5728386044502258,
"rewards/": 6.205427169799805,
"rewards/math_compute_score": 0.0,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 1833.607177734375,
"epoch": 0.08888888888888889,
"grad_norm": 0.398653320294794,
"kl": 0.0004863739013671875,
"learning_rate": 9.914307926924344e-07,
"loss": 0.0011,
"reward": 1.2547712326049805,
"reward_std": 0.5897310376167297,
"rewards/": 6.416713237762451,
"rewards/math_compute_score": -0.0357142873108387,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 1974.982177734375,
"epoch": 0.09206349206349207,
"grad_norm": 0.34060151455859505,
"kl": 0.00042724609375,
"learning_rate": 9.904553351279136e-07,
"loss": 0.0252,
"reward": 0.9052909016609192,
"reward_std": 0.7842908501625061,
"rewards/": 5.812169075012207,
"rewards/math_compute_score": -0.3214285969734192,
"step": 29
},
{
"epoch": 0.09523809523809523,
"grad_norm": 0.3555389006533955,
"learning_rate": 9.894278425476788e-07,
"loss": 0.0422,
"step": 30
},
{
"epoch": 0.09523809523809523,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1847.3909505208333,
"eval_kl": 0.0004094441731770833,
"eval_loss": 0.027497123926877975,
"eval_reward": 1.240986665089925,
"eval_reward_std": 0.79307621717453,
"eval_rewards/": 5.8715996742248535,
"eval_rewards/math_compute_score": 0.0833333432674408,
"eval_runtime": 139.8981,
"eval_samples_per_second": 0.15,
"eval_steps_per_second": 0.007,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 1868.71435546875,
"epoch": 0.09841269841269841,
"grad_norm": 0.3260634697668482,
"kl": 0.0004024505615234375,
"learning_rate": 9.88348423963884e-07,
"loss": 0.0114,
"reward": 1.2039064168930054,
"reward_std": 0.64045649766922,
"rewards/": 6.305245876312256,
"rewards/math_compute_score": -0.0714285783469677,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 1790.857177734375,
"epoch": 0.10158730158730159,
"grad_norm": 0.3567249854105024,
"kl": 0.00042724609375,
"learning_rate": 9.872171938977893e-07,
"loss": 0.0165,
"reward": 1.3144984245300293,
"reward_std": 0.8585119843482971,
"rewards/": 5.858206748962402,
"rewards/math_compute_score": 0.1785714328289032,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 1813.7857666015625,
"epoch": 0.10476190476190476,
"grad_norm": 0.380215941310899,
"kl": 0.0005035400390625,
"learning_rate": 9.860342723676104e-07,
"loss": 0.0179,
"reward": 1.340485692024231,
"reward_std": 0.7554614543914795,
"rewards/": 5.773856163024902,
"rewards/math_compute_score": 0.2321428656578064,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 1642.2501220703125,
"epoch": 0.10793650793650794,
"grad_norm": 0.3669725090275098,
"kl": 0.0003681182861328125,
"learning_rate": 9.847997848757854e-07,
"loss": 0.0739,
"reward": 1.350502371788025,
"reward_std": 0.4604892134666443,
"rewards/": 5.681082725524902,
"rewards/math_compute_score": 0.267857164144516,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 1810.3751220703125,
"epoch": 0.1111111111111111,
"grad_norm": 0.3898398578219401,
"kl": 0.00051116943359375,
"learning_rate": 9.835138623956602e-07,
"loss": -0.0012,
"reward": 1.1720424890518188,
"reward_std": 0.6071317195892334,
"rewards/": 5.717355251312256,
"rewards/math_compute_score": 0.0357142873108387,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 1676.46435546875,
"epoch": 0.11428571428571428,
"grad_norm": 0.3415556235291945,
"kl": 0.000461578369140625,
"learning_rate": 9.821766413575914e-07,
"loss": 0.0237,
"reward": 1.5515068769454956,
"reward_std": 0.41333118081092834,
"rewards/": 6.043248176574707,
"rewards/math_compute_score": 0.4285714626312256,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 1821.0357666015625,
"epoch": 0.11746031746031746,
"grad_norm": 0.3841969882035549,
"kl": 0.00049591064453125,
"learning_rate": 9.80788263634473e-07,
"loss": 0.0204,
"reward": 1.4578125476837158,
"reward_std": 0.6289081573486328,
"rewards/": 6.574777126312256,
"rewards/math_compute_score": 0.1785714328289032,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 1789.196533203125,
"epoch": 0.12063492063492064,
"grad_norm": 0.4014410966440469,
"kl": 0.000530242919921875,
"learning_rate": 9.793488765266838e-07,
"loss": 0.07,
"reward": 1.5906460285186768,
"reward_std": 0.8234072327613831,
"rewards/": 6.0960869789123535,
"rewards/math_compute_score": 0.4642857313156128,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 1829.0537109375,
"epoch": 0.12380952380952381,
"grad_norm": 0.34941213890761874,
"kl": 0.00049591064453125,
"learning_rate": 9.778586327464597e-07,
"loss": 0.0278,
"reward": 1.5846540927886963,
"reward_std": 0.6426271200180054,
"rewards/": 6.637556076049805,
"rewards/math_compute_score": 0.3214285969734192,
"step": 39
},
{
"epoch": 0.12698412698412698,
"grad_norm": 0.38140427899057316,
"learning_rate": 9.763176904016913e-07,
"loss": 0.0294,
"step": 40
},
{
"epoch": 0.12698412698412698,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1791.4822184244792,
"eval_kl": 0.0005544026692708334,
"eval_loss": 0.05520148575305939,
"eval_reward": 1.4818546374638875,
"eval_reward_std": 0.7397708296775818,
"eval_rewards/": 6.504511038462321,
"eval_rewards/math_compute_score": 0.2261904776096344,
"eval_runtime": 138.6839,
"eval_samples_per_second": 0.151,
"eval_steps_per_second": 0.007,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 1758.419677734375,
"epoch": 0.13015873015873017,
"grad_norm": 0.38719549506679857,
"kl": 0.0005283355712890625,
"learning_rate": 9.747262129791495e-07,
"loss": 0.0051,
"reward": 1.2441372275352478,
"reward_std": 0.5995893478393555,
"rewards/": 6.0064003467559814,
"rewards/math_compute_score": 0.0535714253783226,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 1710.3035888671875,
"epoch": 0.13333333333333333,
"grad_norm": 0.3698050435119778,
"kl": 0.000522613525390625,
"learning_rate": 9.730843693271413e-07,
"loss": 0.0453,
"reward": 1.4925503730773926,
"reward_std": 0.6253259181976318,
"rewards/": 5.819894313812256,
"rewards/math_compute_score": 0.4107142984867096,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 1744.3929443359375,
"epoch": 0.1365079365079365,
"grad_norm": 0.3735801890902873,
"kl": 0.0006866455078125,
"learning_rate": 9.713923336375936e-07,
"loss": 0.0102,
"reward": 1.4331055879592896,
"reward_std": 0.829824686050415,
"rewards/": 5.951241970062256,
"rewards/math_compute_score": 0.3035714328289032,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 1797.232177734375,
"epoch": 0.13968253968253969,
"grad_norm": 0.35251183170280326,
"kl": 0.00058746337890625,
"learning_rate": 9.696502854275748e-07,
"loss": 0.0273,
"reward": 1.3867467641830444,
"reward_std": 0.6368395686149597,
"rewards/": 6.219447612762451,
"rewards/math_compute_score": 0.1785714328289032,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 1759.08935546875,
"epoch": 0.14285714285714285,
"grad_norm": 0.36571909135002856,
"kl": 0.000553131103515625,
"learning_rate": 9.678584095202469e-07,
"loss": 0.0341,
"reward": 0.931584894657135,
"reward_std": 0.5959246754646301,
"rewards/": 5.729352951049805,
"rewards/math_compute_score": -0.267857164144516,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 1736.357177734375,
"epoch": 0.14603174603174604,
"grad_norm": 0.33831757661050815,
"kl": 0.000553131103515625,
"learning_rate": 9.660168960252575e-07,
"loss": 0.0437,
"reward": 1.6665178537368774,
"reward_std": 0.4598635137081146,
"rewards/": 6.046875476837158,
"rewards/math_compute_score": 0.5714285969734192,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 1807.8037109375,
"epoch": 0.1492063492063492,
"grad_norm": 0.36813538905077847,
"kl": 0.000637054443359375,
"learning_rate": 9.641259403185704e-07,
"loss": 0.031,
"reward": 0.9589914083480835,
"reward_std": 0.5812153816223145,
"rewards/": 5.080671310424805,
"rewards/math_compute_score": -0.0714285746216774,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 1591.982177734375,
"epoch": 0.1523809523809524,
"grad_norm": 0.3231025820418364,
"kl": 0.000698089599609375,
"learning_rate": 9.621857430217365e-07,
"loss": 0.0424,
"reward": 1.8917970657348633,
"reward_std": 0.3157893121242523,
"rewards/": 6.744699001312256,
"rewards/math_compute_score": 0.6785714626312256,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 1881.982177734375,
"epoch": 0.15555555555555556,
"grad_norm": 0.3466507877034998,
"kl": 0.0007781982421875,
"learning_rate": 9.601965099806084e-07,
"loss": 0.0405,
"reward": 1.4976422786712646,
"reward_std": 0.7847145199775696,
"rewards/": 6.416783332824707,
"rewards/math_compute_score": 0.267857164144516,
"step": 49
},
{
"epoch": 0.15873015873015872,
"grad_norm": 0.3331388102754494,
"learning_rate": 9.581584522435023e-07,
"loss": 0.0388,
"step": 50
},
{
"epoch": 0.15873015873015872,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1816.2877604166667,
"eval_kl": 0.0007731119791666666,
"eval_loss": 0.04274662211537361,
"eval_reward": 1.551771879196167,
"eval_reward_std": 0.641852875550588,
"eval_rewards/": 6.258859157562256,
"eval_rewards/math_compute_score": 0.3750000149011612,
"eval_runtime": 138.9269,
"eval_samples_per_second": 0.151,
"eval_steps_per_second": 0.007,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 1660.357177734375,
"epoch": 0.1619047619047619,
"grad_norm": 0.41262546854699034,
"kl": 0.0008182525634765625,
"learning_rate": 9.56071786038806e-07,
"loss": 0.0192,
"reward": 1.6869142055511475,
"reward_std": 0.39825020730495453,
"rewards/": 6.577427625656128,
"rewards/math_compute_score": 0.4642857313156128,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 1935.571533203125,
"epoch": 0.16507936507936508,
"grad_norm": 0.34500490912598375,
"kl": 0.000762939453125,
"learning_rate": 9.53936732752038e-07,
"loss": 0.0028,
"reward": 1.2544364929199219,
"reward_std": 0.6236512660980225,
"rewards/": 6.557896614074707,
"rewards/math_compute_score": -0.0714285746216774,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 1827.482177734375,
"epoch": 0.16825396825396827,
"grad_norm": 0.34348526502845717,
"kl": 0.000732421875,
"learning_rate": 9.517535189023601e-07,
"loss": -0.0054,
"reward": 1.2513673305511475,
"reward_std": 0.8205690979957581,
"rewards/": 5.8282647132873535,
"rewards/math_compute_score": 0.1071428656578064,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 1753.4285888671875,
"epoch": 0.17142857142857143,
"grad_norm": 0.36933753325659874,
"kl": 0.00099945068359375,
"learning_rate": 9.495223761185441e-07,
"loss": 0.0034,
"reward": 1.4741246700286865,
"reward_std": 0.6187431216239929,
"rewards/": 6.513480186462402,
"rewards/math_compute_score": 0.2142857313156128,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 1771.8751220703125,
"epoch": 0.1746031746031746,
"grad_norm": 0.34576119098823954,
"kl": 0.0009918212890625,
"learning_rate": 9.472435411143977e-07,
"loss": 0.0191,
"reward": 1.5563616752624512,
"reward_std": 0.6519899368286133,
"rewards/": 6.781808376312256,
"rewards/math_compute_score": 0.25,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 1711.1785888671875,
"epoch": 0.17777777777777778,
"grad_norm": 0.3525345385926051,
"kl": 0.000843048095703125,
"learning_rate": 9.449172556636497e-07,
"loss": -0.0206,
"reward": 1.422028660774231,
"reward_std": 0.8101202249526978,
"rewards/": 6.110142707824707,
"rewards/math_compute_score": 0.25,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 1774.6251220703125,
"epoch": 0.18095238095238095,
"grad_norm": 0.40494567349730853,
"kl": 0.000972747802734375,
"learning_rate": 9.425437665742997e-07,
"loss": 0.0519,
"reward": 1.3338658809661865,
"reward_std": 0.7232537865638733,
"rewards/": 6.240757942199707,
"rewards/math_compute_score": 0.1071428656578064,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 1828.071533203125,
"epoch": 0.18412698412698414,
"grad_norm": 0.33450583156617786,
"kl": 0.000881195068359375,
"learning_rate": 9.401233256624316e-07,
"loss": 0.0249,
"reward": 1.4562500715255737,
"reward_std": 0.6799939274787903,
"rewards/": 6.566964626312256,
"rewards/math_compute_score": 0.1785714328289032,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 1660.696533203125,
"epoch": 0.1873015873015873,
"grad_norm": 0.4070810399603553,
"kl": 0.0011444091796875,
"learning_rate": 9.376561897254987e-07,
"loss": -0.0269,
"reward": 1.5412318706512451,
"reward_std": 0.5850991606712341,
"rewards/": 6.277588367462158,
"rewards/math_compute_score": 0.3571428656578064,
"step": 59
},
{
"epoch": 0.19047619047619047,
"grad_norm": 0.37255871829826,
"learning_rate": 9.351426205150776e-07,
"loss": 0.038,
"step": 60
},
{
"epoch": 0.19047619047619047,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1717.0694986979167,
"eval_kl": 0.0009918212890625,
"eval_loss": 0.03207956254482269,
"eval_reward": 1.5469355980555217,
"eval_reward_std": 0.6412561237812042,
"eval_rewards/": 6.091820240020752,
"eval_rewards/math_compute_score": 0.4107142984867096,
"eval_runtime": 136.3487,
"eval_samples_per_second": 0.154,
"eval_steps_per_second": 0.007,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 1612.7322387695312,
"epoch": 0.19365079365079366,
"grad_norm": 0.3484954653008926,
"kl": 0.001018524169921875,
"learning_rate": 9.32582884709098e-07,
"loss": 0.0535,
"reward": 1.569977879524231,
"reward_std": 0.5602003335952759,
"rewards/": 6.2784600257873535,
"rewards/math_compute_score": 0.39285717345774174,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 1474.1429443359375,
"epoch": 0.19682539682539682,
"grad_norm": 0.456250830578578,
"kl": 0.0013275146484375,
"learning_rate": 9.299772538835491e-07,
"loss": -0.0324,
"reward": 1.570549726486206,
"reward_std": 0.3879093527793884,
"rewards/": 6.138463020324707,
"rewards/math_compute_score": 0.4285714626312256,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 1645.482177734375,
"epoch": 0.2,
"grad_norm": 0.3717148388714456,
"kl": 0.00115203857421875,
"learning_rate": 9.273260044836673e-07,
"loss": 0.0842,
"reward": 1.741573691368103,
"reward_std": 0.49386849999427795,
"rewards/": 6.779297351837158,
"rewards/math_compute_score": 0.4821428656578064,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 1534.857177734375,
"epoch": 0.20317460317460317,
"grad_norm": 0.3745652306218341,
"kl": 0.00112152099609375,
"learning_rate": 9.246294177946062e-07,
"loss": 0.0523,
"reward": 1.82996666431427,
"reward_std": 0.42854541540145874,
"rewards/": 6.721261501312256,
"rewards/math_compute_score": 0.6071428656578064,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 1709.9285888671875,
"epoch": 0.20634920634920634,
"grad_norm": 0.4185811440041477,
"kl": 0.00122833251953125,
"learning_rate": 9.218877799115927e-07,
"loss": 0.0595,
"reward": 1.7054688930511475,
"reward_std": 0.5991591215133667,
"rewards/": 6.813058376312256,
"rewards/math_compute_score": 0.4285714626312256,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 1851.0357666015625,
"epoch": 0.20952380952380953,
"grad_norm": 0.34158790586957477,
"kl": 0.0010986328125,
"learning_rate": 9.191013817095761e-07,
"loss": 0.0013,
"reward": 1.466183066368103,
"reward_std": 0.5523228049278259,
"rewards/": 7.045201301574707,
"rewards/math_compute_score": 0.0714285746216774,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 1783.4287109375,
"epoch": 0.2126984126984127,
"grad_norm": 0.40084205903248576,
"kl": 0.0013427734375,
"learning_rate": 9.162705188123646e-07,
"loss": 0.0218,
"reward": 1.456040859222412,
"reward_std": 0.5050444602966309,
"rewards/": 6.994489669799805,
"rewards/math_compute_score": 0.0714285746216774,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 1766.2501220703125,
"epoch": 0.21587301587301588,
"grad_norm": 0.38102579204059783,
"kl": 0.0012969970703125,
"learning_rate": 9.133954915612634e-07,
"loss": 0.087,
"reward": 1.6629464626312256,
"reward_std": 0.7155088782310486,
"rewards/": 6.957589626312256,
"rewards/math_compute_score": 0.3392857313156128,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 1658.96435546875,
"epoch": 0.21904761904761905,
"grad_norm": 0.36394590935604426,
"kl": 0.00115966796875,
"learning_rate": 9.104766049832087e-07,
"loss": 0.0179,
"reward": 1.6870676279067993,
"reward_std": 0.4471067190170288,
"rewards/": 6.506766319274902,
"rewards/math_compute_score": 0.4821428656578064,
"step": 69
},
{
"epoch": 0.2222222222222222,
"grad_norm": 0.3374654129906019,
"learning_rate": 9.075141687584056e-07,
"loss": 0.0178,
"step": 70
},
{
"epoch": 0.2222222222222222,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1738.0059814453125,
"eval_kl": 0.00133514404296875,
"eval_loss": 0.05646166205406189,
"eval_reward": 1.5958195527394612,
"eval_reward_std": 0.6562197208404541,
"eval_rewards/": 6.621954282124837,
"eval_rewards/math_compute_score": 0.3392857213815053,
"eval_runtime": 136.062,
"eval_samples_per_second": 0.154,
"eval_steps_per_second": 0.007,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 1752.3304443359375,
"epoch": 0.2253968253968254,
"grad_norm": 0.38489393931988863,
"kl": 0.001201629638671875,
"learning_rate": 9.045084971874737e-07,
"loss": -0.0611,
"reward": 1.3448200225830078,
"reward_std": 0.5091241598129272,
"rewards/": 6.402670860290527,
"rewards/math_compute_score": 0.0803571492433548,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 1880.2501220703125,
"epoch": 0.22857142857142856,
"grad_norm": 0.39120717997716,
"kl": 0.00131988525390625,
"learning_rate": 9.014599091580998e-07,
"loss": 0.048,
"reward": 1.76941978931427,
"reward_std": 0.5547811388969421,
"rewards/": 7.2042412757873535,
"rewards/math_compute_score": 0.4107142984867096,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 1588.08935546875,
"epoch": 0.23174603174603176,
"grad_norm": 0.3304077909442911,
"kl": 0.00144195556640625,
"learning_rate": 8.983687281112064e-07,
"loss": 0.0299,
"reward": 2.067634105682373,
"reward_std": 0.4065239131450653,
"rewards/": 7.195312976837158,
"rewards/math_compute_score": 0.785714328289032,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 1834.7857666015625,
"epoch": 0.23492063492063492,
"grad_norm": 0.31261547252144517,
"kl": 0.001434326171875,
"learning_rate": 8.952352820066358e-07,
"loss": 0.0338,
"reward": 1.8736608028411865,
"reward_std": 0.4773353934288025,
"rewards/": 6.939732551574707,
"rewards/math_compute_score": 0.6071428656578064,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 1668.071533203125,
"epoch": 0.23809523809523808,
"grad_norm": 0.37467501438016265,
"kl": 0.0016326904296875,
"learning_rate": 8.920599032883552e-07,
"loss": -0.0187,
"reward": 1.3380582332611084,
"reward_std": 0.505685031414032,
"rewards/": 5.690290451049805,
"rewards/math_compute_score": 0.25,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 1687.107177734375,
"epoch": 0.24126984126984127,
"grad_norm": 0.3823169614619227,
"kl": 0.00157928466796875,
"learning_rate": 8.888429288491855e-07,
"loss": 0.0274,
"reward": 1.6549667119979858,
"reward_std": 0.3212043046951294,
"rewards/": 6.560547351837158,
"rewards/math_compute_score": 0.4285714626312256,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 1807.1787109375,
"epoch": 0.24444444444444444,
"grad_norm": 0.35610484187919744,
"kl": 0.0015869140625,
"learning_rate": 8.855846999950595e-07,
"loss": 0.0261,
"reward": 1.2699779272079468,
"reward_std": 0.5984498858451843,
"rewards/": 6.492745876312256,
"rewards/math_compute_score": -0.0357142873108387,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 1851.9107666015625,
"epoch": 0.24761904761904763,
"grad_norm": 0.2883614180110202,
"kl": 0.00138092041015625,
"learning_rate": 8.822855624088097e-07,
"loss": -0.0185,
"reward": 1.5325753688812256,
"reward_std": 0.6921989321708679,
"rewards/": 6.734305381774902,
"rewards/math_compute_score": 0.2321428656578064,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 1517.8751220703125,
"epoch": 0.2507936507936508,
"grad_norm": 0.3716530172552612,
"kl": 0.00165557861328125,
"learning_rate": 8.789458661134942e-07,
"loss": 0.049,
"reward": 1.7504465579986572,
"reward_std": 0.39499327540397644,
"rewards/": 6.466517925262451,
"rewards/math_compute_score": 0.5714285969734192,
"step": 79
},
{
"epoch": 0.25396825396825395,
"grad_norm": 0.3590906991873432,
"learning_rate": 8.755659654352599e-07,
"loss": 0.0028,
"step": 80
},
{
"epoch": 0.25396825396825395,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1681.073486328125,
"eval_kl": 0.017834981282552082,
"eval_loss": 0.015299047343432903,
"eval_reward": 1.720870574315389,
"eval_reward_std": 0.5228437781333923,
"eval_rewards/": 6.556733926137288,
"eval_rewards/math_compute_score": 0.5119047897557417,
"eval_runtime": 135.3946,
"eval_samples_per_second": 0.155,
"eval_steps_per_second": 0.007,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 1618.1160888671875,
"epoch": 0.2571428571428571,
"grad_norm": 0.3174887107646142,
"kl": 0.00168609619140625,
"learning_rate": 8.721462189657509e-07,
"loss": 0.0154,
"reward": 1.8046876192092896,
"reward_std": 0.45894815027713776,
"rewards/": 6.273437738418579,
"rewards/math_compute_score": 0.6875000298023224,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 1785.607177734375,
"epoch": 0.26031746031746034,
"grad_norm": 0.34079321817593966,
"kl": 0.0016326904296875,
"learning_rate": 8.686869895240631e-07,
"loss": -0.01,
"reward": 1.5418108701705933,
"reward_std": 0.6146999597549438,
"rewards/": 6.923340320587158,
"rewards/math_compute_score": 0.196428582072258,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 1692.9285888671875,
"epoch": 0.2634920634920635,
"grad_norm": 0.3374435642100782,
"kl": 0.00174713134765625,
"learning_rate": 8.651886441182508e-07,
"loss": 0.027,
"reward": 1.9006696939468384,
"reward_std": 0.45941323041915894,
"rewards/": 7.217634201049805,
"rewards/math_compute_score": 0.5714285969734192,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 1753.946533203125,
"epoch": 0.26666666666666666,
"grad_norm": 0.31783244878975037,
"kl": 0.001495361328125,
"learning_rate": 8.616515539063894e-07,
"loss": 0.003,
"reward": 1.582680106163025,
"reward_std": 0.7250080704689026,
"rewards/": 6.556257247924805,
"rewards/math_compute_score": 0.3392857313156128,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 1419.107177734375,
"epoch": 0.2698412698412698,
"grad_norm": 0.42796739682838303,
"kl": 0.0025634765625,
"learning_rate": 8.580760941571966e-07,
"loss": 0.0542,
"reward": 1.8185827732086182,
"reward_std": 0.24573805928230286,
"rewards/": 6.521484851837158,
"rewards/math_compute_score": 0.6428571939468384,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 1636.6607666015625,
"epoch": 0.273015873015873,
"grad_norm": 0.42716916595705023,
"kl": 0.0019989013671875,
"learning_rate": 8.544626442102187e-07,
"loss": 0.0444,
"reward": 1.9161133766174316,
"reward_std": 0.5437954664230347,
"rewards/": 7.009138107299805,
"rewards/math_compute_score": 0.6428571939468384,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 1772.196533203125,
"epoch": 0.2761904761904762,
"grad_norm": 0.36873824811015804,
"kl": 0.00189208984375,
"learning_rate": 8.508115874355839e-07,
"loss": 0.059,
"reward": 1.403194785118103,
"reward_std": 0.5319852828979492,
"rewards/": 6.873116970062256,
"rewards/math_compute_score": 0.0357142873108387,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 1486.107177734375,
"epoch": 0.27936507936507937,
"grad_norm": 0.4198714949042119,
"kl": 0.00194549560546875,
"learning_rate": 8.47123311193329e-07,
"loss": 0.0626,
"reward": 1.9519531726837158,
"reward_std": 0.4156142473220825,
"rewards/": 7.3311944007873535,
"rewards/math_compute_score": 0.6071428656578064,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 1490.982177734375,
"epoch": 0.28253968253968254,
"grad_norm": 0.3741322266459614,
"kl": 0.0023956298828125,
"learning_rate": 8.433982067923021e-07,
"loss": 0.0168,
"reward": 1.99573814868927,
"reward_std": 0.4969152510166168,
"rewards/": 6.978690147399902,
"rewards/math_compute_score": 0.7500000596046448,
"step": 89
},
{
"epoch": 0.2857142857142857,
"grad_norm": 0.32032292095086534,
"learning_rate": 8.396366694486466e-07,
"loss": 0.0544,
"step": 90
},
{
"epoch": 0.2857142857142857,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1615.83935546875,
"eval_kl": 0.002166748046875,
"eval_loss": 0.04793115332722664,
"eval_reward": 1.9063432614008586,
"eval_reward_std": 0.47585757573445636,
"eval_rewards/": 6.746001084645589,
"eval_rewards/math_compute_score": 0.6964285969734192,
"eval_runtime": 133.7778,
"eval_samples_per_second": 0.157,
"eval_steps_per_second": 0.007,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 1566.7858276367188,
"epoch": 0.28888888888888886,
"grad_norm": 0.3482667132376765,
"kl": 0.00211334228515625,
"learning_rate": 8.358390982438705e-07,
"loss": -0.0106,
"reward": 1.6325893998146057,
"reward_std": 0.42033930122852325,
"rewards/": 6.4486610889434814,
"rewards/math_compute_score": 0.4285714477300644,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 1555.5535888671875,
"epoch": 0.2920634920634921,
"grad_norm": 0.3459245295737851,
"kl": 0.00384521484375,
"learning_rate": 8.320058960825058e-07,
"loss": 0.0209,
"reward": 1.5438895225524902,
"reward_std": 0.4550023376941681,
"rewards/": 6.4337334632873535,
"rewards/math_compute_score": 0.3214285969734192,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 1714.1607666015625,
"epoch": 0.29523809523809524,
"grad_norm": 0.42497165251755675,
"kl": 0.0028839111328125,
"learning_rate": 8.281374696493626e-07,
"loss": 0.0165,
"reward": 1.7839986085891724,
"reward_std": 0.6411929130554199,
"rewards/": 7.062849044799805,
"rewards/math_compute_score": 0.4642857313156128,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 1584.946533203125,
"epoch": 0.2984126984126984,
"grad_norm": 0.39574012867238795,
"kl": 0.0023956298828125,
"learning_rate": 8.242342293663809e-07,
"loss": 0.0325,
"reward": 1.5983260869979858,
"reward_std": 0.4390745162963867,
"rewards/": 6.991629600524902,
"rewards/math_compute_score": 0.25,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 1597.482177734375,
"epoch": 0.30158730158730157,
"grad_norm": 0.3834564701675289,
"kl": 0.0025634765625,
"learning_rate": 8.202965893490876e-07,
"loss": -0.0186,
"reward": 1.340318202972412,
"reward_std": 0.5021482110023499,
"rewards/": 5.915876388549805,
"rewards/math_compute_score": 0.196428582072258,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 1800.607177734375,
"epoch": 0.3047619047619048,
"grad_norm": 0.39206140103553405,
"kl": 0.0027923583984375,
"learning_rate": 8.163249673626602e-07,
"loss": 0.0298,
"reward": 1.2919502258300781,
"reward_std": 0.6428667902946472,
"rewards/": 6.031180381774902,
"rewards/math_compute_score": 0.1071428656578064,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 1651.8035888671875,
"epoch": 0.30793650793650795,
"grad_norm": 0.3826880063008121,
"kl": 0.00238037109375,
"learning_rate": 8.123197847776042e-07,
"loss": 0.0461,
"reward": 1.705224633216858,
"reward_std": 0.5447785258293152,
"rewards/": 6.668980598449707,
"rewards/math_compute_score": 0.4642857313156128,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 1901.321533203125,
"epoch": 0.3111111111111111,
"grad_norm": 0.3643029236288363,
"kl": 0.0025177001953125,
"learning_rate": 8.082814665250476e-07,
"loss": 0.0243,
"reward": 1.3659180402755737,
"reward_std": 0.7389653921127319,
"rewards/": 6.543875694274902,
"rewards/math_compute_score": 0.0714285746216774,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 1414.696533203125,
"epoch": 0.3142857142857143,
"grad_norm": 0.42740784739438414,
"kl": 0.0030975341796875,
"learning_rate": 8.042104410516575e-07,
"loss": 0.0245,
"reward": 1.5442662239074707,
"reward_std": 0.5408600568771362,
"rewards/": 6.292759895324707,
"rewards/math_compute_score": 0.3571428656578064,
"step": 99
},
{
"epoch": 0.31746031746031744,
"grad_norm": 0.3367757080515649,
"learning_rate": 8.001071402741842e-07,
"loss": 0.007,
"step": 100
},
{
"epoch": 0.31746031746031744,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1632.1806233723958,
"eval_kl": 0.0024922688802083335,
"eval_loss": 0.026421261951327324,
"eval_reward": 1.9330463409423828,
"eval_reward_std": 0.39154160519440967,
"eval_rewards/": 7.0461835861206055,
"eval_rewards/math_compute_score": 0.6547619154055914,
"eval_runtime": 134.9073,
"eval_samples_per_second": 0.156,
"eval_steps_per_second": 0.007,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 1608.5447387695312,
"epoch": 0.32063492063492066,
"grad_norm": 0.35681279315589565,
"kl": 0.00269317626953125,
"learning_rate": 7.959719995336363e-07,
"loss": 0.029,
"reward": 1.7852399349212646,
"reward_std": 0.40309859812259674,
"rewards/": 6.890485763549805,
"rewards/math_compute_score": 0.508928582072258,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 1488.3751220703125,
"epoch": 0.3238095238095238,
"grad_norm": 0.44950984490187945,
"kl": 0.0035400390625,
"learning_rate": 7.918054575490943e-07,
"loss": 0.0435,
"reward": 1.8601562976837158,
"reward_std": 0.5413349270820618,
"rewards/": 6.8722100257873535,
"rewards/math_compute_score": 0.6071428656578064,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 1592.2501220703125,
"epoch": 0.326984126984127,
"grad_norm": 0.3485861719916155,
"kl": 0.002471923828125,
"learning_rate": 7.876079563711631e-07,
"loss": 0.0519,
"reward": 1.5621094703674316,
"reward_std": 0.5689576864242554,
"rewards/": 6.524832725524902,
"rewards/math_compute_score": 0.3214285969734192,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 1584.821533203125,
"epoch": 0.33015873015873015,
"grad_norm": 0.42767837474542764,
"kl": 0.00335693359375,
"learning_rate": 7.83379941335073e-07,
"loss": 0.093,
"reward": 1.7258999347686768,
"reward_std": 0.7036234140396118,
"rewards/": 6.486642360687256,
"rewards/math_compute_score": 0.535714328289032,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 1589.08935546875,
"epoch": 0.3333333333333333,
"grad_norm": 0.32351467813897694,
"kl": 0.002105712890625,
"learning_rate": 7.791218610134322e-07,
"loss": 0.0241,
"reward": 1.7169644832611084,
"reward_std": 0.38069403171539307,
"rewards/": 6.656250476837158,
"rewards/math_compute_score": 0.4821428656578064,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 1518.446533203125,
"epoch": 0.33650793650793653,
"grad_norm": 0.39674608646311715,
"kl": 0.00274658203125,
"learning_rate": 7.748341671686354e-07,
"loss": 0.0463,
"reward": 1.785309910774231,
"reward_std": 0.5290029048919678,
"rewards/": 6.8551201820373535,
"rewards/math_compute_score": 0.5178571939468384,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 1639.982177734375,
"epoch": 0.3396825396825397,
"grad_norm": 0.34724586686391384,
"kl": 0.0028839111328125,
"learning_rate": 7.705173147049325e-07,
"loss": 0.0139,
"reward": 1.8501187562942505,
"reward_std": 0.4504697918891907,
"rewards/": 6.964879035949707,
"rewards/math_compute_score": 0.5714285969734192,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 1484.1785888671875,
"epoch": 0.34285714285714286,
"grad_norm": 0.3499020427456334,
"kl": 0.0025787353515625,
"learning_rate": 7.661717616201668e-07,
"loss": -0.0305,
"reward": 1.6777344942092896,
"reward_std": 0.3427577316761017,
"rewards/": 5.817243576049805,
"rewards/math_compute_score": 0.6428571939468384,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 1837.96435546875,
"epoch": 0.346031746031746,
"grad_norm": 0.4037551751682681,
"kl": 0.0028228759765625,
"learning_rate": 7.617979689571839e-07,
"loss": 0.0442,
"reward": 1.3643137216567993,
"reward_std": 0.6223567724227905,
"rewards/": 6.535853862762451,
"rewards/math_compute_score": 0.0714285746216774,
"step": 109
},
{
"epoch": 0.3492063492063492,
"grad_norm": 0.4514262612365042,
"learning_rate": 7.573964007549154e-07,
"loss": -0.0428,
"step": 110
},
{
"epoch": 0.3492063492063492,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1610.9127604166667,
"eval_kl": 0.0030568440755208335,
"eval_loss": 0.059969693422317505,
"eval_reward": 1.8927596807479858,
"eval_reward_std": 0.5094525118668874,
"eval_rewards/": 6.86855951944987,
"eval_rewards/math_compute_score": 0.6488095621267954,
"eval_runtime": 139.4711,
"eval_samples_per_second": 0.151,
"eval_steps_per_second": 0.007,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 1355.6964721679688,
"epoch": 0.3523809523809524,
"grad_norm": 0.416064136246473,
"kl": 0.003692626953125,
"learning_rate": 7.529675239991482e-07,
"loss": 0.007,
"reward": 2.1203389167785645,
"reward_std": 0.34416940808296204,
"rewards/": 7.280264854431152,
"rewards/math_compute_score": 0.830357164144516,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 1646.607177734375,
"epoch": 0.35555555555555557,
"grad_norm": 0.3080164040844727,
"kl": 0.00250244140625,
"learning_rate": 7.485118085729789e-07,
"loss": 0.013,
"reward": 1.5503767728805542,
"reward_std": 0.4306219816207886,
"rewards/": 6.180455207824707,
"rewards/math_compute_score": 0.392857164144516,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 1678.1785888671875,
"epoch": 0.35873015873015873,
"grad_norm": 0.3694201814855355,
"kl": 0.00311279296875,
"learning_rate": 7.440297272069614e-07,
"loss": 0.0761,
"reward": 2.034709930419922,
"reward_std": 0.43929895758628845,
"rewards/": 7.530692100524902,
"rewards/math_compute_score": 0.660714328289032,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 1546.7679443359375,
"epoch": 0.3619047619047619,
"grad_norm": 0.3908115750027717,
"kl": 0.0034942626953125,
"learning_rate": 7.395217554289523e-07,
"loss": -0.011,
"reward": 1.7967495918273926,
"reward_std": 0.22745420038700104,
"rewards/": 6.983747482299805,
"rewards/math_compute_score": 0.5,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 1633.9107666015625,
"epoch": 0.36507936507936506,
"grad_norm": 0.3199396618433185,
"kl": 0.0025634765625,
"learning_rate": 7.3498837151366e-07,
"loss": 0.0194,
"reward": 1.8163504600524902,
"reward_std": 0.5509271621704102,
"rewards/": 7.081752777099609,
"rewards/math_compute_score": 0.5,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 1633.83935546875,
"epoch": 0.3682539682539683,
"grad_norm": 0.3989797872601438,
"kl": 0.002685546875,
"learning_rate": 7.304300564319013e-07,
"loss": 0.0215,
"reward": 1.52039635181427,
"reward_std": 0.4950607120990753,
"rewards/": 6.744838237762451,
"rewards/math_compute_score": 0.2142857313156128,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 1765.0537109375,
"epoch": 0.37142857142857144,
"grad_norm": 0.33315626188294234,
"kl": 0.003143310546875,
"learning_rate": 7.258472937995735e-07,
"loss": -0.0229,
"reward": 1.795814871788025,
"reward_std": 0.49838095903396606,
"rewards/": 7.193359851837158,
"rewards/math_compute_score": 0.4464285969734192,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 1562.5535888671875,
"epoch": 0.3746031746031746,
"grad_norm": 0.3286142283747825,
"kl": 0.0029754638671875,
"learning_rate": 7.212405698263446e-07,
"loss": 0.0002,
"reward": 2.0463171005249023,
"reward_std": 0.4698004722595215,
"rewards/": 7.2315850257873535,
"rewards/math_compute_score": 0.7500000596046448,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 1634.5535888671875,
"epoch": 0.37777777777777777,
"grad_norm": 0.3834857277163386,
"kl": 0.003143310546875,
"learning_rate": 7.166103732640681e-07,
"loss": 0.0034,
"reward": 1.5640347003936768,
"reward_std": 0.48661187291145325,
"rewards/": 6.820173263549805,
"rewards/math_compute_score": 0.25,
"step": 119
},
{
"epoch": 0.38095238095238093,
"grad_norm": 0.3930354077134069,
"learning_rate": 7.119571953549304e-07,
"loss": 0.0164,
"step": 120
},
{
"epoch": 0.38095238095238093,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1561.2083740234375,
"eval_kl": 0.0033162434895833335,
"eval_loss": 0.034597255289554596,
"eval_reward": 1.9965635935465496,
"eval_reward_std": 0.49989163875579834,
"eval_rewards/": 6.982817490895589,
"eval_rewards/math_compute_score": 0.7500000397364298,
"eval_runtime": 131.6643,
"eval_samples_per_second": 0.159,
"eval_steps_per_second": 0.008,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 1541.071533203125,
"epoch": 0.38412698412698415,
"grad_norm": 0.3736113886490285,
"kl": 0.00330352783203125,
"learning_rate": 7.072815297793302e-07,
"loss": -0.0227,
"reward": 1.750962734222412,
"reward_std": 0.44300225377082825,
"rewards/": 6.719099044799805,
"rewards/math_compute_score": 0.5089285969734192,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 1791.482177734375,
"epoch": 0.3873015873015873,
"grad_norm": 0.36023685839775266,
"kl": 0.00299072265625,
"learning_rate": 7.025838726035031e-07,
"loss": 0.0158,
"reward": 1.2251116037368774,
"reward_std": 0.5226312875747681,
"rewards/": 6.4112725257873535,
"rewards/math_compute_score": -0.0714285746216774,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 1458.21435546875,
"epoch": 0.3904761904761905,
"grad_norm": 0.4040624897084209,
"kl": 0.0032501220703125,
"learning_rate": 6.978647222268903e-07,
"loss": 0.0145,
"reward": 1.9194753170013428,
"reward_std": 0.5019935965538025,
"rewards/": 7.025949001312256,
"rewards/math_compute_score": 0.6428571939468384,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 1518.232177734375,
"epoch": 0.39365079365079364,
"grad_norm": 0.3500940260968403,
"kl": 0.0032958984375,
"learning_rate": 6.93124579329261e-07,
"loss": 0.0429,
"reward": 1.8887277841567993,
"reward_std": 0.3185231387615204,
"rewards/": 7.157924652099609,
"rewards/math_compute_score": 0.5714285969734192,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 1352.5179443359375,
"epoch": 0.3968253968253968,
"grad_norm": 0.35908221717719274,
"kl": 0.0038909912109375,
"learning_rate": 6.883639468175925e-07,
"loss": 0.0207,
"reward": 2.0699777603149414,
"reward_std": 0.2467387318611145,
"rewards/": 6.992745876312256,
"rewards/math_compute_score": 0.8392857313156128,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 1712.21435546875,
"epoch": 0.4,
"grad_norm": 0.3624308105870553,
"kl": 0.0030364990234375,
"learning_rate": 6.835833297727147e-07,
"loss": 0.0443,
"reward": 1.5869420766830444,
"reward_std": 0.39860856533050537,
"rewards/": 6.506138801574707,
"rewards/math_compute_score": 0.3571428656578064,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 1478.2857666015625,
"epoch": 0.4031746031746032,
"grad_norm": 0.41421111807827343,
"kl": 0.0032958984375,
"learning_rate": 6.787832353957224e-07,
"loss": 0.0465,
"reward": 2.0360493659973145,
"reward_std": 0.5292332172393799,
"rewards/": 7.180245876312256,
"rewards/math_compute_score": 0.7500000596046448,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 1562.3751220703125,
"epoch": 0.40634920634920635,
"grad_norm": 0.30993254735116554,
"kl": 0.003448486328125,
"learning_rate": 6.739641729541644e-07,
"loss": 0.0384,
"reward": 1.7399276494979858,
"reward_std": 0.3811955749988556,
"rewards/": 6.699637413024902,
"rewards/math_compute_score": 0.5,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 1551.0535888671875,
"epoch": 0.4095238095238095,
"grad_norm": 0.3771726274865134,
"kl": 0.00347900390625,
"learning_rate": 6.691266537280127e-07,
"loss": -0.0003,
"reward": 1.817131757736206,
"reward_std": 0.511991024017334,
"rewards/": 6.7999444007873535,
"rewards/math_compute_score": 0.5714285969734192,
"step": 129
},
{
"epoch": 0.4126984126984127,
"grad_norm": 0.35621529328169527,
"learning_rate": 6.642711909554174e-07,
"loss": 0.0192,
"step": 130
},
{
"epoch": 0.4126984126984127,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1621.2857666015625,
"eval_kl": 0.0035502115885416665,
"eval_loss": 0.03674715757369995,
"eval_reward": 1.8930153846740723,
"eval_reward_std": 0.4688274661699931,
"eval_rewards/": 6.774600346883138,
"eval_rewards/math_compute_score": 0.6726190646489462,
"eval_runtime": 133.1814,
"eval_samples_per_second": 0.158,
"eval_steps_per_second": 0.008,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 1673.3839721679688,
"epoch": 0.4158730158730159,
"grad_norm": 0.4405464519743735,
"kl": 0.0037384033203125,
"learning_rate": 6.593982997782548e-07,
"loss": 0.0243,
"reward": 1.7884975671768188,
"reward_std": 0.6518445014953613,
"rewards/": 6.942487955093384,
"rewards/math_compute_score": 0.5000000149011612,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 1482.696533203125,
"epoch": 0.41904761904761906,
"grad_norm": 0.39457572146801523,
"kl": 0.003662109375,
"learning_rate": 6.545084971874736e-07,
"loss": 0.0455,
"reward": 1.9908483028411865,
"reward_std": 0.49568915367126465,
"rewards/": 6.9542412757873535,
"rewards/math_compute_score": 0.7500000596046448,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 1558.321533203125,
"epoch": 0.4222222222222222,
"grad_norm": 0.3723322635366814,
"kl": 0.0032196044921875,
"learning_rate": 6.496023019682446e-07,
"loss": 0.0261,
"reward": 2.116741180419922,
"reward_std": 0.34433886408805847,
"rewards/": 7.155134201049805,
"rewards/math_compute_score": 0.8571429252624512,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 1776.482177734375,
"epoch": 0.4253968253968254,
"grad_norm": 0.3342896457505393,
"kl": 0.00286865234375,
"learning_rate": 6.44680234644919e-07,
"loss": 0.0427,
"reward": 1.4701590538024902,
"reward_std": 0.552353024482727,
"rewards/": 6.636509895324707,
"rewards/math_compute_score": 0.1785714328289032,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 1594.1429443359375,
"epoch": 0.42857142857142855,
"grad_norm": 0.3548357521298244,
"kl": 0.0038299560546875,
"learning_rate": 6.397428174258047e-07,
"loss": 0.0483,
"reward": 2.0053014755249023,
"reward_std": 0.3594396710395813,
"rewards/": 7.455078601837158,
"rewards/math_compute_score": 0.6428571939468384,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 1353.71435546875,
"epoch": 0.43174603174603177,
"grad_norm": 0.4428436053114652,
"kl": 0.004180908203125,
"learning_rate": 6.347905741477612e-07,
"loss": 0.0562,
"reward": 1.9524275064468384,
"reward_std": 0.42881521582603455,
"rewards/": 7.1192803382873535,
"rewards/math_compute_score": 0.660714328289032,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 1373.732177734375,
"epoch": 0.43492063492063493,
"grad_norm": 0.3804164543335114,
"kl": 0.004302978515625,
"learning_rate": 6.298240302206241e-07,
"loss": 0.0371,
"reward": 2.0057549476623535,
"reward_std": 0.3291456401348114,
"rewards/": 7.028774261474609,
"rewards/math_compute_score": 0.7500000596046448,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 1614.732177734375,
"epoch": 0.4380952380952381,
"grad_norm": 0.3920737510841866,
"kl": 0.004119873046875,
"learning_rate": 6.2484371257146e-07,
"loss": 0.031,
"reward": 1.8299667835235596,
"reward_std": 0.5113915205001831,
"rewards/": 7.149832725524902,
"rewards/math_compute_score": 0.5,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 1481.08935546875,
"epoch": 0.44126984126984126,
"grad_norm": 0.38144253817590884,
"kl": 0.00408935546875,
"learning_rate": 6.198501495886638e-07,
"loss": -0.0053,
"reward": 1.7764790058135986,
"reward_std": 0.40745845437049866,
"rewards/": 6.739537239074707,
"rewards/math_compute_score": 0.535714328289032,
"step": 139
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.40734808818587626,
"learning_rate": 6.148438710658978e-07,
"loss": 0.0635,
"step": 140
},
{
"epoch": 0.4444444444444444,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1595.1012369791667,
"eval_kl": 0.0038808186848958335,
"eval_loss": 0.03390444815158844,
"eval_reward": 1.973921298980713,
"eval_reward_std": 0.47822797298431396,
"eval_rewards/": 7.107701142628987,
"eval_rewards/math_compute_score": 0.6904762188593546,
"eval_runtime": 132.0027,
"eval_samples_per_second": 0.159,
"eval_steps_per_second": 0.008,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 1492.65185546875,
"epoch": 0.44761904761904764,
"grad_norm": 0.4080068354053614,
"kl": 0.0045166015625,
"learning_rate": 6.098254081458838e-07,
"loss": 0.0728,
"reward": 1.9614050388336182,
"reward_std": 0.5336401164531708,
"rewards/": 7.09273886680603,
"rewards/math_compute_score": 0.6785714626312256,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 1850.3929443359375,
"epoch": 0.4507936507936508,
"grad_norm": 0.34401058222235625,
"kl": 0.0030670166015625,
"learning_rate": 6.047952932640512e-07,
"loss": 0.0162,
"reward": 1.3241490125656128,
"reward_std": 0.568393886089325,
"rewards/": 6.263602256774902,
"rewards/math_compute_score": 0.0892857164144516,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 1395.8035888671875,
"epoch": 0.45396825396825397,
"grad_norm": 0.3084325410352855,
"kl": 0.0038909912109375,
"learning_rate": 5.997540600920478e-07,
"loss": 0.0174,
"reward": 1.953850507736206,
"reward_std": 0.4086017906665802,
"rewards/": 6.769252300262451,
"rewards/math_compute_score": 0.7500000596046448,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 1625.21435546875,
"epoch": 0.45714285714285713,
"grad_norm": 0.41338154741382704,
"kl": 0.0048828125,
"learning_rate": 5.947022434811201e-07,
"loss": 0.0367,
"reward": 1.5787110328674316,
"reward_std": 0.49698716402053833,
"rewards/": 6.750697612762451,
"rewards/math_compute_score": 0.2857142984867096,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 1505.0179443359375,
"epoch": 0.4603174603174603,
"grad_norm": 0.38852067924976885,
"kl": 0.00421142578125,
"learning_rate": 5.896403794053678e-07,
"loss": 0.0451,
"reward": 1.9318640232086182,
"reward_std": 0.5010179877281189,
"rewards/": 6.945034027099609,
"rewards/math_compute_score": 0.6785714626312256,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 1491.3751220703125,
"epoch": 0.4634920634920635,
"grad_norm": 0.3492147620184295,
"kl": 0.0042724609375,
"learning_rate": 5.845690049048798e-07,
"loss": 0.0706,
"reward": 2.178906202316284,
"reward_std": 0.457103431224823,
"rewards/": 7.323102951049805,
"rewards/math_compute_score": 0.8928571939468384,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 1746.1607666015625,
"epoch": 0.4666666666666667,
"grad_norm": 0.44104615846617024,
"kl": 0.005401611328125,
"learning_rate": 5.794886580287564e-07,
"loss": 0.0404,
"reward": 1.4707032442092896,
"reward_std": 0.8705157041549683,
"rewards/": 6.4249444007873535,
"rewards/math_compute_score": 0.2321428656578064,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 1627.08935546875,
"epoch": 0.46984126984126984,
"grad_norm": 0.395676436764285,
"kl": 0.003997802734375,
"learning_rate": 5.743998777780251e-07,
"loss": 0.057,
"reward": 1.7963100671768188,
"reward_std": 0.5916603207588196,
"rewards/": 7.410121917724609,
"rewards/math_compute_score": 0.392857164144516,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 1585.08935546875,
"epoch": 0.473015873015873,
"grad_norm": 0.32362413396596257,
"kl": 0.0037078857421875,
"learning_rate": 5.693032040484547e-07,
"loss": -0.0072,
"reward": 1.691545844078064,
"reward_std": 0.4373229146003723,
"rewards/": 6.529157638549805,
"rewards/math_compute_score": 0.4821428656578064,
"step": 149
},
{
"epoch": 0.47619047619047616,
"grad_norm": 0.38103977643395054,
"learning_rate": 5.641991775732755e-07,
"loss": 0.0482,
"step": 150
},
{
"epoch": 0.47619047619047616,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1561.9564208984375,
"eval_kl": 0.004628499348958333,
"eval_loss": 0.023879073560237885,
"eval_reward": 1.9818453788757324,
"eval_reward_std": 0.42304734388987225,
"eval_rewards/": 6.980655034383138,
"eval_rewards/math_compute_score": 0.7321428954601288,
"eval_runtime": 131.655,
"eval_samples_per_second": 0.16,
"eval_steps_per_second": 0.008,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 1668.759033203125,
"epoch": 0.4793650793650794,
"grad_norm": 0.31698478601407637,
"kl": 0.00412750244140625,
"learning_rate": 5.590883398658094e-07,
"loss": 0.0239,
"reward": 1.6429409980773926,
"reward_std": 0.5557140111923218,
"rewards/": 6.857561826705933,
"rewards/math_compute_score": 0.3392857313156128,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 1623.08935546875,
"epoch": 0.48253968253968255,
"grad_norm": 0.3705285203587748,
"kl": 0.00408935546875,
"learning_rate": 5.539712331620185e-07,
"loss": 0.0581,
"reward": 1.4986224174499512,
"reward_std": 0.43796294927597046,
"rewards/": 6.7788262367248535,
"rewards/math_compute_score": 0.1785714328289032,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 1689.446533203125,
"epoch": 0.4857142857142857,
"grad_norm": 0.3861278436124767,
"kl": 0.00445556640625,
"learning_rate": 5.488484003629758e-07,
"loss": 0.0305,
"reward": 1.5913225412368774,
"reward_std": 0.6033198833465576,
"rewards/": 6.813755989074707,
"rewards/math_compute_score": 0.2857142984867096,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 1477.107177734375,
"epoch": 0.4888888888888889,
"grad_norm": 0.36475288664194044,
"kl": 0.0047607421875,
"learning_rate": 5.437203849772664e-07,
"loss": 0.0404,
"reward": 1.8446986675262451,
"reward_std": 0.32607829570770264,
"rewards/": 7.080636501312256,
"rewards/math_compute_score": 0.535714328289032,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 1420.3035888671875,
"epoch": 0.49206349206349204,
"grad_norm": 0.3896113098470113,
"kl": 0.004180908203125,
"learning_rate": 5.385877310633232e-07,
"loss": -0.0126,
"reward": 1.838978886604309,
"reward_std": 0.4181511104106903,
"rewards/": 6.694894313812256,
"rewards/math_compute_score": 0.625,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 1735.4107666015625,
"epoch": 0.49523809523809526,
"grad_norm": 0.366034646195721,
"kl": 0.004364013671875,
"learning_rate": 5.334509831717058e-07,
"loss": 0.0191,
"reward": 1.687611699104309,
"reward_std": 0.5151606798171997,
"rewards/": 7.2237725257873535,
"rewards/math_compute_score": 0.3035714328289032,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 1403.2857666015625,
"epoch": 0.4984126984126984,
"grad_norm": 0.3664836889631256,
"kl": 0.005584716796875,
"learning_rate": 5.283106862873252e-07,
"loss": 0.0848,
"reward": 2.293659210205078,
"reward_std": 0.24820633232593536,
"rewards/": 7.611154079437256,
"rewards/math_compute_score": 0.9642857313156128,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 1685.2501220703125,
"epoch": 0.5015873015873016,
"grad_norm": 0.3481512366201298,
"kl": 0.004180908203125,
"learning_rate": 5.231673857716243e-07,
"loss": 0.041,
"reward": 1.8583705425262451,
"reward_std": 0.42498812079429626,
"rewards/": 6.4347100257873535,
"rewards/math_compute_score": 0.7142857313156128,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 1417.6785888671875,
"epoch": 0.5047619047619047,
"grad_norm": 0.3977021065047324,
"kl": 0.00439453125,
"learning_rate": 5.18021627304717e-07,
"loss": 0.0716,
"reward": 1.9547433853149414,
"reward_std": 0.4087882936000824,
"rewards/": 7.273716926574707,
"rewards/math_compute_score": 0.625,
"step": 159
},
{
"epoch": 0.5079365079365079,
"grad_norm": 0.3539481723030545,
"learning_rate": 5.128739568274943e-07,
"loss": 0.0753,
"step": 160
},
{
"epoch": 0.5079365079365079,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1508.3075764973958,
"eval_kl": 0.004781087239583333,
"eval_loss": 0.010089995339512825,
"eval_reward": 1.9843844572703044,
"eval_reward_std": 0.4194992780685425,
"eval_rewards/": 7.017159938812256,
"eval_rewards/math_compute_score": 0.7261905074119568,
"eval_runtime": 130.1634,
"eval_samples_per_second": 0.161,
"eval_steps_per_second": 0.008,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 1747.4286499023438,
"epoch": 0.5111111111111111,
"grad_norm": 0.3330449401616063,
"kl": 0.00437164306640625,
"learning_rate": 5.077249204837025e-07,
"loss": 0.019,
"reward": 1.6631278991699219,
"reward_std": 0.6911021769046783,
"rewards/": 6.672781944274902,
"rewards/math_compute_score": 0.4107142984867096,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 1497.482177734375,
"epoch": 0.5142857142857142,
"grad_norm": 0.42213450988969753,
"kl": 0.004486083984375,
"learning_rate": 5.025750645620004e-07,
"loss": 0.0252,
"reward": 1.7366769313812256,
"reward_std": 0.6020164489746094,
"rewards/": 6.540527820587158,
"rewards/math_compute_score": 0.535714328289032,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 1306.2857666015625,
"epoch": 0.5174603174603175,
"grad_norm": 0.6578798978662977,
"kl": 0.011474609375,
"learning_rate": 4.974249354379996e-07,
"loss": 0.05,
"reward": 2.152064800262451,
"reward_std": 0.43625935912132263,
"rewards/": 7.474609851837158,
"rewards/math_compute_score": 0.8214285969734192,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 1578.5179443359375,
"epoch": 0.5206349206349207,
"grad_norm": 0.3630493679815709,
"kl": 0.0047607421875,
"learning_rate": 4.922750795162973e-07,
"loss": 0.0061,
"reward": 1.8181921243667603,
"reward_std": 0.45378485321998596,
"rewards/": 7.305245876312256,
"rewards/math_compute_score": 0.4464285969734192,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 1523.857177734375,
"epoch": 0.5238095238095238,
"grad_norm": 0.40781680936494136,
"kl": 0.0057373046875,
"learning_rate": 4.871260431725058e-07,
"loss": 0.0301,
"reward": 1.784919261932373,
"reward_std": 0.5600239634513855,
"rewards/": 6.710309982299805,
"rewards/math_compute_score": 0.5535714626312256,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 1653.607177734375,
"epoch": 0.526984126984127,
"grad_norm": 0.3627537206069918,
"kl": 0.005035400390625,
"learning_rate": 4.81978372695283e-07,
"loss": 0.0296,
"reward": 1.5985910892486572,
"reward_std": 0.39210641384124756,
"rewards/": 6.707240581512451,
"rewards/math_compute_score": 0.3214285969734192,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 1424.1785888671875,
"epoch": 0.5301587301587302,
"grad_norm": 0.3566511267927936,
"kl": 0.00543212890625,
"learning_rate": 4.768326142283756e-07,
"loss": 0.0401,
"reward": 1.9029020071029663,
"reward_std": 0.253780335187912,
"rewards/": 6.871652126312256,
"rewards/math_compute_score": 0.660714328289032,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 1728.1251220703125,
"epoch": 0.5333333333333333,
"grad_norm": 0.3668634566920126,
"kl": 0.004486083984375,
"learning_rate": 4.7168931371267473e-07,
"loss": 0.0457,
"reward": 1.5901787281036377,
"reward_std": 0.5735574960708618,
"rewards/": 6.950893402099609,
"rewards/math_compute_score": 0.25,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 1611.5357666015625,
"epoch": 0.5365079365079365,
"grad_norm": 0.3213785200743163,
"kl": 0.00469970703125,
"learning_rate": 4.665490168282943e-07,
"loss": 0.0072,
"reward": 1.8737167119979858,
"reward_std": 0.6218880414962769,
"rewards/": 6.940011501312256,
"rewards/math_compute_score": 0.6071428656578064,
"step": 169
},
{
"epoch": 0.5396825396825397,
"grad_norm": 0.35976271648220715,
"learning_rate": 4.614122689366768e-07,
"loss": 0.0402,
"step": 170
},
{
"epoch": 0.5396825396825397,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1549.6785888671875,
"eval_kl": 0.004964192708333333,
"eval_loss": 0.03550202399492264,
"eval_reward": 1.9592356284459431,
"eval_reward_std": 0.42070769270261127,
"eval_rewards/": 6.962844530741374,
"eval_rewards/math_compute_score": 0.7083333631356558,
"eval_runtime": 131.0866,
"eval_samples_per_second": 0.16,
"eval_steps_per_second": 0.008,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 1788.1697387695312,
"epoch": 0.5428571428571428,
"grad_norm": 0.30243975738537365,
"kl": 0.00441741943359375,
"learning_rate": 4.562796150227337e-07,
"loss": 0.0388,
"reward": 1.6102469563484192,
"reward_std": 0.5664084255695343,
"rewards/": 6.872663497924805,
"rewards/math_compute_score": 0.2946428656578064,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 1281.8929443359375,
"epoch": 0.546031746031746,
"grad_norm": 0.4239327955548188,
"kl": 0.006072998046875,
"learning_rate": 4.511515996370243e-07,
"loss": -0.0323,
"reward": 2.155747890472412,
"reward_std": 0.38098111748695374,
"rewards/": 7.064453601837158,
"rewards/math_compute_score": 0.9285714626312256,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 1426.08935546875,
"epoch": 0.5492063492063493,
"grad_norm": 0.3848409508825075,
"kl": 0.004547119140625,
"learning_rate": 4.460287668379814e-07,
"loss": 0.0612,
"reward": 1.9045759439468384,
"reward_std": 0.19524678587913513,
"rewards/": 6.522879600524902,
"rewards/math_compute_score": 0.7500000596046448,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 1506.8751220703125,
"epoch": 0.5523809523809524,
"grad_norm": 0.36448706461592817,
"kl": 0.00469970703125,
"learning_rate": 4.409116601341907e-07,
"loss": -0.0125,
"reward": 2.0860493183135986,
"reward_std": 0.4216456711292267,
"rewards/": 7.073102951049805,
"rewards/math_compute_score": 0.8392857313156128,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 1531.4285888671875,
"epoch": 0.5555555555555556,
"grad_norm": 0.4058679353297223,
"kl": 0.005218505859375,
"learning_rate": 4.3580082242672444e-07,
"loss": 0.0513,
"reward": 2.0486607551574707,
"reward_std": 0.5620574951171875,
"rewards/": 7.171875476837158,
"rewards/math_compute_score": 0.7678571939468384,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 1354.0535888671875,
"epoch": 0.5587301587301587,
"grad_norm": 0.41211128125944263,
"kl": 0.006805419921875,
"learning_rate": 4.3069679595154536e-07,
"loss": 0.0461,
"reward": 2.362277030944824,
"reward_std": 0.20701220631599426,
"rewards/": 7.811384201049805,
"rewards/math_compute_score": 1.0,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 1543.571533203125,
"epoch": 0.5619047619047619,
"grad_norm": 0.3763535929052217,
"kl": 0.005279541015625,
"learning_rate": 4.2560012222197506e-07,
"loss": 0.0072,
"reward": 1.9439733028411865,
"reward_std": 0.26243603229522705,
"rewards/": 6.791295051574707,
"rewards/math_compute_score": 0.7321428656578064,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 1764.21435546875,
"epoch": 0.5650793650793651,
"grad_norm": 0.38459117939596527,
"kl": 0.0054931640625,
"learning_rate": 4.205113419712435e-07,
"loss": 0.0308,
"reward": 1.413002371788025,
"reward_std": 0.48099908232688904,
"rewards/": 6.493582725524902,
"rewards/math_compute_score": 0.1428571492433548,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 1457.071533203125,
"epoch": 0.5682539682539682,
"grad_norm": 0.3648394332582025,
"kl": 0.005157470703125,
"learning_rate": 4.1543099509512023e-07,
"loss": 0.0135,
"reward": 2.1016740798950195,
"reward_std": 0.4655519425868988,
"rewards/": 7.508370876312256,
"rewards/math_compute_score": 0.7500000596046448,
"step": 179
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.4347832296603526,
"learning_rate": 4.1035962059463224e-07,
"loss": -0.0031,
"step": 180
},
{
"epoch": 0.5714285714285714,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1564.9504801432292,
"eval_kl": 0.005472819010416667,
"eval_loss": 0.04810946434736252,
"eval_reward": 2.0953497886657715,
"eval_reward_std": 0.3557452509800593,
"eval_rewards/": 7.143415451049805,
"eval_rewards/math_compute_score": 0.8333333532015482,
"eval_runtime": 130.6017,
"eval_samples_per_second": 0.161,
"eval_steps_per_second": 0.008,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 1442.7589721679688,
"epoch": 0.5746031746031746,
"grad_norm": 0.31604533911482663,
"kl": 0.0054168701171875,
"learning_rate": 4.052977565188799e-07,
"loss": 0.0286,
"reward": 1.9001396894454956,
"reward_std": 0.35335803031921387,
"rewards/": 6.929269313812256,
"rewards/math_compute_score": 0.6428571492433548,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 1675.7857666015625,
"epoch": 0.5777777777777777,
"grad_norm": 0.40418194066418917,
"kl": 0.00537109375,
"learning_rate": 4.0024593990795223e-07,
"loss": 0.0231,
"reward": 1.45106041431427,
"reward_std": 0.7710850238800049,
"rewards/": 6.826730251312256,
"rewards/math_compute_score": 0.1071428656578064,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 1623.696533203125,
"epoch": 0.580952380952381,
"grad_norm": 20.231824794338248,
"kl": 1.4296875,
"learning_rate": 3.952047067359487e-07,
"loss": 0.1031,
"reward": 1.6864677667617798,
"reward_std": 0.3528647720813751,
"rewards/": 6.860909938812256,
"rewards/math_compute_score": 0.392857164144516,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 1415.08935546875,
"epoch": 0.5841269841269842,
"grad_norm": 0.38374407884466366,
"kl": 0.005889892578125,
"learning_rate": 3.9017459185411614e-07,
"loss": -0.0095,
"reward": 1.9470704793930054,
"reward_std": 0.3512076735496521,
"rewards/": 6.878209114074707,
"rewards/math_compute_score": 0.7142857313156128,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 1437.857177734375,
"epoch": 0.5873015873015873,
"grad_norm": 0.3960042132238463,
"kl": 0.004913330078125,
"learning_rate": 3.8515612893410224e-07,
"loss": 0.0284,
"reward": 1.9784600734710693,
"reward_std": 0.3158036768436432,
"rewards/": 7.035156726837158,
"rewards/math_compute_score": 0.7142857313156128,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 1376.5535888671875,
"epoch": 0.5904761904761905,
"grad_norm": 0.4231910527525782,
"kl": 0.005767822265625,
"learning_rate": 3.8014985041133626e-07,
"loss": 0.0177,
"reward": 2.132868528366089,
"reward_std": 0.47174835205078125,
"rewards/": 7.450056076049805,
"rewards/math_compute_score": 0.8035714626312256,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 1735.0357666015625,
"epoch": 0.5936507936507937,
"grad_norm": 0.370413487882503,
"kl": 0.004730224609375,
"learning_rate": 3.7515628742853997e-07,
"loss": 0.032,
"reward": 1.568275809288025,
"reward_std": 0.4366031587123871,
"rewards/": 6.555664539337158,
"rewards/math_compute_score": 0.3214285969734192,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 1384.107177734375,
"epoch": 0.5968253968253968,
"grad_norm": 0.34069036045598083,
"kl": 0.005096435546875,
"learning_rate": 3.70175969779376e-07,
"loss": -0.0015,
"reward": 1.8890068531036377,
"reward_std": 0.3061581552028656,
"rewards/": 7.1593194007873535,
"rewards/math_compute_score": 0.5714285969734192,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 1711.58935546875,
"epoch": 0.6,
"grad_norm": 0.3411825559980518,
"kl": 0.004913330078125,
"learning_rate": 3.6520942585223866e-07,
"loss": -0.0253,
"reward": 1.6066406965255737,
"reward_std": 0.5342792868614197,
"rewards/": 6.533203601837158,
"rewards/math_compute_score": 0.3750000298023224,
"step": 189
},
{
"epoch": 0.6031746031746031,
"grad_norm": 0.35592352794000803,
"learning_rate": 3.602571825741953e-07,
"loss": 0.0205,
"step": 190
},
{
"epoch": 0.6031746031746031,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1554.7837727864583,
"eval_kl": 0.005961100260416667,
"eval_loss": -0.003977527376264334,
"eval_reward": 1.9975679318110149,
"eval_reward_std": 0.45795708894729614,
"eval_rewards/": 6.940220673878987,
"eval_rewards/math_compute_score": 0.761904795964559,
"eval_runtime": 131.2541,
"eval_samples_per_second": 0.16,
"eval_steps_per_second": 0.008,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 1356.5625610351562,
"epoch": 0.6063492063492063,
"grad_norm": 0.3596374444943596,
"kl": 0.0055999755859375,
"learning_rate": 3.55319765355081e-07,
"loss": -0.0037,
"reward": 2.0135952830314636,
"reward_std": 0.33204740285873413,
"rewards/": 6.925118923187256,
"rewards/math_compute_score": 0.785714328289032,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 1531.6251220703125,
"epoch": 0.6095238095238096,
"grad_norm": 0.3616323272673789,
"kl": 0.006195068359375,
"learning_rate": 3.503976980317554e-07,
"loss": -0.0153,
"reward": 2.023493528366089,
"reward_std": 0.3276682496070862,
"rewards/": 7.5460381507873535,
"rewards/math_compute_score": 0.6428571939468384,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 1549.982177734375,
"epoch": 0.6126984126984127,
"grad_norm": 0.3739633675188606,
"kl": 0.0057373046875,
"learning_rate": 3.454915028125263e-07,
"loss": 0.0195,
"reward": 1.88074791431427,
"reward_std": 0.5293351411819458,
"rewards/": 6.975167751312256,
"rewards/math_compute_score": 0.6071428656578064,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 1632.857177734375,
"epoch": 0.6158730158730159,
"grad_norm": 0.3844476214545633,
"kl": 0.00555419921875,
"learning_rate": 3.4060170022174517e-07,
"loss": -0.0172,
"reward": 1.9814037084579468,
"reward_std": 0.6114475727081299,
"rewards/": 7.192731857299805,
"rewards/math_compute_score": 0.6785714626312256,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 1197.08935546875,
"epoch": 0.6190476190476191,
"grad_norm": 0.3588871009271226,
"kl": 0.005828857421875,
"learning_rate": 3.357288090445826e-07,
"loss": 0.0417,
"reward": 2.2577009201049805,
"reward_std": 0.24351288378238678,
"rewards/": 7.431362152099609,
"rewards/math_compute_score": 0.9642857313156128,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 1516.5535888671875,
"epoch": 0.6222222222222222,
"grad_norm": 0.4221755143300646,
"kl": 0.006378173828125,
"learning_rate": 3.3087334627198727e-07,
"loss": 0.0476,
"reward": 2.0338730812072754,
"reward_std": 0.45778489112854004,
"rewards/": 7.455078601837158,
"rewards/math_compute_score": 0.6785714626312256,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 1591.9285888671875,
"epoch": 0.6253968253968254,
"grad_norm": 0.37451153018874117,
"kl": 0.005859375,
"learning_rate": 3.260358270458354e-07,
"loss": 0.0074,
"reward": 1.9349645376205444,
"reward_std": 0.4895531237125397,
"rewards/": 6.960536956787109,
"rewards/math_compute_score": 0.6785714626312256,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 1527.196533203125,
"epoch": 0.6285714285714286,
"grad_norm": 0.3302300732895513,
"kl": 0.005035400390625,
"learning_rate": 3.212167646042776e-07,
"loss": -0.0122,
"reward": 1.9998327493667603,
"reward_std": 0.2854258418083191,
"rewards/": 7.213449001312256,
"rewards/math_compute_score": 0.6964285969734192,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 1533.3751220703125,
"epoch": 0.6317460317460317,
"grad_norm": 0.36563930742752804,
"kl": 0.005340576171875,
"learning_rate": 3.164166702272855e-07,
"loss": 0.0625,
"reward": 2.099107265472412,
"reward_std": 0.40774399042129517,
"rewards/": 7.638393402099609,
"rewards/math_compute_score": 0.7142857313156128,
"step": 199
},
{
"epoch": 0.6349206349206349,
"grad_norm": 0.3003969277270976,
"learning_rate": 3.1163605318240736e-07,
"loss": 0.0211,
"step": 200
},
{
"epoch": 0.6349206349206349,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1513.3889567057292,
"eval_kl": 0.0057373046875,
"eval_loss": -0.0034506141673773527,
"eval_reward": 2.0185548464457193,
"eval_reward_std": 0.42673546075820923,
"eval_rewards/": 7.140392780303955,
"eval_rewards/math_compute_score": 0.7380952636400858,
"eval_runtime": 131.2464,
"eval_samples_per_second": 0.16,
"eval_steps_per_second": 0.008,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 1553.607177734375,
"epoch": 0.638095238095238,
"grad_norm": 0.4204253637756095,
"kl": 0.00592041015625,
"learning_rate": 3.0687542067073915e-07,
"loss": 0.0522,
"reward": 1.829387605190277,
"reward_std": 0.4581097811460495,
"rewards/": 6.932652473449707,
"rewards/math_compute_score": 0.5535714626312256,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 1458.4107666015625,
"epoch": 0.6412698412698413,
"grad_norm": 0.37293790377807984,
"kl": 0.00726318359375,
"learning_rate": 3.021352777731095e-07,
"loss": 0.0798,
"reward": 1.893429160118103,
"reward_std": 0.7099537253379822,
"rewards/": 7.252860069274902,
"rewards/math_compute_score": 0.5535714626312256,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 1727.446533203125,
"epoch": 0.6444444444444445,
"grad_norm": 0.364624475214311,
"kl": 0.005828857421875,
"learning_rate": 2.974161273964969e-07,
"loss": 0.0132,
"reward": 1.3616769313812256,
"reward_std": 0.5690730214118958,
"rewards/": 6.379813194274902,
"rewards/math_compute_score": 0.1071428656578064,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 1601.232177734375,
"epoch": 0.6476190476190476,
"grad_norm": 0.3723608933130564,
"kl": 0.005279541015625,
"learning_rate": 2.9271847022066987e-07,
"loss": 0.0167,
"reward": 1.7884488105773926,
"reward_std": 0.3653627932071686,
"rewards/": 7.227957725524902,
"rewards/math_compute_score": 0.4285714626312256,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 1664.3035888671875,
"epoch": 0.6507936507936508,
"grad_norm": 0.37220118357197624,
"kl": 0.0057373046875,
"learning_rate": 2.880428046450697e-07,
"loss": 0.0547,
"reward": 2.2032926082611084,
"reward_std": 0.5352621674537659,
"rewards/": 7.873605251312256,
"rewards/math_compute_score": 0.785714328289032,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 1527.446533203125,
"epoch": 0.653968253968254,
"grad_norm": 0.44669994879904146,
"kl": 0.00592041015625,
"learning_rate": 2.8338962673593194e-07,
"loss": 0.0466,
"reward": 1.9878350496292114,
"reward_std": 0.36021357774734497,
"rewards/": 7.224888801574707,
"rewards/math_compute_score": 0.6785714626312256,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 1485.3929443359375,
"epoch": 0.6571428571428571,
"grad_norm": 0.36237163203607514,
"kl": 0.0067138671875,
"learning_rate": 2.7875943017365556e-07,
"loss": 0.0087,
"reward": 2.0885045528411865,
"reward_std": 0.23854570090770721,
"rewards/": 7.585379600524902,
"rewards/math_compute_score": 0.7142857313156128,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 1390.21435546875,
"epoch": 0.6603174603174603,
"grad_norm": 0.43492788329993204,
"kl": 0.006011962890625,
"learning_rate": 2.7415270620042634e-07,
"loss": 0.0368,
"reward": 2.027120590209961,
"reward_std": 0.32408618927001953,
"rewards/": 7.135602951049805,
"rewards/math_compute_score": 0.7500000596046448,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 1614.232177734375,
"epoch": 0.6634920634920635,
"grad_norm": 0.4285860950638222,
"kl": 0.005950927734375,
"learning_rate": 2.695699435680986e-07,
"loss": -0.0219,
"reward": 1.80555260181427,
"reward_std": 0.39699044823646545,
"rewards/": 7.170619964599609,
"rewards/math_compute_score": 0.4642857313156128,
"step": 209
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.4073585516563059,
"learning_rate": 2.6501162848634016e-07,
"loss": 0.0635,
"step": 210
},
{
"epoch": 0.6666666666666666,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1593.8790690104167,
"eval_kl": 0.006266276041666667,
"eval_loss": 0.010452189482748508,
"eval_reward": 2.0239213705062866,
"eval_reward_std": 0.4319620430469513,
"eval_rewards/": 7.357701142628987,
"eval_rewards/math_compute_score": 0.6904762089252472,
"eval_runtime": 132.1993,
"eval_samples_per_second": 0.159,
"eval_steps_per_second": 0.008,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 1548.5625610351562,
"epoch": 0.6698412698412698,
"grad_norm": 0.3896154656236107,
"kl": 0.0064697265625,
"learning_rate": 2.604782445710476e-07,
"loss": 0.0685,
"reward": 1.9150113463401794,
"reward_std": 0.5420868694782257,
"rewards/": 7.289341926574707,
"rewards/math_compute_score": 0.5714285969734192,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 1175.875,
"epoch": 0.6730158730158731,
"grad_norm": 0.3761166584078419,
"kl": 0.00653076171875,
"learning_rate": 2.559702727930386e-07,
"loss": 0.0143,
"reward": 2.3515625,
"reward_std": 0.2300196886062622,
"rewards/": 7.757812976837158,
"rewards/math_compute_score": 1.0,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 1592.0535888671875,
"epoch": 0.6761904761904762,
"grad_norm": 0.3547503060575964,
"kl": 0.005767822265625,
"learning_rate": 2.5148819142702095e-07,
"loss": 0.0157,
"reward": 2.0560269355773926,
"reward_std": 0.4543492794036865,
"rewards/": 7.565848350524902,
"rewards/math_compute_score": 0.6785714626312256,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 1390.7679443359375,
"epoch": 0.6793650793650794,
"grad_norm": 0.41727155544044037,
"kl": 0.00811767578125,
"learning_rate": 2.470324760008517e-07,
"loss": 0.0064,
"reward": 1.9091730117797852,
"reward_std": 0.4644983112812042,
"rewards/": 6.545863628387451,
"rewards/math_compute_score": 0.7500000596046448,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 1498.2857666015625,
"epoch": 0.6825396825396826,
"grad_norm": 0.38605442508491833,
"kl": 0.006195068359375,
"learning_rate": 2.426035992450848e-07,
"loss": 0.0188,
"reward": 1.966071605682373,
"reward_std": 0.4697768986225128,
"rewards/": 7.2589287757873535,
"rewards/math_compute_score": 0.6428571939468384,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 1406.232177734375,
"epoch": 0.6857142857142857,
"grad_norm": 0.3466649961934692,
"kl": 0.004974365234375,
"learning_rate": 2.382020310428161e-07,
"loss": 0.0045,
"reward": 2.04665207862854,
"reward_std": 0.2766338586807251,
"rewards/": 7.233259201049805,
"rewards/math_compute_score": 0.7500000596046448,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 1642.9107666015625,
"epoch": 0.6888888888888889,
"grad_norm": 0.3395883468462042,
"kl": 0.00592041015625,
"learning_rate": 2.3382823837983312e-07,
"loss": 0.0283,
"reward": 1.9188058376312256,
"reward_std": 0.31119585037231445,
"rewards/": 7.308315277099609,
"rewards/math_compute_score": 0.5714285969734192,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 1288.75,
"epoch": 0.692063492063492,
"grad_norm": 0.4169289379787409,
"kl": 0.00665283203125,
"learning_rate": 2.2948268529506765e-07,
"loss": 0.0257,
"reward": 2.0765626430511475,
"reward_std": 0.4043300747871399,
"rewards/": 7.811384201049805,
"rewards/math_compute_score": 0.6428571939468384,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 1746.0535888671875,
"epoch": 0.6952380952380952,
"grad_norm": 0.38975967722789145,
"kl": 0.005126953125,
"learning_rate": 2.251658328313647e-07,
"loss": -0.002,
"reward": 1.3670480251312256,
"reward_std": 0.6043983697891235,
"rewards/": 6.7638115882873535,
"rewards/math_compute_score": 0.01785714365541935,
"step": 219
},
{
"epoch": 0.6984126984126984,
"grad_norm": 0.3971534991234478,
"learning_rate": 2.208781389865677e-07,
"loss": 0.0496,
"step": 220
},
{
"epoch": 0.6984126984126984,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1515.7163492838542,
"eval_kl": 0.006093343098958333,
"eval_loss": 0.050898950546979904,
"eval_reward": 1.9710845947265625,
"eval_reward_std": 0.48362330595652264,
"eval_rewards/": 7.117327372233073,
"eval_rewards/math_compute_score": 0.6845238407452902,
"eval_runtime": 130.2534,
"eval_samples_per_second": 0.161,
"eval_steps_per_second": 0.008,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 1566.7857666015625,
"epoch": 0.7015873015873015,
"grad_norm": 0.32907894573216867,
"kl": 0.0057525634765625,
"learning_rate": 2.1662005866492715e-07,
"loss": 0.0312,
"reward": 1.7159180641174316,
"reward_std": 0.4860241115093231,
"rewards/": 7.329590320587158,
"rewards/math_compute_score": 0.3125000149011612,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 1622.3751220703125,
"epoch": 0.7047619047619048,
"grad_norm": 0.36878225894218164,
"kl": 0.006378173828125,
"learning_rate": 2.1239204362883695e-07,
"loss": 0.0173,
"reward": 1.4388115406036377,
"reward_std": 0.48099273443222046,
"rewards/": 6.4083428382873535,
"rewards/math_compute_score": 0.196428582072258,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 1494.3929443359375,
"epoch": 0.707936507936508,
"grad_norm": 0.38962068270348554,
"kl": 0.006134033203125,
"learning_rate": 2.0819454245090568e-07,
"loss": 0.0288,
"reward": 1.872544765472412,
"reward_std": 0.4542202055454254,
"rewards/": 6.934152126312256,
"rewards/math_compute_score": 0.6071428656578064,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 1317.2857666015625,
"epoch": 0.7111111111111111,
"grad_norm": 0.36894582377255875,
"kl": 0.00543212890625,
"learning_rate": 2.0402800046636364e-07,
"loss": 0.0548,
"reward": 2.053738832473755,
"reward_std": 0.22289863228797913,
"rewards/": 6.911551475524902,
"rewards/math_compute_score": 0.8392857313156128,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 1480.732177734375,
"epoch": 0.7142857142857143,
"grad_norm": 0.35221132675838623,
"kl": 0.004974365234375,
"learning_rate": 1.9989285972581593e-07,
"loss": -0.0313,
"reward": 1.808510184288025,
"reward_std": 0.404258131980896,
"rewards/": 6.471121788024902,
"rewards/math_compute_score": 0.6428571939468384,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 1309.25,
"epoch": 0.7174603174603175,
"grad_norm": 0.4356097059014855,
"kl": 0.006500244140625,
"learning_rate": 1.9578955894834258e-07,
"loss": 0.0394,
"reward": 2.175558090209961,
"reward_std": 0.37214693427085876,
"rewards/": 7.734933376312256,
"rewards/math_compute_score": 0.785714328289032,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 1685.0001220703125,
"epoch": 0.7206349206349206,
"grad_norm": 0.4352041974732675,
"kl": 0.007537841796875,
"learning_rate": 1.917185334749523e-07,
"loss": 0.0324,
"reward": 1.6248048543930054,
"reward_std": 0.2427731454372406,
"rewards/": 6.624023914337158,
"rewards/math_compute_score": 0.3750000298023224,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 1574.83935546875,
"epoch": 0.7238095238095238,
"grad_norm": 0.39645948514529894,
"kl": 0.006988525390625,
"learning_rate": 1.8768021522239574e-07,
"loss": 0.0111,
"reward": 1.8595423698425293,
"reward_std": 0.20890839397907257,
"rewards/": 7.297712326049805,
"rewards/math_compute_score": 0.5,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 1461.446533203125,
"epoch": 0.726984126984127,
"grad_norm": 0.43535613207045143,
"kl": 0.006072998046875,
"learning_rate": 1.836750326373398e-07,
"loss": 0.024,
"reward": 1.9434850215911865,
"reward_std": 0.37709271907806396,
"rewards/": 7.145996570587158,
"rewards/math_compute_score": 0.6428571939468384,
"step": 229
},
{
"epoch": 0.7301587301587301,
"grad_norm": 0.346576656418996,
"learning_rate": 1.7970341065091243e-07,
"loss": -0.0127,
"step": 230
},
{
"epoch": 0.7301587301587301,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1533.8690999348958,
"eval_kl": 0.005940755208333333,
"eval_loss": 0.012749183923006058,
"eval_reward": 2.0542319615681968,
"eval_reward_std": 0.41574782133102417,
"eval_rewards/": 7.247349580128987,
"eval_rewards/math_compute_score": 0.7559523979822794,
"eval_runtime": 130.0387,
"eval_samples_per_second": 0.161,
"eval_steps_per_second": 0.008,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 1560.6964721679688,
"epoch": 0.7333333333333333,
"grad_norm": 0.41442561377155895,
"kl": 0.00677490234375,
"learning_rate": 1.7576577063361918e-07,
"loss": -0.0333,
"reward": 1.7915493249893188,
"reward_std": 0.3907308280467987,
"rewards/": 6.957746505737305,
"rewards/math_compute_score": 0.5000000298023224,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 1500.7501220703125,
"epoch": 0.7365079365079366,
"grad_norm": 0.38386522257967753,
"kl": 0.00628662109375,
"learning_rate": 1.7186253035063736e-07,
"loss": 0.0055,
"reward": 1.9492467641830444,
"reward_std": 0.5078251957893372,
"rewards/": 7.031948089599609,
"rewards/math_compute_score": 0.6785714626312256,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 1712.83935546875,
"epoch": 0.7396825396825397,
"grad_norm": 0.3381282235296574,
"kl": 0.005828857421875,
"learning_rate": 1.6799410391749414e-07,
"loss": 0.0371,
"reward": 1.826283574104309,
"reward_std": 0.623534083366394,
"rewards/": 7.345703601837158,
"rewards/math_compute_score": 0.4464285969734192,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 1552.321533203125,
"epoch": 0.7428571428571429,
"grad_norm": 0.3949250198934789,
"kl": 0.005645751953125,
"learning_rate": 1.6416090175612958e-07,
"loss": 0.0482,
"reward": 1.816322684288025,
"reward_std": 0.5752555131912231,
"rewards/": 7.3673272132873535,
"rewards/math_compute_score": 0.4285714626312256,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 1474.71435546875,
"epoch": 0.746031746031746,
"grad_norm": 0.3997394557335043,
"kl": 0.006072998046875,
"learning_rate": 1.6036333055135344e-07,
"loss": 0.0706,
"reward": 2.0150113105773926,
"reward_std": 0.37213414907455444,
"rewards/": 7.360770225524902,
"rewards/math_compute_score": 0.6785714626312256,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 1357.3929443359375,
"epoch": 0.7492063492063492,
"grad_norm": 0.3581961494125316,
"kl": 0.00592041015625,
"learning_rate": 1.5660179320769788e-07,
"loss": 0.0471,
"reward": 2.19921875,
"reward_std": 0.36445701122283936,
"rewards/": 7.5675225257873535,
"rewards/math_compute_score": 0.8571429252624512,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 1471.83935546875,
"epoch": 0.7523809523809524,
"grad_norm": 0.37492809441966657,
"kl": 0.006866455078125,
"learning_rate": 1.5287668880667104e-07,
"loss": 0.0772,
"reward": 1.8618303537368774,
"reward_std": 0.49302613735198975,
"rewards/": 7.023437976837158,
"rewards/math_compute_score": 0.5714285969734192,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 1382.6785888671875,
"epoch": 0.7555555555555555,
"grad_norm": 0.38754661617692593,
"kl": 0.00714111328125,
"learning_rate": 1.49188412564416e-07,
"loss": 0.0385,
"reward": 1.967801570892334,
"reward_std": 0.3383636176586151,
"rewards/": 7.053292751312256,
"rewards/math_compute_score": 0.6964285969734192,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 1664.0535888671875,
"epoch": 0.7587301587301587,
"grad_norm": 0.4007176962544528,
"kl": 0.0062255859375,
"learning_rate": 1.455373557897814e-07,
"loss": 0.0594,
"reward": 1.5234934091567993,
"reward_std": 0.551922619342804,
"rewards/": 6.331752300262451,
"rewards/math_compute_score": 0.3214285969734192,
"step": 239
},
{
"epoch": 0.7619047619047619,
"grad_norm": 0.3831506478541147,
"learning_rate": 1.4192390584280344e-07,
"loss": 0.0014,
"step": 240
},
{
"epoch": 0.7619047619047619,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1517.2461344401042,
"eval_kl": 0.005961100260416667,
"eval_loss": 0.0475773885846138,
"eval_reward": 1.965123454729716,
"eval_reward_std": 0.47016530235608417,
"eval_rewards/": 7.039902687072754,
"eval_rewards/math_compute_score": 0.6964285969734192,
"eval_runtime": 130.1244,
"eval_samples_per_second": 0.161,
"eval_steps_per_second": 0.008,
"step": 240
}
],
"logging_steps": 1.0,
"max_steps": 315,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 80,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}